# Tableau Project (Criterio 5: df_culture)

In [None]:
#Importo librerias.
import pymongo
import pandas as pd
import re

# Open a MongoDB client with pymongo's default connection settings, read every
# document of the `companies` collection (database `companies`) and load them
# into the initial dataframe.
MongoClient = pymongo.MongoClient
client = MongoClient()
db = client["companies"]
cursor = db["companies"].find()
data = list(cursor)
df = pd.DataFrame(data)

In [2]:
# Attribute analysis: groups of column names of the source dataframe.
# col_float / col_obj are presumably grouped by pandas dtype (float / object)
# -- confirm against df.dtypes.
col_float = [
    'deadpooled_day', 'deadpooled_month', 'deadpooled_year',
    'founded_day', 'founded_month', 'founded_year',
    'number_of_employees',
]
col_obj = [
    '_id', 'acquisition', 'acquisitions', 'alias_list', 'blog_feed_url',
    'blog_url', 'category_code', 'competitions', 'created_at',
    'crunchbase_url', 'deadpooled_url', 'description', 'email_address',
    'external_links', 'funding_rounds', 'homepage_url', 'image',
    'investments', 'ipo', 'milestones', 'name', 'offices', 'overview',
    'partners', 'permalink', 'phone_number', 'products', 'providerships',
    'relationships', 'screenshots', 'tag_list', 'total_money_raised',
    'twitter_username', 'updated_at', 'video_embeds',
]

# Columns removed from the dataframe, in two separate lists.
col_drop1 = [
    'deadpooled_day', 'deadpooled_month', 'founded_day', 'founded_month',
    'alias_list', 'blog_feed_url', 'blog_url', 'created_at',
    'crunchbase_url', 'deadpooled_url', 'email_address', 'external_links',
    'image', 'permalink', 'phone_number', 'screenshots', 'tag_list',
    'updated_at', 'video_embeds',
]
col_drop2 = [
    'acquisition', 'acquisitions', 'competitions', 'description',
    'funding_rounds', 'homepage_url', 'investments', 'ipo', 'milestones',
    'partners', 'products', 'providerships', 'relationships',
    'twitter_username', 'overview',
]

# Every column that is in neither col_drop1 nor col_drop2.
col_ok = [
    'deadpooled_year', 'founded_year', 'number_of_employees', '_id',
    'category_code', 'name', 'offices', 'total_money_raised',
]

In [3]:
# First data-cleaning phase: remove the columns listed in col_drop1, then
# those listed in col_drop2.
df_drop1 = df.drop(columns=col_drop1)
df_drop2 = df_drop1.drop(columns=col_drop2)

# Keep only the records with no 'deadpooled_year', i.e. discard the companies
# that are in the 'deadpool'.
df_deadnull = df_drop2.loc[df_drop2['deadpooled_year'].isna()]

# Discard the records holding a null in any of the columns below.
required_cols = ['founded_year', 'number_of_employees', 'name', 'offices',
                 'total_money_raised']
df_nulls = df_deadnull.dropna(subset=required_cols)

# Check the size of the resulting dataset.
print(len(df_nulls))

7934


In [4]:
# Keep only the records whose category is relevant to this criterion, then
# remove the '_id' and 'deadpooled_year' columns.
col_cat = ['games_video','web','software']
in_category = df_nulls['category_code'].isin(col_cat)
df_cat = df_nulls[in_category].drop(columns=['_id', 'deadpooled_year'])

# Check the size of the resulting dataset.
print(len(df_cat))

3752


In [5]:
#Definición de funciones para modificación de tipo de datos.

def float_to_int(flt):
    """Convert a float value to an int.

    The conversion is done with the built-in ``int``, so any fractional part
    is truncated toward zero.
    """
    return int(flt)

def str_to_int(strg):
    """Extract the first run of digits from a string and scale it by 1000.

    Only the first contiguous group of digits is used: currency symbols,
    decimals and any suffix are ignored (``"$39.8M"`` -> ``39000``).

    NOTE(review): the unit suffix is not interpreted, so ``"$500k"`` and
    ``"$500M"`` produce the same value -- confirm this is intended.

    Args:
        strg: text containing at least one digit.

    Returns:
        The first integer found in ``strg`` multiplied by 1000.

    Raises:
        IndexError: if ``strg`` contains no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', strg)
    if match is None:
        # Same exception type as indexing an empty match list, but with a
        # message that names the offending value.
        raise IndexError(f"no digits found in {strg!r}")
    return int(match.group()) * 1000

In [6]:
# Apply the conversion functions to a copy of the filtered records.
df_regout = df_cat.assign(
    total_money_raised=df_cat['total_money_raised'].apply(str_to_int),
    founded_year=df_cat['founded_year'].apply(float_to_int),
)

# Drop the records that are irrelevant for this criterion.
# Disabled filters, kept for reference:
#df_regout = df_regout[df_regout.total_money_raised == 0]
#df_regout = df_regout[df_regout.number_of_employees != 0]
founded_recently = df_regout['founded_year'] >= 2009
df_regout = df_regout[founded_recently]
has_office = df_regout['offices'].map(len) > 0
df_regout = df_regout[has_office]

# Check the size of the resulting dataset.
print(len(df_regout))

175


In [7]:
def nested_to_list(dicts):
    """Build a GeoJSON Point from the first office of a company.

    Args:
        dicts: sequence of office mappings; only the first one is used and
            its 'longitude' and 'latitude' entries are read.

    Returns:
        ``{"type": "Point", "coordinates": [longitude, latitude]}`` for the
        first office, or ``0`` when there is no office or when its longitude
        or latitude is missing or ``None``. ``0`` is the placeholder value
        that callers filter out afterwards.
    """
    if not dicts:
        return 0
    first_office = dicts[0]
    longitude = first_office.get('longitude')
    latitude = first_office.get('latitude')
    # A Point needs both coordinates; a null in either one would produce an
    # invalid GeoJSON value.
    if longitude is None or latitude is None:
        return 0
    return {"type": "Point", "coordinates": [longitude, latitude]}

In [8]:
# Replace each record's list of offices with the GeoJSON value produced by
# nested_to_list, working on a copy of the filtered dataframe.
df_colout = df_regout.assign(offices=df_regout['offices'].apply(nested_to_list))

In [9]:
# Discard the rows whose 'offices' value is the 0 placeholder, i.e. the ones
# for which no GeoJSON Point was produced.
df_geoout = df_colout.copy()
df_geoout = df_geoout.loc[df_geoout['offices'] != 0]

# Check the size of the resulting dataset.
print(df_geoout.shape[0])

# Preview the final dataset before exporting it to .json.
display(df_geoout.head(5))

128


Unnamed: 0,category_code,founded_year,name,number_of_employees,offices,total_money_raised
552,web,2012,headr,8.0,"{'type': 'Point', 'coordinates': [13.4109071, ...",0
612,web,2013,Fixya,30.0,"{'type': 'Point', 'coordinates': [-122.323895,...",8000
685,games_video,2009,alluc,7.0,"{'type': 'Point', 'coordinates': [10.023246, 5...",0
1424,games_video,2011,Social Gaming Network,100.0,"{'type': 'Point', 'coordinates': [-122.161523,...",17000
1898,games_video,2009,Crootpad,2.0,"{'type': 'Point', 'coordinates': [-88.288749, ...",0


In [10]:
# Write the dataset as line-delimited JSON (one record per line); the file is
# used to create a new collection in MongoDB.
df_geoout.to_json(path_or_buf='culture.json', orient='records', lines=True)

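### Se importa el fichero .json en MongoDB para generar la collection y aplicar el operador geoespacial '$near'
Nombre de la collection creada: **culture**. Nota: el operador '$near' requiere un índice geoespacial (`2dsphere`) sobre el campo `offices`.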

In [27]:
def concentration(company, max_distance=5000):
    """Return the `culture` documents located near a company's office.

    Runs a '$near' geospatial query on the 'offices' field of the `culture`
    collection of the module-level `db`, centred on the company's GeoJSON
    Point.

    NOTE: per the MongoDB documentation, '$near' needs a geospatial index on
    the queried field and, for GeoJSON points, distances are in meters.
    When `company` is itself a document of the collection it matches its own
    query.

    Args:
        company: document whose ``['offices']['coordinates']`` holds
            ``[longitude, latitude]``.
        max_distance: value sent as '$maxDistance'; defaults to 5000.

    Returns:
        A list with the matching documents.
    """
    coordinates = company['offices']['coordinates']
    near_query = {
        "offices": {
            "$near": {
                "$geometry": {
                    "type": "Point",
                    "coordinates": [coordinates[0], coordinates[1]],
                },
                "$minDistance": 0,
                "$maxDistance": max_distance,
            }
        }
    }
    return list(db.culture.find(near_query))

In [28]:
# Run the '$near' query once per document of the collection and record how
# many documents each query returns.
cursor_geojson = db.culture.find()
clusters = [len(concentration(element)) for element in cursor_geojson]

# Check the number of iterations performed (one per document).
print(len(clusters),'\n')

# Show the number of matches found for each document.
print(clusters,'\n')

# Highest number of matches.
print(max(clusters),'\n')

128 

[4, 1, 4, 3, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 8, 2, 1, 5, 10, 2, 1, 8, 3, 1, 1, 1, 1, 2, 1, 7, 2, 2, 2, 1, 1, 8, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 2, 1, 1, 1, 1, 8, 2, 3, 1, 1, 2, 2, 1, 1, 1, 5, 4, 1, 3, 1, 4, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 2, 1, 1, 1, 10, 1, 1, 9, 1, 1, 6, 5, 1, 2, 5, 4, 3, 1, 1, 1, 2, 5, 5, 1, 1, 1, 2, 1, 10, 1, 2, 1, 2, 1, 10, 2, 2] 

10 



In [29]:
# Extract the coordinates of the documents with the highest number of matches.
# NOTE: clusters[i] is paired with target[i] by position, which assumes this
# find() returns the documents in the same order as the one used to build
# `clusters`.
target = list(db.culture.find())

# Computed once instead of on every iteration; `default` keeps an empty
# `clusters` from raising.
peak = max(clusters, default=None)
targets = []
for i, count in enumerate(clusters):
    if count == peak:
        coordinates = target[i]['offices']['coordinates']
        targets.append([coordinates[0], coordinates[1]])

# Show the resulting coordinate pairs.
print(targets)

[[-73.987764, 40.744618], [-73.9918181, 40.7489381], [-74.00717, 40.7408042], [-73.9918181, 40.7489381]]


In [30]:
# Build the dataframe for Tableau: one row per coordinate pair, tagged with
# the name of this criterion.
long_lat = ['Longitude','Latitude']
df_culture = pd.DataFrame(targets, columns=long_lat).assign(Criteria='Culture')

# Show the final dataframe for this criterion.
display(df_culture)

Unnamed: 0,Longitude,Latitude,Criteria
0,-73.987764,40.744618,Culture
1,-73.991818,40.748938,Culture
2,-74.00717,40.740804,Culture
3,-73.991818,40.748938,Culture


In [31]:
# Write the result as line-delimited JSON (one record per line) for its
# visualization in Tableau.
df_culture.to_json(path_or_buf='df_culture.json', orient='records', lines=True)