# Tableau Project (Criterio 2: df_cost)

In [3]:
#Importo librerias.
import pymongo
import pandas as pd
import re

# Open a MongoDB client and load every document of the `companies`
# collection (database `companies`) into the initial dataframe.
# MongoClient() is called without arguments, so pymongo's default
# connection settings apply.
MongoClient = pymongo.MongoClient
client = MongoClient()
db = client["companies"]
cursor = db["companies"].find()
data = list(cursor)
df = pd.DataFrame(data)

In [4]:
# Attribute analysis: column names of the raw dataframe, grouped by how
# they are treated later in the notebook.

# Named `col_float` -- presumably the columns pandas loaded as float; verify
# against df.dtypes.
col_float = [
    'deadpooled_day', 'deadpooled_month', 'deadpooled_year',
    'founded_day', 'founded_month', 'founded_year',
    'number_of_employees',
]

# Named `col_obj` -- presumably the columns pandas loaded as object; verify
# against df.dtypes.
col_obj = [
    '_id', 'acquisition', 'acquisitions', 'alias_list', 'blog_feed_url',
    'blog_url', 'category_code', 'competitions', 'created_at',
    'crunchbase_url', 'deadpooled_url', 'description', 'email_address',
    'external_links', 'funding_rounds', 'homepage_url', 'image',
    'investments', 'ipo', 'milestones', 'name', 'offices', 'overview',
    'partners', 'permalink', 'phone_number', 'products', 'providerships',
    'relationships', 'screenshots', 'tag_list', 'total_money_raised',
    'twitter_username', 'updated_at', 'video_embeds',
]

# First group of columns removed during cleaning.
col_drop1 = [
    'deadpooled_day', 'deadpooled_month', 'founded_day', 'founded_month',
    'alias_list', 'blog_feed_url', 'blog_url', 'created_at',
    'crunchbase_url', 'deadpooled_url', 'email_address', 'external_links',
    'image', 'permalink', 'phone_number', 'screenshots', 'tag_list',
    'updated_at', 'video_embeds',
]

# Second group of columns removed during cleaning.
col_drop2 = [
    'acquisition', 'acquisitions', 'competitions', 'description',
    'funding_rounds', 'homepage_url', 'investments', 'ipo', 'milestones',
    'partners', 'products', 'providerships', 'relationships',
    'twitter_username', 'overview',
]

# Columns that appear in neither drop list.
col_ok = [
    'deadpooled_year', 'founded_year', 'number_of_employees', '_id',
    'category_code', 'name', 'offices', 'total_money_raised',
]

In [5]:
# First cleaning pass: remove both groups of unneeded columns.
df_drop1 = df.drop(columns=col_drop1)
df_drop2 = df_drop1.drop(columns=col_drop2)

# Keep only the rows with no 'deadpooled_year', i.e. discard companies
# that are in the deadpool.
not_deadpooled = df_drop2['deadpooled_year'].isna()
df_deadnull = df_drop2[not_deadpooled]

# Discard rows that have a null in any of the columns listed below.
required_columns = ['founded_year', 'number_of_employees', 'name', 'offices',
                    'total_money_raised']
df_nulls = df_deadnull.dropna(subset=required_columns)

# Check the size of the resulting dataset.
print(len(df_nulls))

7934


In [6]:
# Keep only the categories relevant to this criterion, then remove the
# '_id' and 'deadpooled_year' columns.
col_cat = ['games_video', 'web', 'software']
in_selected_category = df_nulls['category_code'].isin(col_cat)
df_cat = df_nulls[in_selected_category].drop(columns=['_id', 'deadpooled_year'])

# Check the size of the resulting dataset.
print(len(df_cat))

3752


In [7]:
#Definición de funciones para modificación de tipo de datos.

#Transforma un dato tipo float a uno tipo int.
def float_to_int(flt):
    """Return ``flt`` converted with the built-in ``int``.

    For a float argument this truncates toward zero (2005.0 -> 2005,
    3.9 -> 3).
    """
    return int(flt)

#Transforma un dato tipo str en int devolviendo solo el número.
def str_to_int(strg):
    """Return the first number found in ``strg``, multiplied by 1000, as an int.

    Everything around the number (currency symbol, magnitude suffix such
    as ``k`` or ``M``) is ignored, so the result is NOT a normalized
    amount. It is only reliable for telling zero from non-zero values and
    for comparing strings that share the same suffix.

    A decimal part is honoured up to three digits ("$39.8M" -> 39800).
    Reading only the integer part would turn "$0.5M" into 0 and make it
    indistinguishable from "$0".

    Args:
        strg: Text containing a number, e.g. "$39.8M" or "$0".

    Returns:
        int: The parsed number times 1000; digits beyond the third decimal
        are truncated.

    Raises:
        ValueError: If ``strg`` contains no digits.
    """
    # Raw string: a backslash-d inside a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    match = re.search(r'(\d+)(?:\.(\d+))?', strg)
    if match is None:
        raise ValueError("no numeric value found in {!r}".format(strg))

    whole = match.group(1)
    fraction = match.group(2) or ''
    # Pad/truncate the fractional digits to exactly three so that
    # whole * 1000 + thousandths is computed with integers only (no float
    # rounding involved).
    thousandths = (fraction + '000')[:3]
    return int(whole) * 1000 + int(thousandths)

In [8]:
# Convert the money and year columns with the helper functions; `assign`
# returns a new dataframe, so df_cat itself is left untouched.
df_regout = df_cat.assign(
    total_money_raised=df_cat['total_money_raised'].apply(str_to_int),
    founded_year=df_cat['founded_year'].apply(float_to_int),
)

# Keep only the rows that match the criterion: parsed money raised equal
# to 0, a non-zero number of employees and at least one office listed.
df_regout = df_regout.loc[df_regout['total_money_raised'] == 0]
df_regout = df_regout.loc[df_regout['number_of_employees'] != 0]
df_regout = df_regout.loc[df_regout['offices'].map(len) > 0]

# Check the size of the resulting dataset.
print(len(df_regout))

2289


In [9]:
#Función para la creación de los valores de ubicación en formato GeoJson.
def nested_to_list(dicts):
    """Build a GeoJSON Point from the first office in ``dicts``.

    Only the first office is inspected; any further entries are ignored
    and therefore cannot raise if they are malformed.

    Args:
        dicts: Sequence of office dicts; the first one is expected to
            carry 'longitude' and 'latitude' keys.

    Returns:
        dict | int: ``{"type": "Point", "coordinates": [longitude, latitude]}``
        when both coordinates of the first office are present, otherwise
        the sentinel ``0`` (kept as-is so callers can keep filtering rows
        with ``!= 0``).
    """
    if not dicts:
        return 0

    first_office = dicts[0]
    longitude = first_office.get('longitude')
    latitude = first_office.get('latitude')

    # Both values are required: a Point with a missing latitude is as
    # unusable as one with a missing longitude.
    if longitude is None or latitude is None:
        return 0

    return {"type": "Point", "coordinates": [longitude, latitude]}

In [10]:
# Replace each row's list of offices with the value returned by
# nested_to_list (a GeoJSON Point, or 0 when no usable location exists).
df_colout = df_regout.assign(offices=df_regout['offices'].apply(nested_to_list))

In [11]:
# Drop the rows whose 'offices' value is the 0 marker (no GeoJSON Point).
has_location = df_colout['offices'] != 0
df_geoout = df_colout[has_location].copy()

# Check the size of the resulting dataset.
print(len(df_geoout))

# Preview of the final dataset before the .json file is generated.
display(df_geoout.head())

1702


Unnamed: 0,category_code,founded_year,name,number_of_employees,offices,total_money_raised
57,web,2003,Ikan,5.0,"{'type': 'Point', 'coordinates': [-73.563878, ...",0
62,web,2007,Pownce,6.0,"{'type': 'Point', 'coordinates': [-122.397224,...",0
86,web,2005,ClipBlast!,15.0,"{'type': 'Point', 'coordinates': [-118.756618,...",0
93,web,2006,Zamzar,2.0,"{'type': 'Point', 'coordinates': [-1.3610845, ...",0
112,web,2007,CrowdVine,10.0,"{'type': 'Point', 'coordinates': [-122.511687,...",0


In [12]:
# Write the dataset as line-delimited JSON (one record per line); the file
# is the input for a new MongoDB collection.
df_geoout.to_json(path_or_buf='cost.json', orient="records", lines=True)

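### Se importa .json en MongoDB para generar collection y aplicar operador geoespacial '$near'
Nombre de la collection creada: **cost**

Nota: el operador '$near' con '$geometry' requiere un índice geoespacial `2dsphere` sobre el campo `offices` de la collection **cost**; hay que crearlo después de importar el fichero y antes de ejecutar las celdas siguientes.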

In [13]:
#Se define función para aplicar operador geoespacial '$near'
def concentration(company, max_distance=10000, collection=None):
    """Return the documents located near ``company`` according to '$near'.

    Runs a '$near' query on the 'offices' field, centred on the GeoJSON
    coordinates stored in ``company['offices']['coordinates']``.

    Args:
        company: Document whose 'offices' value is a GeoJSON Point
            (``{"type": "Point", "coordinates": [longitude, latitude]}``).
        max_distance: Value sent as '$maxDistance'. For GeoJSON points
            MongoDB interprets it in meters, so the default 10000 is a
            10 km radius.
        collection: Object exposing ``find(query)``. Defaults to the
            notebook-level ``db.cost`` collection.

    Returns:
        list: Every document returned by the query. When ``company`` is
        itself stored in the queried collection it is normally part of
        the result (distance 0).

    Note:
        MongoDB only answers '$near' queries on a field that has a
        geospatial index.
    """
    if collection is None:
        # Resolved at call time so the function keeps using the notebook's
        # global connection when no collection is passed explicitly.
        collection = db.cost

    coordinates = company['offices']['coordinates']
    query = {
        "offices": {
            "$near": {
                "$geometry": {
                    "type": "Point",
                    "coordinates": [coordinates[0], coordinates[1]],
                },
                "$minDistance": 0,
                "$maxDistance": max_distance,
            }
        }
    }
    return list(collection.find(query))

In [14]:
# For every document of the `cost` collection, count how many documents
# the '$near' query returns around it.
cursor_geojson = db.cost.find()
clusters = [len(concentration(element)) for element in cursor_geojson]

# Number of documents processed (total iterations).
print(len(clusters), '\n')

# Number of matches for each document.
print(clusters, '\n')

# Highest number of matches.
print(max(clusters), '\n')

1702 

[4, 1, 3, 62, 3, 27, 28, 1, 62, 40, 15, 1, 1, 62, 19, 9, 1, 17, 1, 3, 13, 3, 1, 2, 17, 6, 7, 62, 6, 11, 12, 1, 1, 5, 10, 3, 61, 3, 4, 1, 2, 20, 62, 2, 2, 23, 1, 2, 10, 6, 16, 15, 23, 22, 18, 9, 2, 6, 6, 7, 61, 4, 15, 82, 82, 5, 6, 4, 29, 15, 6, 1, 3, 3, 14, 63, 21, 12, 1, 1, 5, 2, 1, 2, 1, 6, 13, 1, 1, 7, 23, 21, 3, 13, 1, 22, 19, 23, 47, 82, 12, 23, 1, 25, 1, 3, 2, 61, 80, 29, 61, 3, 1, 82, 2, 3, 13, 3, 61, 24, 16, 1, 7, 8, 9, 16, 16, 82, 2, 2, 1, 3, 61, 6, 2, 20, 23, 1, 3, 8, 1, 7, 5, 6, 43, 1, 42, 38, 29, 15, 7, 18, 16, 35, 1, 1, 5, 2, 11, 4, 3, 1, 61, 35, 82, 82, 34, 82, 1, 51, 1, 1, 3, 20, 1, 1, 62, 23, 29, 12, 1, 8, 1, 1, 1, 81, 17, 5, 2, 1, 7, 1, 1, 1, 22, 83, 17, 2, 2, 62, 6, 1, 3, 19, 3, 1, 1, 36, 14, 61, 12, 62, 5, 82, 16, 1, 35, 6, 4, 37, 9, 29, 8, 18, 1, 9, 21, 27, 8, 3, 18, 29, 13, 1, 1, 1, 62, 5, 1, 21, 8, 6, 1, 2, 17, 7, 35, 21, 3, 7, 1, 82, 7, 17, 82, 1, 21, 5, 3, 3, 3, 2, 45, 1, 5, 21, 2, 4, 1, 25, 3, 1, 6, 1, 2, 24, 2, 3, 2, 10, 5, 63, 2, 2, 22, 18, 4, 2, 6, 62

In [15]:
# Collect the coordinates of every document whose match count equals the
# maximum found in `clusters`.
target = list(db.cost.find())

# Documents and counts are paired by position. This assumes find() yields
# the documents in the same order as when `clusters` was computed -- TODO
# confirm. A length mismatch would make the pairing meaningless.
if len(target) != len(clusters):
    raise ValueError(
        "expected one count per document, got {} documents and {} counts".format(
            len(target), len(clusters)))

# Computed once instead of on every iteration; `default` keeps an empty
# `clusters` producing an empty result rather than raising.
highest_count = max(clusters, default=None)
targets = [
    [doc['offices']['coordinates'][0], doc['offices']['coordinates'][1]]
    for doc, count in zip(target, clusters)
    if count == highest_count
]

# Show the resulting coordinate pairs.
print(targets)

[[-74.00118, 40.718871], [-73.991924, 40.728623], [-73.9447994, 40.727434], [-74.005398, 40.7166822], [-73.976169, 40.727763], [-73.9867361, 40.7297485], [-73.9893359, 40.7277434]]


In [16]:
# Build the dataframe that will be exported for Tableau: one row per
# coordinate pair, tagged with the name of this criterion.
long_lat = ['Longitude', 'Latitude']
df_cost = pd.DataFrame(targets, columns=long_lat).assign(Criteria='Cost')

# Show the final dataframe for this criterion.
display(df_cost)

Unnamed: 0,Longitude,Latitude,Criteria
0,-74.00118,40.718871,Cost
1,-73.991924,40.728623,Cost
2,-73.944799,40.727434,Cost
3,-74.005398,40.716682,Cost
4,-73.976169,40.727763,Cost
5,-73.986736,40.729748,Cost
6,-73.989336,40.727743,Cost


In [17]:
# Export the result as line-delimited JSON for the Tableau visualization.
df_cost.to_json(path_or_buf='df_cost_10km.json', orient="records", lines=True)