# Tableau Project (Criterio 3: df_communications)

In [None]:
#Importo librerias.
import pymongo
import pandas as pd
import re

#Creo cursor y dataframe inicial.
MongoClient = pymongo.MongoClient
client = MongoClient()
db = client.companies
cursor = db.companies.find()
data = list(cursor)
df = pd.DataFrame(data)

In [2]:
#Análisis de atributos
col_float = ['deadpooled_day','deadpooled_month','deadpooled_year','founded_day','founded_month',
             'founded_year','number_of_employees']
col_obj = ['_id','acquisition','acquisitions','alias_list','blog_feed_url','blog_url','category_code',
           'competitions','created_at','crunchbase_url','deadpooled_url','description','email_address',
        'external_links','funding_rounds','homepage_url','image','investments','ipo','milestones', 
           'name','offices','overview','partners','permalink','phone_number','products','providerships',
        'relationships','screenshots','tag_list','total_money_raised','twitter_username','updated_at',
           'video_embeds']
col_drop1 = ['deadpooled_day','deadpooled_month','founded_day','founded_month','alias_list','blog_feed_url',
             'blog_url','created_at','crunchbase_url','deadpooled_url','email_address','external_links',
            'image','permalink','phone_number','screenshots','tag_list','updated_at','video_embeds']
col_drop2 = ['acquisition','acquisitions','competitions','description','funding_rounds','homepage_url',
             'investments','ipo','milestones','partners','products','providerships','relationships',
             'twitter_username','overview',]
col_ok = ['deadpooled_year','founded_year','number_of_employees','_id','category_code','name','offices',
          'total_money_raised']

In [3]:
#Primera fase de limpieza de datos.
df_drop1 = df.drop(col_drop1, axis=1)
df_drop2 = df_drop1.drop(col_drop2, axis=1)

#Eliminación de registros correspondientes a empresas en 'deadpool'.
df_deadnull = df_drop2[df_drop2['deadpooled_year'].isnull()]

#Eliminación de registros con valor 'null'
df_nulls = df_deadnull.dropna(subset=['founded_year','number_of_employees','name','offices',
          'total_money_raised'])
#Verificación de la longitud del nuevo dataset.
print(len(df_nulls))

7934


In [4]:
#Elección de columnas relevantes para el criterio.
col_cat = ['games_video','web','software','mobile','ecommerce','network_hosting','hardware','semiconductor']
df_cat = df_nulls[df_nulls['category_code'].isin(col_cat)]\
.drop(['_id','deadpooled_year'], axis=1)

#Verificación de la longitud del nuevo dataset.
print(len(df_cat))

5052


In [5]:
#Definición de funciones para modificación de tipo de datos.

#Transforma un dato tipo float a uno tipo int.
def float_to_int(flt):
    integer = int(flt)
    return integer

#Transforma un dato tipo str en int devolviendo solo el número.
def str_to_int(strg):
    string = re.findall('\d+', strg )
    integer = int(string[0])*1000
    return integer

In [6]:
#Aplicación de funciones sobre los registros.
df_regout = df_cat.copy()
df_regout['total_money_raised'] = df_regout['total_money_raised'].apply(str_to_int)
df_regout['founded_year'] = df_regout['founded_year'].apply(float_to_int)

#Eliminación de registros irrelevantes según criterio.
#df_regout = df_regout[df_regout.total_money_raised == 0]
df_regout = df_regout[df_regout.number_of_employees > 0]
df_regout = df_regout[df_regout['offices'].map(len) > 0]

#Verificación de la longitud del nuevo dataset.
print(len(df_regout))

4155


In [7]:
#Función para la creación de los valores de ubicación en formato GeoJson.
def nested_to_list(dicts):
    lat_lon = [{"type":"Point","coordinates":[l['longitude'],l['latitude']]} for l in dicts]
    lst = lat_lon[0]
    long = lst['coordinates'][0]
    if long != None:
        return lst
    else:
        return 0

In [8]:
#Aplicación de función para la creación de los valores de ubicación en formato GeoJson.
df_colout = df_regout.copy()
df_colout['offices'] = df_colout['offices'].apply(nested_to_list)

In [9]:
#Eliminación de valores de GeoJson nulos.
df_geoout = df_colout.copy()
df_geoout = df_geoout[df_geoout.offices != 0]

#Verificación de la longitud del nuevo dataset.
print(len(df_geoout))

3108


In [10]:
#Generación de dataframe con empresas A-round start-up (10-30 empleados) y menos de 500k euros raised
df_startup = df_geoout.copy()
df_startup = df_startup[df_startup.total_money_raised <= 500000]
df_startup = df_startup[df_startup.number_of_employees <= 30]
df_startup = df_startup[df_startup.number_of_employees >= 10]

#Verificación de la longitud del nuevo dataset.
print(len(df_startup))

845


In [11]:
#Generación de dataframe con empresas de dimensión "big" company (>50 empleados)
df_bigcomp = df_geoout.copy()
df_bigcomp = df_bigcomp[df_bigcomp.number_of_employees >= 50]

#Verificación de la longitud del nuevo dataset.
print(len(df_bigcomp))

660


In [12]:
#Generación de dataframe con ratio equilibrado entre big companies y startups
df_ratio = df_startup.append(df_bigcomp)

#Verificación de la longitud del nuevo dataset.
print(len(df_ratio))

#Vista del dataset definitivo antes de generar .json
display(df_ratio.head())

1505


Unnamed: 0,category_code,founded_year,name,number_of_employees,offices,total_money_raised
6,web,2006,Geni,18.0,"{'type': 'Point', 'coordinates': [-118.393064,...",16000
54,games_video,2007,AdaptiveBlue,15.0,"{'type': 'Point', 'coordinates': [-74.3372, 40...",24000
56,games_video,2004,Pando Networks,23.0,"{'type': 'Point', 'coordinates': [-73.99873, 4...",11000
69,web,2007,SodaHead,25.0,"{'type': 'Point', 'coordinates': [-119.306607,...",12000
86,web,2005,ClipBlast!,15.0,"{'type': 'Point', 'coordinates': [-118.756618,...",0


In [13]:
#Creación del fichero .json para generar nueva 'collection' en MongoDB
df_ratio.to_json('communications.json', orient="records", lines=True)

### Se importa .json en MongoDB para generar collection y aplicar operador geoespacial '$near'
Nombre de la collection creada: **communications**

In [30]:
#Se define función para aplicar operador geoespacial '$near'
def concentration(company):
    cursor_near = db.communications.find({
      "offices": {
        "$near": {
          "$geometry": {
            "type": "Point",
            "coordinates": [company['offices']['coordinates'][0], company['offices']['coordinates'][1]]
          },
          "$minDistance": 0,
          "$maxDistance":5000
        }
      }
    })
    return list(cursor_near)

In [31]:
#Aplicación de la función para cada documento de la colección respecto al resto de documentos
cursor_geojson = db.communications.find()
clusters = []
for element in cursor_geojson:
    clusters.append(len(concentration(element)))

#Verificación de la longitud del cursor (total de iteraciones).
print(len(clusters),'\n')

#Visualización del total de coincidencias para cada documento.
print(clusters,'\n')

#Máximo número de coincidencias.
print(max(clusters),'\n')

1505 

[2, 1, 2, 83, 10, 7, 26, 25, 1, 80, 82, 3, 27, 20, 14, 24, 82, 80, 24, 2, 3, 80, 32, 2, 17, 1, 82, 2, 33, 82, 1, 1, 11, 18, 10, 1, 17, 14, 69, 21, 1, 2, 16, 5, 24, 1, 14, 11, 10, 8, 1, 12, 24, 13, 5, 4, 80, 34, 2, 9, 4, 2, 72, 81, 7, 32, 80, 1, 12, 3, 10, 80, 1, 2, 11, 25, 19, 9, 1, 5, 5, 28, 24, 82, 7, 2, 24, 2, 1, 1, 1, 38, 31, 15, 9, 2, 27, 81, 7, 83, 39, 13, 8, 1, 85, 17, 8, 77, 76, 10, 5, 1, 8, 1, 12, 17, 7, 1, 44, 5, 36, 10, 2, 1, 5, 2, 4, 7, 73, 1, 4, 37, 1, 30, 1, 1, 1, 9, 83, 25, 1, 2, 1, 2, 82, 1, 11, 4, 8, 2, 11, 1, 7, 4, 1, 7, 10, 1, 4, 5, 2, 17, 2, 27, 3, 8, 1, 1, 1, 80, 81, 1, 4, 2, 1, 1, 2, 72, 4, 82, 1, 5, 80, 1, 2, 2, 5, 27, 1, 6, 10, 7, 2, 1, 26, 82, 10, 9, 10, 4, 81, 40, 1, 1, 1, 42, 1, 7, 81, 3, 2, 80, 5, 1, 11, 9, 4, 6, 2, 7, 1, 32, 13, 2, 17, 56, 5, 3, 1, 80, 13, 4, 8, 1, 27, 2, 10, 2, 2, 4, 7, 82, 83, 37, 3, 9, 1, 10, 80, 23, 5, 2, 5, 4, 6, 11, 1, 2, 1, 2, 1, 10, 1, 1, 1, 1, 73, 1, 1, 4, 16, 3, 4, 3, 1, 3, 1, 9, 1, 80, 29, 1, 2, 2, 1, 2, 8, 53, 1, 17, 7, 1

In [32]:
#Extracción de coordenadas geográficas con mayor número de coincidencias dentro del criterio.
target = list(db.communications.find())
targets = []
for i in range(len(clusters)):
    if clusters[i] == max(clusters):
        targets.append([target[i]['offices']['coordinates'][0], target[i]['offices']['coordinates'][1]])

#Visualización de los pares de coordenadas resultantes.
print(targets)

[[-73.981534, 40.738832]]


In [33]:
#Creación de dataframe para Plateau.
long_lat = ['Longitude','Latitude']
df_communications = pd.DataFrame(targets, columns=long_lat)
df_communications['Criteria'] = 'Communications'

#Visualización del dataframe final dentro del criterio.
display(df_communications)

Unnamed: 0,Longitude,Latitude,Criteria
0,-73.981534,40.738832,Communications


In [34]:
#Creación de fichero .json para visualización en Plateau.
df_communications.to_json('df_communications.json', orient="records", lines=True)