In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style = "whitegrid")

import geopandas
from shapely.geometry import LineString, Point, Polygon
import shapely.wkt
import folium

In [2]:
def from_wkt(df, wkt_column):    
    # para transformar los wkt using shapely para convertir en objeto geografico
    # clase de shapely
    # geopandas no trae un method para import este formato
    df["geometry"]= df[wkt_column].apply(shapely.wkt.loads)     
    # ahora pasa a geopandas con geometry=serie anterior. un poligono para cada
    # registro de serie
    gdf = geopandas.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry='geometry')  
    return gdf

def from_x_y(df, x, y):
    # lista de points de pares ordenados, set projection (crs)
    gdf = geopandas.GeoDataFrame(df.drop([x, y], axis=1),
                                crs={'init': 'epsg:4326'},
                                geometry=[Point(xy) for xy in zip(df[x], df[y])])  
    return gdf

def to_gkba(geodf):
    '''to transform crs to gkba, utilizada por Buenos Aires Data para medir distancias'''
    new_geodf = geodf.to_crs(crs = "+proj=tmerc +lat_0=-34.629269 +lon_0=-58.4633 +k=0.9999980000000001 +x_0=100000 +y_0=100000 +ellps=intl +units=m +no_defs")
    return new_geodf

In [3]:
# Read data
# Lectura del dataset usando como index la primera columna
props_df = pd.read_csv('properatti_clean&complete.csv', index_col=0)
barrios_df = pd.read_csv("barrios.csv", encoding='latin1')
comisarias_df = pd.read_csv("comisarias.csv", encoding = "latin1")
subtes_df = pd.read_csv("subte.csv", encoding='latin1')
metrobus_df = pd.read_csv("metrobus.csv", encoding='latin1')
trenes_df = pd.read_csv("trenes.csv", encoding='latin1', delimiter=';')

In [4]:
props_df.columns

Index(['id', 'barrio', 'lat', 'lon', 'total', 'cubierta', 'precio', 'piso',
       'habs', 'expensas', 'descripcion', 'ambientes', 'habitaciones_final',
       'pileta', 'seguridad', 'parrilla', 'balcon', 'cochera', 'nuevo'],
      dtype='object')

In [5]:
# Cambiar , por . en comisarias
comisarias_df['X'] = comisarias_df['X'].apply(lambda x: float(x.replace(',', '.')))
comisarias_df['Y'] = comisarias_df['Y'].apply(lambda x: float(x.replace(',', '.')))


In [6]:
barrios_df.head()

Unnamed: 0,WKT,BARRIO,COMUNA,PERIMETRO,AREA
0,"POLYGON ((-58.4528200492791 -34.5959886570639,...",CHACARITA,15.0,7725.695228,3118101.0
1,"POLYGON ((-58.4655768128541 -34.5965577078058,...",PATERNAL,15.0,7087.513295,2229829.0
2,"POLYGON ((-58.4237529813037 -34.5978273383243,...",VILLA CRESPO,15.0,8132.699348,3613584.0
3,"POLYGON ((-58.4946097568899 -34.6148652395239,...",VILLA DEL PARQUE,11.0,7705.389797,3399596.0
4,"POLYGON ((-58.4128700313089 -34.6141162515854,...",ALMAGRO,5.0,8537.901368,4050752.0


In [7]:
subtes_df.head()

Unnamed: 0,X,Y,ID,ESTACION,LINEA
0,-58.398928,-34.63575,1.0,CASEROS,H
1,-58.40097,-34.629376,2.0,INCLAN,H
2,-58.402323,-34.623092,3.0,HUMBERTO 1°,H
3,-58.404732,-34.615242,4.0,VENEZUELA,H
4,-58.406036,-34.608935,5.0,ONCE - 30 DE DICIEMBRE,H


In [8]:
metrobus_df.head()

Unnamed: 0,long,lat,id,nombre,calle1,calle2,intersec,inaugura,lin_sent_n,lin_sent_s,metrobus
0,-58.526113,-34.638267,1,Liniers,FRANCISCO DE VIEDMA,,Entre Casco y Gana,si,34 - 109 - 166,34 - 109 - 166,Metrobus Juan B. Justo
1,-58.520143,-34.633497,2,Velez Sarsfield,JUSTO JUAN B.,ALVAREZ JONTE AV,JUSTO JUAN B. & ALVAREZ JONTE AV,si,34 - 99 - 166 - 172,34 - 99 - 166 - 172,Metrobus Juan B. Justo
2,-58.513913,-34.633458,3,Polideportivo Velez Sarsfield,JUSTO JUAN B.,GARCIA JUAN AGUSTIN,JUSTO JUAN B. & GARCIA JUAN AGUSTIN,si,34 - 99 - 166 - 172,34 - 99 - 166 - 172,Metrobus Juan B. Justo
3,-58.50673,-34.634341,4,Cortina,JUSTO JUAN B.,CORTINA,JUSTO JUAN B. & CORTINA,si,34 - 99 - 166 - 172,34 - 99 - 166 - 172,Metrobus Juan B. Justo
4,-58.501042,-34.632208,5,Av. Lope de Vega,JUSTO JUAN B.,LOPE DE VEGA,JUSTO JUAN B. & LOPE DE VEGA,si,34 - 99 - 166 - 172,34 - 99 - 166 - 172,Metrobus Juan B. Justo


In [9]:
comisarias_df.head()

Unnamed: 0,X,Y,NOMBRE
0,-58.468944,-34.683121,COMISARIA 52
1,-58.474649,-34.679169,COMISARIA 48
2,-58.501166,-34.661994,COMISARIA 42
3,-58.431981,-34.660395,COMISARIA 36
4,-58.40283,-34.64195,COMISARIA COMUNA 4


In [10]:
trenes_df.head()

Unnamed: 0,LAT,LNG,ID,NOMBRE,EMPRESA,LINEA,LINEA_2,BARRIO,COMUNA
0,-34.571334,-58.424295,2,3 DE FEBRERO,TBA - TRENES DE BS AS S.A.,MITRE,F.C.G.B.M.,PALERMO,COMUNA 14
1,-34.567571,-58.463056,4,BELGRANO R,TBA - TRENES DE BS AS S.A.,MITRE,F.C.G.B.M.,BELGRANO,COMUNA 13
2,-34.562549,-58.435864,6,LISANDRO DE LA TORRE,TBA - TRENES DE BS AS S.A.,MITRE,F.C.G.B.M.,PALERMO,COMUNA 14
3,-34.558443,-58.449498,7,BELGRANO C,TBA - TRENES DE BS AS S.A.,MITRE,F.C.G.B.M.,BELGRANO,COMUNA 13
4,-34.548895,-58.4624,8,NUÃEZ,TBA - TRENES DE BS AS S.A.,MITRE,F.C.G.B.M.,NUÃEZ,COMUNA 13


In [11]:
props_df.head()

Unnamed: 0,id,barrio,lat,lon,total,cubierta,precio,piso,habs,expensas,descripcion,ambientes,habitaciones_final,pileta,seguridad,parrilla,balcon,cochera,nuevo
0,2,Mataderos,-34.652262,-58.522982,55.0,55.0,1309.090909,,,,2 AMBIENTES 3ER PISO LATERAL LIVING COMEDOR AM...,2.0,2.0,0,0,0,0,0,0
1,7,Belgrano,-34.559873,-58.443362,45.0,40.0,3066.666667,,,,EXCELENTE MONOAMBIENTE A ESTRENAR AMPLIO SUPER...,1.0,1.0,1,0,0,0,0,0
2,8,Belgrano,-34.559873,-58.443362,65.0,60.0,3000.0,,,,EXCELENTE DOS AMBIENTES ESTRENAR AMPLIO SUPER...,2.0,2.0,1,0,0,0,0,0
3,19,Palermo,-34.580504,-58.405874,104.0,96.0,3365.384615,,3.0,,Excelente semipiso al contra frente en Bulnes ...,,3.0,0,0,0,1,0,0
4,21,Palermo,-34.590926,-58.411665,118.0,73.0,2292.372881,,4.0,,"EXCELENTE ZONA, MULTIPLES MEDIOS DE TRANSPORTE...",,4.0,0,0,0,0,0,0


In [12]:
# Transformar los dataframes en geodataframes
barrios = from_wkt(barrios_df, 'WKT')
comisarias = from_x_y(comisarias_df, 'X', 'Y')
props = from_x_y(props_df, 'lon', 'lat')
subtes = from_x_y(subtes_df, 'X', 'Y')
metrobus = from_x_y(metrobus_df, 'long', 'lat')
trenes = from_x_y(trenes_df, 'LNG', 'LAT')

In [13]:
dfmerged=geopandas.sjoin(props, barrios, how='left')

In [14]:
dfmerged.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 15948 entries, 0 to 15947
Data columns (total 24 columns):
id                    15948 non-null int64
barrio                15948 non-null object
total                 15948 non-null float64
cubierta              15491 non-null float64
precio                15948 non-null float64
piso                  2465 non-null float64
habs                  10109 non-null float64
expensas              3956 non-null float64
descripcion           15948 non-null object
ambientes             11649 non-null float64
habitaciones_final    14495 non-null float64
pileta                15948 non-null int64
seguridad             15948 non-null int64
parrilla              15948 non-null int64
balcon                15948 non-null int64
cochera               15948 non-null int64
nuevo                 15948 non-null int64
geometry              15948 non-null object
index_right           15924 non-null float64
WKT                   15924 non-null object
BAR

In [15]:
dfmerged.rename({'BARRIO': 'barrio_real'}, axis='columns', inplace=True)

In [16]:
dfmerged[dfmerged['barrio_real'].isnull()]

Unnamed: 0,id,barrio,total,cubierta,precio,piso,habs,expensas,descripcion,ambientes,...,balcon,cochera,nuevo,geometry,index_right,WKT,barrio_real,COMUNA,PERIMETRO,AREA
1492,12290,Nuñez,83.0,58.0,2228.915663,1.0,2.0,2300.0,Alquiler de Departamento 2 AMBIENTES en NuñezA...,2.0,...,1,1,0,POINT (-58.521075 -34.5474831),,,,,,
1771,14189,Belgrano,114.0,92.0,2605.263158,,4.0,,Migueletes 800 (Caitas) - SENSACIONAL - IMPECA...,,...,1,1,0,POINT (-58.538654 -34.577168),,,,,,
1772,14190,Belgrano,69.0,65.0,2608.695652,,3.0,,Santos Dumond 2500 - 3Amb al contrafrente - MU...,,...,0,0,0,POINT (-58.538654 -34.577168),,,,,,
2028,16115,Caballito,64.0,64.0,2070.3125,,3.0,,"Oportunidad:3ambientes,lateral,excelente dispo...",3.0,...,0,0,0,POINT (-58.538654 -34.577168),,,,,,
2029,16116,Caballito,54.0,50.0,2277.777778,,2.0,,"DEPARTAMENTO ,CABALLITO 2 AMBIENTES, AMENITIES...",2.0,...,1,0,0,POINT (-58.538654 -34.577168),,,,,,
3427,24940,Villa Luro,46.0,46.0,1630.434783,,2.0,,VILLA LURO: U$S 75.000.- DEPTO. 2 AMBIENTES....,2.0,...,0,0,0,POINT (-58.538654 -34.577168),,,,,,
3471,25126,Belgrano,38.0,38.0,2236.842105,,2.0,,Zona:Residencial - Asfalto - Osn:Cloaca - Lava...,2.0,...,0,0,0,POINT (-58.5447221 -34.5786785),,,,,,
3472,25127,Nuñez,54.0,54.0,1759.259259,,2.0,,ARQ. ALBARRACIN Propiedades ** VENDE*** Cuba ...,3.0,...,0,0,0,POINT (-58.538654 -34.577168),,,,,,
3475,25131,Belgrano,160.0,160.0,3062.5,,4.0,,HERMOSO PISO SOBRE AV. DEL LIBERTADOR AL 4900...,,...,1,1,0,POINT (-58.538654 -34.577168),,,,,,
3498,25238,Villa Devoto,58.0,54.0,2241.37931,,1.0,,Excelente departamento de 3 ambientes al frent...,3.0,...,1,0,0,POINT (-58.538654 -34.577168),,,,,,


In [17]:
# Graficar para ver
m = folium.Map([-34.606359, -58.443863], zoom_start=12, tiles="OpenStreetMap")
folium.GeoJson(barrios.to_json()).add_to(m)
folium.GeoJson(dfmerged[dfmerged['barrio_real'].isnull()].to_json()).add_to(m)
m

<folium.features.GeoJson at 0x7f6f812c5cc0>

<folium.features.GeoJson at 0x7f6f8158def0>

In [18]:
# Eliminar propiedades sin geometry utilizable
print(dfmerged.shape)
dfmerged.dropna(subset=['barrio_real'], inplace=True)
print(dfmerged.shape)



(15948, 24)
(15924, 24)


In [19]:
# Transformar los dataframes para calcular distancias entre puntos

comisarias_gkba = to_gkba(comisarias)
subtes_gkba = to_gkba(subtes)
trenes_gkba = to_gkba(trenes)
metrobus_gkba = to_gkba(metrobus)
dfmerged_gkba=to_gkba(dfmerged)

In [20]:
def mindist(pt, vec):
    dist_array=np.array([pt.distance(el) for el in vec])
    return dist_array.min()
    

In [21]:
dfmerged_gkba.loc[7,'geometry'].distance(comisarias_gkba.loc[45, 'geometry'])

7374.8591222261175

In [22]:
dfmerged['dist_comisarias']=dfmerged_gkba['geometry'].apply(lambda x: mindist(x,comisarias_gkba['geometry']))

In [23]:
dfmerged['dist_subte']=dfmerged_gkba['geometry'].apply(lambda x: mindist(x,subtes_gkba['geometry']))

In [24]:
dfmerged['dist_trenes']=dfmerged_gkba['geometry'].apply(lambda x: mindist(x,trenes_gkba['geometry']))

In [25]:
dfmerged['dist_metrobus']=dfmerged_gkba['geometry'].apply(lambda x: mindist(x,metrobus_gkba['geometry']))

In [26]:
dfmerged.columns

Index(['id', 'barrio', 'total', 'cubierta', 'precio', 'piso', 'habs',
       'expensas', 'descripcion', 'ambientes', 'habitaciones_final', 'pileta',
       'seguridad', 'parrilla', 'balcon', 'cochera', 'nuevo', 'geometry',
       'index_right', 'WKT', 'barrio_real', 'COMUNA', 'PERIMETRO', 'AREA',
       'dist_comisarias', 'dist_subte', 'dist_trenes', 'dist_metrobus'],
      dtype='object')

In [28]:
dfresultado=dfmerged[['id', 'barrio_real', 'total', 'cubierta', 'precio', 'piso', 'habs', 'expensas', 'descripcion', 'ambientes', 'habitaciones_final', 'pileta',
       'seguridad', 'parrilla', 'balcon', 'cochera', 'nuevo', 'geometry', 'dist_comisarias', 'dist_subte',
       'dist_trenes', 'dist_metrobus']]

In [29]:
dfresultado.head()

Unnamed: 0,id,barrio_real,total,cubierta,precio,piso,habs,expensas,descripcion,ambientes,...,seguridad,parrilla,balcon,cochera,nuevo,geometry,dist_comisarias,dist_subte,dist_trenes,dist_metrobus
0,2,LINIERS,55.0,55.0,1309.090909,,,,2 AMBIENTES 3ER PISO LATERAL LIVING COMEDOR AM...,2.0,...,0,0,0,0,0,POINT (-58.5229825 -34.6522615),2133.629728,5444.155315,1512.704661,1578.83638
1,7,BELGRANO,45.0,40.0,3066.666667,,,,EXCELENTE MONOAMBIENTE A ESTRENAR AMPLIO SUPER...,1.0,...,0,0,0,0,0,POINT (-58.443362 -34.5598729),564.095028,1068.595967,585.038122,1019.907337
2,8,BELGRANO,65.0,60.0,3000.0,,,,EXCELENTE DOS AMBIENTES ESTRENAR AMPLIO SUPER...,2.0,...,0,0,0,0,0,POINT (-58.443362 -34.5598729),564.095028,1068.595967,585.038122,1019.907337
3,19,PALERMO,104.0,96.0,3365.384615,,3.0,,Excelente semipiso al contra frente en Bulnes ...,,...,0,0,1,0,0,POINT (-58.4058744847 -34.580503566),719.300376,991.604946,794.459912,1500.489635
4,21,RECOLETA,118.0,73.0,2292.372881,,4.0,,"EXCELENTE ZONA, MULTIPLES MEDIOS DE TRANSPORTE...",,...,0,0,0,0,0,POINT (-58.4116653 -34.590926),527.672388,300.233682,1938.001717,1494.307836


In [30]:
dfresultado.to_csv('properatti_geo.csv')