In [23]:
# Importacion de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

%matplotlib inline

plt.style.use('default') # para graficos matplotlib
plt.rcParams['figure.figsize'] = (10, 6)

sns.set(style="whitegrid") # grid seaborn

pd.options.display.float_format = '{:20,.3f}'.format # notacion output

# Acceso a set de datos

In [24]:
path = "/home/seba/Escritorio/Datos/TP1/data/"
df_props_full = pd.read_csv(path + "train.csv")

# Información básica

In [25]:
df_props_full.duplicated().value_counts()

False    240000
dtype: int64

In [26]:
# Cantidad propiedades
df_props_full.shape

(240000, 23)

In [27]:
# Informacion de las columnas
df_props_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240000 entries, 0 to 239999
Data columns (total 23 columns):
id                            240000 non-null int64
titulo                        234613 non-null object
descripcion                   238381 non-null object
tipodepropiedad               239954 non-null object
direccion                     186928 non-null object
ciudad                        239628 non-null object
provincia                     239845 non-null object
antiguedad                    196445 non-null float64
habitaciones                  217529 non-null float64
garages                       202235 non-null float64
banos                         213779 non-null float64
metroscubiertos               222600 non-null float64
metrostotales                 188533 non-null float64
idzona                        211379 non-null float64
lat                           116512 non-null float64
lng                           116512 non-null float64
fecha                         240

<span style="color:red">* Latitud y Longitud tienen gran cantidad de valores nulos

In [28]:
df_props_full.describe()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
count,240000.0,196445.0,217529.0,202235.0,213779.0,222600.0,188533.0,211379.0,116512.0,116512.0,240000.0,240000.0,240000.0,240000.0,240000.0,240000.0
mean,149969.382,8.116,2.902,1.547,2.132,174.017,176.765,2423467.825,20.697,-99.509,0.062,0.055,0.087,0.444,0.397,2530838.346
std,86634.58,9.554,0.897,0.854,0.913,98.153,94.427,10567938.364,3.138,9.744,0.242,0.228,0.282,0.497,0.489,2152551.798
min,1.0,0.0,1.0,0.0,1.0,15.0,15.0,22.0,-100.887,-125.859,0.0,0.0,0.0,0.0,0.0,310000.0
25%,74930.75,0.0,2.0,1.0,1.0,90.0,102.0,24890.0,19.36,-100.978,0.0,0.0,0.0,0.0,0.0,952772.5
50%,149875.5,5.0,3.0,2.0,2.0,153.0,155.0,56383.0,19.543,-99.24,0.0,0.0,0.0,0.0,0.0,1850000.0
75%,225016.5,10.0,3.0,2.0,3.0,240.0,238.0,87838.0,20.74,-99.135,0.0,0.0,0.0,1.0,1.0,3390000.0
max,299999.0,80.0,10.0,3.0,4.0,439.0,439.0,50003999.0,83.026,121.036,1.0,1.0,1.0,1.0,1.0,12525000.0


<span style="color:red">
    * Gimnasio, usosmultiples, piscina, escuelascercanas y centroscomerciales son todos valores binarios <br />
    * Antiguedad, habitaciones, garages y baños se pueden representar en UINT8 <br />
    * Metroscubiertos y metrostotales se pueden representar con UINT16<br />
    * id, idzona y precio se pueden representar con UINT32<br \>
    * Latitud y Longitud tienen valores FLOAT. Si se redondea se puede convertir a INT8

In [29]:
df_props_full.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.000,1.000,...,23533.000,,,2015-08-23 00:00:00,0.000,0.000,0.000,0.000,0.000,2273000.000
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.000,3.000,2.000,...,24514.000,19.310,-99.228,2013-06-28 00:00:00,0.000,0.000,0.000,1.000,1.000,3600000.000
2,247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.000,3.000,2.000,...,48551.000,,,2015-10-17 00:00:00,0.000,0.000,0.000,0.000,0.000,1200000.000
3,209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.000,2.000,1.000,...,53666.000,19.302,-99.688,2012-03-09 00:00:00,0.000,0.000,0.000,1.000,1.000,650000.000
4,185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.000,2.000,1.000,...,47835.000,,,2016-06-07 00:00:00,0.000,0.000,0.000,0.000,0.000,1150000.000
5,126147,departamento en venta taxqueña,"amplio departamento, estancia de sala y comedo...",Apartamento,Condominio Tlalpan 2B,Coyoacán,Distrito Federal,5.000,2.000,1.000,...,23650.000,19.301,-99.148,2014-03-18 00:00:00,0.000,0.000,0.000,0.000,1.000,1100000.000
6,139233,de oportunidad casa en san lorenzo,"ubicada en esquina, pertenece san lorenzo agen...",Casa,,Oaxaca de Juárez,Oaxaca,,3.000,1.000,...,73510.000,17.144,-96.804,2016-02-23 00:00:00,0.000,0.000,0.000,0.000,0.000,1150000.000
7,5013,casa emilia en venta en selvamar playa del carmen,casa emilia en venta playa del carmenfracciona...,Casa,condominio el trebol,Playa del Carmen,Quintana Roo,2.000,4.000,2.000,...,130510.000,20.673,-87.038,2016-10-20 00:00:00,0.000,0.000,0.000,0.000,0.000,4200000.000
8,44962,pre- venta preciosos depas 2 recamaras con sub...,<p>pre-venta de preciosos departamento ecologi...,Apartamento,BUENAVISTA DEPTOS CON SUBSIDIO,Villa de Alvarez,Colima,1.000,2.000,1.000,...,9010.000,,,2014-01-06 00:00:00,0.000,0.000,0.000,1.000,1.000,310000.000
9,134537,terreno,"terreno de 5.500m2 bardeado, uso de suelo h-20...",Terreno,Av. Morelos,Ixtapaluca,Edo. de México,,,,...,59171.000,19.316,-98.887,2016-12-22 00:00:00,0.000,0.000,0.000,0.000,0.000,6200000.000


# Conversión de tipos

In [30]:
df_props_full['fecha'] = pd.to_datetime(df_props_full['fecha'])

In [31]:
# Convierto todos los valores 1/0 a uint8
df_props_full['gimnasio'] = df_props_full['gimnasio'].astype('uint8')
df_props_full['usosmultiples'] = df_props_full['usosmultiples'].astype('uint8')
df_props_full['piscina'] = df_props_full['piscina'].astype('uint8')
df_props_full['escuelascercanas'] = df_props_full['escuelascercanas'].astype('uint8')
df_props_full['centroscomercialescercanos'] = df_props_full['centroscomercialescercanos'].astype('uint8')

In [32]:
# Convierto los representables en uint8. Utilizo el tipo de pandas UInt8Dtype para evitar conflicto con NaN
df_props_full['antiguedad'] = df_props_full['antiguedad'].astype(pd.UInt8Dtype())
df_props_full['habitaciones'] = df_props_full['habitaciones'].astype(pd.UInt8Dtype())
df_props_full['garages'] = df_props_full['garages'].astype(pd.UInt8Dtype())
df_props_full['banos'] = df_props_full['banos'].astype(pd.UInt8Dtype())

In [33]:
# Convierto los representables en uint16. Utilizo el tipo de pandas UInt16Dtype para evitar conflicto con NaN
df_props_full['metroscubiertos'] = df_props_full['metroscubiertos'].astype(pd.UInt16Dtype())
df_props_full['metrostotales'] = df_props_full['metrostotales'].astype(pd.UInt16Dtype())

In [34]:
# Convierto los representables en uint32. Utilizo el tipo de pandas UInt32Dtype para evitar conflicto con NaN
df_props_full['id'] = df_props_full['id'].astype(pd.UInt32Dtype())
df_props_full['idzona'] = df_props_full['idzona'].astype(pd.UInt32Dtype())
df_props_full['precio'] = df_props_full['precio'].astype(pd.UInt32Dtype())

In [35]:
df_props_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240000 entries, 0 to 239999
Data columns (total 23 columns):
id                            240000 non-null UInt32
titulo                        234613 non-null object
descripcion                   238381 non-null object
tipodepropiedad               239954 non-null object
direccion                     186928 non-null object
ciudad                        239628 non-null object
provincia                     239845 non-null object
antiguedad                    196445 non-null UInt8
habitaciones                  217529 non-null UInt8
garages                       202235 non-null UInt8
banos                         213779 non-null UInt8
metroscubiertos               222600 non-null UInt16
metrostotales                 188533 non-null UInt16
idzona                        211379 non-null UInt32
lat                           116512 non-null float64
lng                           116512 non-null float64
fecha                         240000 non-nu

# Conversion MEX a USD

In [36]:
df_dollar = pd.read_csv(path + 'dollar.csv')
df_dollar = df_dollar.dropna()
df_dollar['Cierre'] = pd.to_numeric(df_dollar['Cierre'])
df_dollar['Cierre'] = df_dollar['Cierre'].round(3)
df_dollar['Fecha'] = pd.to_datetime(df_dollar['Fecha'], format='%d.%m.%Y')
df_dollar = df_dollar.set_index('Fecha')
df_dollar = df_dollar.loc[:, 'Cierre'].to_frame()

In [37]:
# Agrego fechas faltantes (Sabados y Domingos) con valor 0
idx = pd.date_range(start='2011-12-12', end='2017-01-31')
df_dollar = df_dollar.reindex(idx, fill_value=0)

In [38]:
# Cuando se trata de una fecha que corresponde a un Sabado o Domingo no se tiene infromación sobre Cierre
# Le asigno el valor correspondiente al Viernes previo
for i in range(0, len(df_dollar)):
    if (df_dollar.iloc[i]['Cierre'] == 0):
        df_dollar.iloc[i]['Cierre'] = df_dollar.iloc[i-1]['Cierre']

In [39]:
df_props_full['fecha'] = df_props_full['fecha'].apply(lambda x: x.replace(hour=0, minute=0, second=0)) # Seteo tiempo a 00:00:00 para join
df_props_full = df_props_full.set_index('fecha')
df_props_full = df_props_full.join(df_dollar, how='left')
df_props_full = df_props_full.reset_index()
df_props_full.rename(columns = {'index' : 'fecha'}, inplace=True)

In [40]:
df_props_full.rename(columns = {'Cierre' : 'MEX_to_USD', 'precio' : 'Precio_MEX'}, inplace=True)
df_props_full['Precio_USD'] = df_props_full['Precio_MEX'] * df_props_full['MEX_to_USD']
df_props_full['Precio_USD'] = df_props_full['Precio_USD'].astype(int)
df_props_full.head()

Unnamed: 0,fecha,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,...,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,Precio_MEX,MEX_to_USD,Precio_USD
0,2012-01-01,270716,,ubicado en el mejor desarrollo residencial de ...,Apartamento,Bosques de Oyameles TII 3A,Morelia,Michoacán,3,2,...,,,0,0,0,1,0,950000,0.072,68400
1,2012-01-01,216691,casa en venta en valle dorado,preciosa casa de dos plantas con jardin. fracc...,Casa,Amatista,San Luis Potosí,San luis Potosí,5,3,...,22.131,-100.935,0,0,0,0,0,970000,0.072,69840
2,2012-01-01,120625,,"en esquina, excelente vista, finos acabados, j...",Casa,Naolinco 326,Querétaro,Querétaro,0,3,...,,,0,0,0,1,1,6500000,0.072,467999
3,2012-01-01,147802,,"residencia de lujo, totalmente amueblada con h...",Casa en condominio,Manzanillo 8,Chapala,Jalisco,8,3,...,,,0,0,0,1,1,3590000,0.072,258479
4,2012-01-01,299053,casa en venta en universidades,casa en calle valle de anahuac en lomas del va...,Casa,Valle de Anahuac 74,Puebla,Puebla,1,3,...,,,0,0,0,1,1,1120000,0.072,80640
