In [1]:
import pandas as pd
import numpy as np

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [4]:
train = pd.read_csv('../data/train_with_desc_full.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])
test = pd.read_csv('../data/test_with_desc_full.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])

In [5]:
rescued_coords = pd.read_csv('../data/rescueLatLongs.csv')
rescued_antiguedad = pd.read_csv('../data/imputations/antiguedad.csv')
rescued_banos = pd.read_csv('../data/imputations/banos.csv')
rescued_garages = pd.read_csv('../data/imputations/garages.csv')
rescued_habitaciones = pd.read_csv('../data/imputations/habitaciones.csv')
rescued_metroscubiertos = pd.read_csv('../data/imputations/metroscubiertos.csv')
rescued_metrostotales = pd.read_csv('../data/imputations/metrostotales.csv')

In [6]:
# mergeamos con coords. extra obtenidas en tp1.
train = train.merge(rescued_coords.drop('Unnamed: 0', axis=1), how='left', on='id')
train['lat_x'] = train.apply(lambda x: x['lat_y'] if pd.isna(x['lat_x']) else x['lat_x'], axis=1)
train['lng_x'] = train.apply(lambda x: x['lng_y'] if pd.isna(x['lng_x']) else x['lng_x'], axis=1)
train.drop(['lat_y','lng_y'], axis=1, inplace=True)
train.rename(columns={'lat_x':'lat','lng_x':'lon'}, inplace=True)

# por consistencia, para que ambos datasets tengan mismos nombres
test.rename(columns={'lng':'lon'}, inplace=True)

In [7]:
# Nan para los datos fuera de rango es mejor que dropear todo el dato
train.loc[(train['lat']>14) | (train['lat']<33),['lat','lon']] = np.nan
train.loc[(train['lon']>86) | (train['lon']<118),['lat','lon']] = np.nan

In [8]:
# inf. values don't make sense. I think it's preferable to treat them as nans directly.
train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [9]:
test.loc[(train['lat']>14) | (train['lat']<33),['lat','lon']] = np.nan
test.loc[(train['lon']>86) | (train['lon']<118),['lat','lon']] = np.nan

In [10]:
test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [11]:
def fillna_with_models_predictions(df, predictions_df, col_name):
    indicadora_name = "tiene_" + col_name
    df[indicadora_name] = df[col_name].notna()
    
    df = df.merge(predictions_df, how='left', on='id')
    original_col = col_name + "_x"
    filler_col = col_name + "_y"
    df[col_name] = df.apply(lambda x: x[filler_col] if pd.isna(x[original_col]) else x[original_col], axis=1)
    df.drop([original_col,filler_col], axis=1, inplace=True)
    
    return df

In [12]:
def fill_na_values(df):
    df = fillna_with_models_predictions(df, rescued_antiguedad, 'antiguedad')
    df = fillna_with_models_predictions(df, rescued_banos, 'banos')
    df = fillna_with_models_predictions(df, rescued_garages, 'garages')
    df = fillna_with_models_predictions(df, rescued_habitaciones, 'habitaciones')
    df = fillna_with_models_predictions(df, rescued_metroscubiertos, 'metroscubiertos')
    df = fillna_with_models_predictions(df, rescued_metrostotales, 'metrostotales')
    return df

In [13]:
from multiprocessing import  Pool

def parallelize_dataframe(df, func, n_cores):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [14]:
train = parallelize_dataframe(train, fill_na_values, 8)
test = parallelize_dataframe(test, fill_na_values, 8)

# Agregamos features que ya hemos creado para analisis de tp1

In [15]:
def contar_amenities(row):
    return row['gimnasio'] + row['usosmultiples'] + row['piscina'] + row['escuelascercanas'] + row['centroscomercialescercanos']

In [16]:
train['cant_amenities'] = train.apply(lambda x: contar_amenities(x), axis=1)
test['cant_amenities'] = test.apply(lambda x: contar_amenities(x), axis=1)

In [17]:
def feature_fechas(df):
    # Para entender lo de los senos y cosenos: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
    df['year'] = df['fecha'].dt.year
    df['month'] = df['fecha'].dt.month
    df['day'] = df['fecha'].dt.day
    df['sin_month'] = np.sin(2*np.pi*df['month']/12)
    df['cos_month'] = np.cos(2*np.pi*df['month']/12)
    # tomo cant. de dias en mes: 31 en todos los casos. Para esto deberia servir bastante bien igual.
    df['sin_day'] = np.sin(2*np.pi*df['day']/31)
    df['cos_day'] = np.cos(2*np.pi*df['day']/31)
    
    # no necesito mas las cols. originales de month y day.
    df.drop(['month','day'], axis=1, inplace=True)
    
feature_fechas(train)
feature_fechas(test)

In [18]:
palabras_avenida = ['avenida', 'av']
train['es_avenida'] = train['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))
test['es_avenida'] = test['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))

# * Fin agregado de features de tp1 *

El sample submission no tiene header. **Ojo con eso al guardar la submission.** Hagamos la funcion para guardar submissions ahora, para evitar problemas a futuro y despreocuparnos.

In [19]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [20]:
# Define a seed, so all algorithms that accept a seed, take the same, for consistency reasons,
# so everything can be replicated without problems random state
seed=42

In [21]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

***

In [23]:
X_train.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,idzona,lat,lon,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,cant_comodidades_en_desc,cant_palabras_positivas,cant_areas_dedicadas,cant_areas_verdes,cant_areas_entretenimiento_cerca,cant_lugares_cerca,planta_alta,planta_baja,tiene_bodega,oficina,cerca_o_en_esquina,cerca_o_en_avenida,comercial,tiene_servicio,edificio,casa,parte_de_lote,calle_cerrada,indica_frente_y_fondo,usa_easybroker,tiene_seguridad,tiene_antiguedad,antiguedad,tiene_banos,banos,tiene_garages,garages,tiene_habitaciones,habitaciones,tiene_metroscubiertos,metroscubiertos,tiene_metrostotales,metrostotales,cant_amenities,year,sin_month,cos_month,sin_day,cos_day,es_avenida
6835,88522,tres lagos moderno departamento,<p>funcional departamento en un nuevo concepto...,Apartamento,"Edif. Bruselas, Laguna de Mayrán No. 166, Tres...",Miguel Hidalgo,Distrito Federal,,,,2013-10-11,True,True,True,True,True,4.0,4.0,3.0,1.0,0.0,0.0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True,8.0,True,2.0,True,2.0,True,2.0,True,80.0,True,98.0,5,2013,-0.866025,0.5,0.790776,-0.612106,False
3906,45078,venta casa ciudad brisa remodelada cerca escue...,hermosa casa remodelada con muy buenos espacio...,Casa,CIRCUNVALACION PONIENTE 43,Naucalpan de Juárez,Edo. de México,55904.0,,,2015-09-29,False,False,False,True,True,4.0,4.0,3.0,1.0,0.0,0.0,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,10.0,True,3.0,True,3.0,True,4.0,True,290.0,True,290.0,2,2015,-1.0,-1.83697e-16,-0.394356,0.918958,False
615,37930,quinta en venta en monte bello,"id:45947, hermosa quinta con casa de 100 mts2 ...",Quinta Vacacional,,Juárez,Nuevo León,,,,2016-04-24,False,False,False,False,False,1.0,1.0,0.0,0.0,0.0,0.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,7.7196,True,1.0,False,0.134836,True,2.0,True,235.0,False,235.285333,0,2016,0.866025,-0.5,-0.988468,0.151428,False
17141,128563,"bonita casa, dos niveles, 3 rec., 3 est., 20 m...","bonita casa en dos plantas, en planta baja, 3 ...",Casa,-,Tlalnepantla de Baz,Edo. de México,56224.0,,,2015-10-20,False,False,False,True,True,6.0,4.0,4.0,1.0,0.0,0.0,False,True,True,True,False,False,False,False,False,True,False,False,False,False,False,True,20.0,True,2.0,True,3.0,True,3.0,True,270.0,False,220.016157,2,2015,-0.866025,0.5,-0.790776,-0.612106,False
19102,152060,"casa en venta en nuestra señora de fatima, sal...",<p>fracc. fatima residencial ubicado en car.. ...,Casa,Santa Barbara 155,Saltillo,Coahuila,5864.0,,,2013-09-25,False,False,False,True,True,5.0,0.0,3.0,0.0,0.0,0.0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,15.0,True,2.0,True,2.0,True,3.0,True,185.0,True,200.0,2,2013,-1.0,-1.83697e-16,-0.937752,0.347305,False


# Approach 1 - XGBoost con CV solo features de matriz de correlacion(Grid Search)

- https://towardsdatascience.com/feature-selection-for-machine-learning-1-2-1597d9ccb54a 
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [35]:
X_train = X_train[['gimnasio','usosmultiples','piscina','cant_palabras_positivas','cant_areas_verdes','tiene_bodega',\
                'tiene_servicio','tiene_seguridad','banos','garages','habitaciones','metroscubiertos','metrostotales','year',\
                ]]

In [37]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {  
    "n_estimators": [200],
    "max_depth": [20,22,25],
    "learning_rate": [0.06], 
    "colsample_bytree": [1],
    "subsample": [0.946934], 
    "gamma":[30],
    'reg_alpha': [10],
    "min_child_weight": [11]
}

regXGB = xgb.XGBRegressor(objective ='reg:squarederror',nthread=-1) 

regXGBwithCV = GridSearchCV(regXGB, params, n_jobs=-1,verbose=10,cv=3) # n_iters es la cant de veces que busca, 10 es lo default

regXGBwithCV.fit(X_train, y_train, eval_metric="rmse")

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  2.1min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  2.1min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  2.2min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  2.2min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  2.3min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  2.4min remaining:   41.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.3min finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=-1,
                                    objective='reg:squarederror',
                                    random_stat...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [1], 'gamma': [30],
                         'learning_rate': [0.06], 'max_depth': [20, 22, 25],
      

In [38]:
print(regXGBwithCV.best_score_)
print(regXGBwithCV.best_params_)

0.568280850314891
{'colsample_bytree': 1, 'gamma': 30, 'learning_rate': 0.06, 'max_depth': 20, 'min_child_weight': 11, 'n_estimators': 200, 'reg_alpha': 10, 'subsample': 0.946934}


# Approach 1 - XGBoost con CV solo features de BoostARoota (Grid Search)

In [23]:
X_train = X_train[['gimnasio','usosmultiples','centroscomercialescercanos','cant_palabras_positivas','cant_areas_dedicadas',\
                   'cant_areas_verdes','cant_areas_entretenimiento_cerca','planta_alta','planta_baja','tiene_bodega',\
                   'comercial','tiene_servicio','edificio','casa','usa_easybroker','tiene_seguridad','tiene_antiguedad',\
                   'antiguedad','tiene_banos','banos','tiene_garages','garages','tiene_habitaciones','habitaciones',\
                   'tiene_metroscubiertos','metroscubiertos','tiene_metrostotales','metrostotales','cant_amenities','year',\
                   'sin_month','sin_day'
                ]]

In [24]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {  
    "n_estimators": [200],
    "max_depth": [20,22,25],
    "learning_rate": [0.06], 
    "colsample_bytree": [1],
    "subsample": [0.946934], 
    "gamma":[30],
    'reg_alpha': [10],
    "min_child_weight": [11]
}

regXGB = xgb.XGBRegressor(objective ='reg:squarederror',nthread=-1) 

regXGBwithCV = GridSearchCV(regXGB, params, n_jobs=-1,verbose=10,cv=3) # n_iters es la cant de veces que busca, 10 es lo default

regXGBwithCV.fit(X_train, y_train, eval_metric="rmse")

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  3.8min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  3.8min remaining:  7.7min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  4.1min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  4.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  4.2min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  4.5min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.6min finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=-1,
                                    objective='reg:squarederror',
                                    random_stat...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [1], 'gamma': [30],
                         'learning_rate': [0.06], 'max_depth': [20, 22, 25],
      

In [25]:
print(regXGBwithCV.best_score_)
print(regXGBwithCV.best_params_)

0.6387103697028009
{'colsample_bytree': 1, 'gamma': 30, 'learning_rate': 0.06, 'max_depth': 20, 'min_child_weight': 11, 'n_estimators': 200, 'reg_alpha': 10, 'subsample': 0.946934}
