In [1]:
import pandas as pd
import numpy as np

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 0.6, freq = 200):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [4]:
train = pd.read_csv('../data/train_with_desc_full.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])

In [5]:
test = pd.read_csv('../data/test_with_desc_full.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])

In [6]:
rescued_coords = pd.read_csv('../data/rescueLatLongs.csv')
rescued_antiguedad = pd.read_csv('../data/imputations/antiguedad.csv')
rescued_banos = pd.read_csv('../data/imputations/banos.csv')
rescued_garages = pd.read_csv('../data/imputations/garages.csv')
rescued_habitaciones = pd.read_csv('../data/imputations/habitaciones.csv')
rescued_metroscubiertos = pd.read_csv('../data/imputations/metroscubiertos.csv')
rescued_metrostotales = pd.read_csv('../data/imputations/metrostotales.csv')

In [7]:
# mergeamos con coords. extra obtenidas en tp1.
train = train.merge(rescued_coords.drop('Unnamed: 0', axis=1), how='left', on='id')
train['lat_x'] = train.apply(lambda x: x['lat_y'] if pd.isna(x['lat_x']) else x['lat_x'], axis=1)
train['lng_x'] = train.apply(lambda x: x['lng_y'] if pd.isna(x['lng_x']) else x['lng_x'], axis=1)
train.drop(['lat_y','lng_y'], axis=1, inplace=True)
train.rename(columns={'lat_x':'lat','lng_x':'lon'}, inplace=True)

# por consistencia, para que ambos datasets tengan mismos nombres
test.rename(columns={'lng':'lon'}, inplace=True)

In [8]:
# Nan para los datos fuera de rango es mejor que dropear todo el dato
train.loc[(train['lat']>14) | (train['lat']<33),['lat','lon']] = np.nan
train.loc[(train['lon']>86) | (train['lon']<118),['lat','lon']] = np.nan

In [9]:
# inf. values don't make sense. I think it's preferable to treat them as nans directly.
train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [10]:
test.loc[(train['lat']>14) | (train['lat']<33),['lat','lon']] = np.nan
test.loc[(train['lon']>86) | (train['lon']<118),['lat','lon']] = np.nan

In [11]:
test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [12]:
def fillna_with_models_predictions(df, predictions_df, col_name):
    indicadora_name = "tiene_" + col_name
    df[indicadora_name] = df[col_name].notna()
    
    df = df.merge(predictions_df, how='left', on='id')
    original_col = col_name + "_x"
    filler_col = col_name + "_y"
    df[col_name] = df.apply(lambda x: x[filler_col] if pd.isna(x[original_col]) else x[original_col], axis=1)
    df.drop([original_col,filler_col], axis=1, inplace=True)
    
    return df

In [13]:
def fill_na_values(df):
    df = fillna_with_models_predictions(df, rescued_antiguedad, 'antiguedad')
    df = fillna_with_models_predictions(df, rescued_banos, 'banos')
    df = fillna_with_models_predictions(df, rescued_garages, 'garages')
    df = fillna_with_models_predictions(df, rescued_habitaciones, 'habitaciones')
    df = fillna_with_models_predictions(df, rescued_metroscubiertos, 'metroscubiertos')
    df = fillna_with_models_predictions(df, rescued_metrostotales, 'metrostotales')
    return df

In [14]:
from multiprocessing import  Pool

def parallelize_dataframe(df, func, n_cores):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [15]:
train = parallelize_dataframe(train, fill_na_values, 8)
test = parallelize_dataframe(test, fill_na_values, 8)

# Agregamos features que ya hemos creado para analisis de tp1

In [16]:
def contar_amenities(row):
    return row['gimnasio'] + row['usosmultiples'] + row['piscina'] + row['escuelascercanas'] + row['centroscomercialescercanos']

In [17]:
train['cant_amenities'] = train.apply(lambda x: contar_amenities(x), axis=1)
test['cant_amenities'] = test.apply(lambda x: contar_amenities(x), axis=1)

In [18]:
def feature_fechas(df):
    # Para entender lo de los senos y cosenos: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
    df['year'] = df['fecha'].dt.year
    df['month'] = df['fecha'].dt.month
    df['day'] = df['fecha'].dt.day
    df['sin_month'] = np.sin(2*np.pi*df['month']/12)
    df['cos_month'] = np.cos(2*np.pi*df['month']/12)
    # tomo cant. de dias en mes: 31 en todos los casos. Para esto deberia servir bastante bien igual.
    df['sin_day'] = np.sin(2*np.pi*df['day']/31)
    df['cos_day'] = np.cos(2*np.pi*df['day']/31)
    
    # no necesito mas las cols. originales de month y day.
    df.drop(['month','day'], axis=1, inplace=True)
    
feature_fechas(train)
feature_fechas(test)

In [19]:
palabras_avenida = ['avenida', 'av']
train['es_avenida'] = train['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))
test['es_avenida'] = test['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))

# * Fin agregado de features de tp1 *

El sample submission no tiene header. **Ojo con eso al guardar la submission.** Hagamos la funcion para guardar submissions ahora, para evitar problemas a futuro y despreocuparnos.

In [20]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

***

In [21]:
#del X, y, X_train, X_test, y_train,y_test

In [22]:
# Define a seed, so all algorithms that accept a seed, take the same, for consistency reasons,
# so everything can be replicated without problems random state
seed=42

In [32]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [34]:
beep()

***

# Approach 1.1 - XGBoost con CV basico solo features de matriz de correlacion(Grid Search)

- https://towardsdatascience.com/feature-selection-for-machine-learning-1-2-1597d9ccb54a 
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [35]:
X_train = X_train[['gimnasio','usosmultiples','piscina','cant_palabras_positivas','cant_areas_verdes','tiene_bodega',\
                'tiene_servicio','tiene_seguridad','banos','garages','habitaciones','metroscubiertos','metrostotales','year',\
                ]]

In [37]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {  
    "n_estimators": [200],
    "max_depth": [20,22,25],
    "learning_rate": [0.06], 
    "colsample_bytree": [1],
    "subsample": [0.946934], 
    "gamma":[30],
    'reg_alpha': [10],
    "min_child_weight": [11]
}

regXGB = xgb.XGBRegressor(objective ='reg:squarederror',nthread=-1) 

regXGBwithCV = GridSearchCV(regXGB, params, n_jobs=-1,verbose=10,cv=3) # n_iters es la cant de veces que busca, 10 es lo default

regXGBwithCV.fit(X_train, y_train, eval_metric="rmse")

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  2.1min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  2.1min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  2.2min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  2.2min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  2.3min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  2.4min remaining:   41.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.3min finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=-1,
                                    objective='reg:squarederror',
                                    random_stat...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [1], 'gamma': [30],
                         'learning_rate': [0.06], 'max_depth': [20, 22, 25],
      

In [38]:
print(regXGBwithCV.best_score_)
print(regXGBwithCV.best_params_)

0.568280850314891
{'colsample_bytree': 1, 'gamma': 30, 'learning_rate': 0.06, 'max_depth': 20, 'min_child_weight': 11, 'n_estimators': 200, 'reg_alpha': 10, 'subsample': 0.946934}


 No sirve

# Approach 1.2 - XGBoost con CV basico solo features de BoostARoota (Grid Search)

- https://github.com/chasedehan/BoostARoota

In [23]:
X_train = X_train[['gimnasio','usosmultiples','centroscomercialescercanos','cant_palabras_positivas','cant_areas_dedicadas',\
                   'cant_areas_verdes','cant_areas_entretenimiento_cerca','planta_alta','planta_baja','tiene_bodega',\
                   'comercial','tiene_servicio','edificio','casa','usa_easybroker','tiene_seguridad','tiene_antiguedad',\
                   'antiguedad','tiene_banos','banos','tiene_garages','garages','tiene_habitaciones','habitaciones',\
                   'tiene_metroscubiertos','metroscubiertos','tiene_metrostotales','metrostotales','cant_amenities','year',\
                   'sin_month','sin_day'
                ]]

In [24]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {  
    "n_estimators": [200],
    "max_depth": [20,22,25],
    "learning_rate": [0.06], 
    "colsample_bytree": [1],
    "subsample": [0.946934], 
    "gamma":[30],
    'reg_alpha': [10],
    "min_child_weight": [11]
}

regXGB = xgb.XGBRegressor(objective ='reg:squarederror',nthread=-1) 

regXGBwithCV = GridSearchCV(regXGB, params, n_jobs=-1,verbose=10,cv=3) # n_iters es la cant de veces que busca, 10 es lo default

regXGBwithCV.fit(X_train, y_train, eval_metric="rmse")

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  3.8min remaining: 13.4min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  3.8min remaining:  7.7min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  4.1min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  4.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  4.2min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:  4.5min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.6min finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=-1,
                                    objective='reg:squarederror',
                                    random_stat...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [1], 'gamma': [30],
                         'learning_rate': [0.06], 'max_depth': [20, 22, 25],
      

In [25]:
print(regXGBwithCV.best_score_)
print(regXGBwithCV.best_params_)

0.6387103697028009
{'colsample_bytree': 1, 'gamma': 30, 'learning_rate': 0.06, 'max_depth': 20, 'min_child_weight': 11, 'n_estimators': 200, 'reg_alpha': 10, 'subsample': 0.946934}


No sirve

# Approach 1.3 - XGBoost con CV y todos los features (categorical encoding 1 - label encoding) (Random Search)

- https://medium.com/@songxia.sophia/two-machine-learning-algorithms-to-predict-xgboost-neural-network-with-entity-embedding-caac68717dea

In [55]:
import re
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [71]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [72]:
encode_columns = ['ciudad', 'tipodepropiedad', 'provincia']

In [73]:
encode_df = X[encode_columns]
encode_df = encode_df.astype('str')
encode_df = encode_df.apply(LabelEncoder().fit_transform)

In [74]:
score_encode_drop = X.drop(encode_columns, axis = 1)
score_encode = pd.concat([score_encode_drop, encode_df], axis = 1)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(score_encode, y, test_size=0.33, random_state=seed)

In [76]:
X_train = X_train.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1)

In [77]:
params = {"colsample_bytree": uniform(0.7, 0.3),
          "gamma": uniform(0, 0.5),
          "learning_rate": uniform(0.003, 0.3), # default 0.1 
          "max_depth": randint(2, 6), # default 3
          "n_estimators": randint(100, 250), # default 100
          "subsample": uniform(0.6, 0.4)}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=seed)

time_split = TimeSeriesSplit(n_splits = 8)

xgb_search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=seed,\
                                n_iter=4, cv=time_split, verbose=1, n_jobs=-1, return_train_score=True)

In [78]:
%%time
xgb_search.fit(X_train, y_train, eval_metric="rmse")

Fitting 8 folds for each of 4 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  2.1min finished
  if getattr(data, 'base', None) is not None and \


CPU times: user 36.9 s, sys: 324 ms, total: 37.2 s
Wall time: 2min 40s


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=8),
                   error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f27bfc9f828>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f27c9150e48>,
  

In [79]:
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.7770412558172045
{'colsample_bytree': 0.7467983561008608, 'gamma': 0.02904180608409973, 'learning_rate': 0.2628528437324805, 'max_depth': 5, 'n_estimators': 203, 'subsample': 0.8832290311184181}


In [80]:
X_test = X_test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1)

In [81]:
y_pred = xgb_search.predict(X_test)
rms = sqrt(mean_squared_error(y_test, y_pred))
print ('RMSE:', rms)

RMSE: 977271.2515415936


In [82]:
beep()

## Entrenamiento con todos los datos para obtener predicciones a subir

In [83]:
X = train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio'], axis=1) #set de datos
y = train['precio'] #target

In [84]:
encode_columns = ['ciudad', 'tipodepropiedad', 'provincia']

In [85]:
encode_df = X[encode_columns]
encode_df = encode_df.astype('str')
encode_df = encode_df.apply(LabelEncoder().fit_transform)

In [86]:
score_encode_drop = X.drop(encode_columns, axis = 1)
score_encode = pd.concat([score_encode_drop, encode_df], axis = 1)

In [88]:
%%time
#Entrenar modelo
model = xgb.XGBRegressor(colsample_bytree=0.7467983561008608,gamma=0.02904180608409973,learning_rate=0.2628528437324805,max_depth=5,n_estimators=203,\
                 subsample=0.8832290311184181,nthread=-1,objective ='reg:squarederror')

model.fit(score_encode, y,eval_metric="rmse", verbose=True)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


CPU times: user 56 s, sys: 240 ms, total: 56.2 s
Wall time: 56.2 s


In [97]:
test_to_encode = test.drop(['id','fecha','titulo', 'descripcion', 'direccion'], axis=1)

In [98]:
encode_df_test = test_to_encode[encode_columns]
encode_df_test = encode_df_test.astype('str')
encode_df_test = encode_df_test.apply(LabelEncoder().fit_transform)

In [91]:
score_encode_drop_test = test_to_encode.drop(encode_columns, axis = 1)
score_encode_test = pd.concat([score_encode_drop_test, encode_df_test], axis = 1)

In [92]:
pred = model.predict(score_encode_test)

In [94]:
df = pd.DataFrame(data={'id':test['id'], 'target':pred})

In [95]:
description = "1st approach_full_features. XGBoost"
save_submission(df, description=description)

In [96]:
beep()

***

# Approach 1.4 - XGBoost con CV y todos los features (categorical encoding 2) (Random Search)

- https://medium.com/@songxia.sophia/two-machine-learning-algorithms-to-predict-xgboost-neural-network-with-entity-embedding-caac68717dea

In [84]:
import re
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV

In [85]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [86]:
onehot_columns = ['ciudad', 'tipodepropiedad', 'provincia']

In [87]:
onehot_df = pd.get_dummies(X, columns = onehot_columns)
score_onehot_drop = X.drop(onehot_columns, axis = 1)
score_onehot = pd.concat([score_onehot_drop, onehot_df], axis = 1)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(score_onehot, y, test_size=0.33, random_state=seed)

In [89]:
X_train = X_train.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1)

In [90]:
params = {"colsample_bytree": uniform(0.7, 0.3),
          "gamma": uniform(0, 0.5),
          "learning_rate": uniform(0.003, 0.3), # default 0.1 
          "max_depth": randint(2, 6), # default 3
          "n_estimators": randint(100, 250), # default 100
          "subsample": uniform(0.6, 0.4)}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=seed)

time_split = TimeSeriesSplit(n_splits = 8)

xgb_search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=seed,\
                                n_iter=4, cv=time_split, verbose=1, n_jobs=-1, return_train_score=True)

In [91]:
X_train = X_train.loc[:,~X_train.columns.duplicated()]

In [159]:
%%time
xgb_search.fit(X_train, y_train, eval_metric="rmse")

Fitting 8 folds for each of 4 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

## One hot cuelga la pc con las dimensiones, habra que achicarlas

Basado en el approach 2 de Matias en first approaches by rozanecm uso lo siguiente:

In [27]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [32]:
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV

params = {"colsample_bytree": uniform(0.7, 0.3),
          "gamma": uniform(0, 0.5),
          "learning_rate": uniform(0.003, 0.3), # default 0.1 
          "max_depth": randint(2, 6), # default 3
          "n_estimators": randint(100, 250), # default 100
          "subsample": uniform(0.6, 0.4)}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=seed)

time_split = TimeSeriesSplit(n_splits = 8)

xgb_search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=seed,\
                                n_iter=4, cv=time_split, verbose=1, n_jobs=-1, return_train_score=True)

In [34]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_random_search", xgb_search))

my_pipe = Pipeline(steps, verbose=True)

In [35]:
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.8s
Fitting 8 folds for each of 4 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  3.4min finished
  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_random_search, total= 4.5min
610214.3501041636


In [36]:
beep()

## Entrenamiento con todos los datos para obtener predicciones a subir

In [107]:
del X_train
del X_test
del y_train
del y_test

In [108]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   3.0s
Fitting 8 folds for each of 4 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  4.4min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[Pipeline]  (step 2 of 2) Processing xgboost_random_search, total= 5.3min


In [109]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [110]:
description = "1st approach_full_features. XGBoost con one hot"
save_submission(df, description=description)

In [111]:
beep()

## Obtengamos predicciones para todas las propiedades en nuestro train set

In [23]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "rozanecm_approach_2"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer, index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   5.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.9min
613617.4667506837
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   7.9s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 3.1min
619893.2199777354
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   6.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.9min
611674.0503883368


In [27]:
beep()

***

# Approach 1.5 - XGBoost parameter tunning (Grid Search)

- https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [38]:
import re
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV

In [39]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

## Tunning max depth y min child weight

In [54]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=140, max_depth=5,
                             min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1, seed=27)

gsearch1 = GridSearchCV(estimator = xgb_model,param_grid = param_test1,n_jobs=-1,iid=False, cv=5)

In [55]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search1", gsearch1))

my_pipe = Pipeline(steps, verbose=True)

In [56]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  11.0s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_grid_search1, total=17.1min
573755.0267781329
CPU times: user 1min 44s, sys: 1.07 s, total: 1min 45s
Wall time: 17min 23s


In [60]:
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 9, 'min_child_weight': 5}, 0.8073939828312875)

In [61]:
beep()

Tunning mas preciso sobre este tunning

In [62]:
param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[4,5,6]
}

gsearch2 = GridSearchCV(estimator = xgb_model,param_grid = param_test2, n_jobs=-1,iid=False, cv=5)

In [63]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search1", gsearch2))

my_pipe = Pipeline(steps, verbose=True)

In [64]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   9.6s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_grid_search1, total=19.6min
CPU times: user 1min 56s, sys: 889 ms, total: 1min 57s
Wall time: 19min 44s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [65]:
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 10, 'min_child_weight': 6}, 0.8090391201801971)

In [66]:
beep()

Valores mayores

In [68]:
param_test2b = {
 'max_depth':[9,10,11,12],
 'min_child_weight':[6,8,10,12]
}

gsearch2b = GridSearchCV(estimator = xgb_model,param_grid = param_test2b, n_jobs=-1,iid=False, cv=5)

In [69]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch2b))

my_pipe = Pipeline(steps, verbose=True)

In [70]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   7.2s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=39.5min
CPU times: user 1min 57s, sys: 1.32 s, total: 1min 58s
Wall time: 39min 38s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [77]:
gsearch2b.best_params_, gsearch2b.best_score_

({'max_depth': 10, 'min_child_weight': 6}, 0.8090391201801971)

In [72]:
beep()

Entonces 10 y 6 son optimos

## Tunning gamma

In [78]:
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}

xgb_model = xgb.XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch3 = GridSearchCV(estimator = xgb_model,param_grid = param_test3, n_jobs=-1,iid=False, cv=5)

In [79]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch3))

my_pipe = Pipeline(steps, verbose=True)

In [80]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   4.8s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=14.0min
CPU times: user 1min 57s, sys: 3.08 s, total: 2min
Wall time: 15min 43s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [81]:
gsearch3.best_params_, gsearch3.best_score_

({'gamma': 0.0}, 0.8090391201801971)

In [82]:
beep()

## Tunning subsample and colsample_bytree

In [94]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

xgb_model = xgb.XGBRegressor( learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch4 = GridSearchCV(estimator = xgb_model,param_grid = param_test4, n_jobs=-1,iid=False, cv=5)

In [95]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch4))

my_pipe = Pipeline(steps, verbose=True)

In [96]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   9.8s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=50.1min
CPU times: user 2min 21s, sys: 1.54 s, total: 2min 22s
Wall time: 50min 19s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [97]:
gsearch4.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.7, 'subsample': 0.9}, 0.8117147682581691)

In [98]:
beep()

Mas preciso

In [99]:
param_test5 = {
 'subsample':[i/100.0 for i in range(80,100,5)],
 'colsample_bytree':[i/100.0 for i in range(65,80,5)]
}

xgb_model = xgb.XGBRegressor( learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch5 = GridSearchCV(estimator = xgb_model,param_grid = param_test5, n_jobs=-1,iid=False, cv=5)

In [100]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch5))

my_pipe = Pipeline(steps, verbose=True)

In [101]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  11.2s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=35.2min
CPU times: user 2min 17s, sys: 1.44 s, total: 2min 19s
Wall time: 35min 26s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [102]:
gsearch5.best_params_, gsearch5.best_score_

({'colsample_bytree': 0.7, 'subsample': 0.95}, 0.8121743836444513)

In [103]:
beep()

## Tunning reg_alpha

In [104]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch6 = GridSearchCV(estimator = xgb_model,param_grid = param_test6, n_jobs=-1,iid=False, cv=5)

In [105]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch6))

my_pipe = Pipeline(steps, verbose=True)

In [106]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   3.2s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=18.6min
CPU times: user 2min 17s, sys: 772 ms, total: 2min 18s
Wall time: 18min 40s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [107]:
gsearch6.best_params_, gsearch6.best_score_

({'reg_alpha': 1}, 0.8123286654703807)

In [108]:
beep()

Mas preciso

In [109]:
param_test7 = {
 'reg_alpha':[0.5, 0.75, 1, 1.25, 1.5, 2]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch7 = GridSearchCV(estimator = xgb_model,param_grid = param_test7, n_jobs=-1,iid=False, cv=5)

In [110]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch7))

my_pipe = Pipeline(steps, verbose=True)

In [111]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  10.0s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=18.2min
CPU times: user 2min 17s, sys: 1.67 s, total: 2min 19s
Wall time: 18min 25s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [112]:
gsearch7.best_params_, gsearch7.best_score_

({'reg_alpha': 1.5}, 0.8123413160129532)

In [113]:
beep()

## Tunning n_estimators

In [116]:
param_test8 = {
 'n_estimators':[100, 150, 200, 250, 300]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch8 = GridSearchCV(estimator = xgb_model,param_grid = param_test8, n_jobs=-1,iid=False, cv=5)

In [117]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch8))

my_pipe = Pipeline(steps, verbose=True)

In [118]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  12.5s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=17.4min
CPU times: user 3min 18s, sys: 1.61 s, total: 3min 20s
Wall time: 17min 42s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [119]:
gsearch8.best_params_, gsearch8.best_score_

({'n_estimators': 300}, 0.8136330892288279)

In [120]:
beep()

In [121]:
param_test9 = {
 'n_estimators':[300, 500, 700, 900, 1200]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch9 = GridSearchCV(estimator = xgb_model,param_grid = param_test9, n_jobs=-1,iid=False, cv=5)

In [122]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch9))

my_pipe = Pipeline(steps, verbose=True)

In [123]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   9.0s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=57.2min
CPU times: user 9min 29s, sys: 1.59 s, total: 9min 31s
Wall time: 57min 24s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [124]:
gsearch9.best_params_, gsearch9.best_score_

({'n_estimators': 900}, 0.8151624288480696)

In [125]:
beep()

## Tunning learning_rate

In [130]:
param_test10 = {
 'learning_rate':[0.001, 0.01, 0.1, 1]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch10 = GridSearchCV(estimator = xgb_model,param_grid = param_test10, n_jobs=-1,iid=False, cv=5)

In [131]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch10))

my_pipe = Pipeline(steps, verbose=True)

In [132]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.9s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=13.2min
CPU times: user 2min 16s, sys: 1.44 s, total: 2min 17s
Wall time: 13min 17s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [133]:
gsearch10.best_params_, gsearch10.best_score_

({'learning_rate': 0.1}, 0.8123413160129532)

In [134]:
beep()

In [135]:
param_test11 = {
 'learning_rate':[0.075, 0.085, 0.095, 0.1, 0.15]
}

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

gsearch11 = GridSearchCV(estimator = xgb_model,param_grid = param_test11, n_jobs=-1,iid=False, cv=5)

In [136]:
steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_grid_search", gsearch11))

my_pipe = Pipeline(steps, verbose=True)

In [137]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   4.1s
[Pipeline]  (step 2 of 2) Processing xgboost_grid_search, total=16.9min
CPU times: user 2min 16s, sys: 1.17 s, total: 2min 18s
Wall time: 17min 2s


Pipeline(memory=None,
         steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('small_cat',
                                                  Pipeline(memory=None,
                                                           steps=[('category_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                   

In [138]:
gsearch11.best_params_, gsearch11.best_score_

({'learning_rate': 0.1}, 0.8123413160129532)

In [139]:
beep()

### Entrenamiento local

In [140]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [142]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

In [143]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_best_params", xgb_model))

my_pipe = Pipeline(steps, verbose=True)

In [144]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   9.0s
[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total= 2.3min
561393.1192913017
CPU times: user 2min 19s, sys: 1.07 s, total: 2min 20s
Wall time: 2min 29s


## Entrenamiento con todos los datos para obtener predicciones a subir

In [145]:
del X_train
del X_test
del y_train
del y_test

In [146]:
my_pipe.fit(X, axis=1).replace({True:1,False:0}), y)

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   3.2s


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total= 3.4min


In [147]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [148]:
description = "1st approach_full_features. XGBoost with tunning parameters"
save_submission(df, description=description)

In [149]:
beep()

## Obtengamos predicciones para todas las propiedades en nuestro train set

In [23]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "rozanecm_approach_2"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer, index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   5.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.9min
613617.4667506837
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   7.9s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 3.1min
619893.2199777354
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   6.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.9min
611674.0503883368


In [27]:
beep()

***

# Approach 1.6 - XGBoost optimum parameters + log precio

In [26]:
train['precio_log'] = np.log(train['precio'])

In [27]:
X = train.drop(['precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [29]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

In [30]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_best_params", xgb_model))

my_pipe = Pipeline(steps, verbose=True)

In [31]:
%%time
# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.6s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total=  32.4s
0.22766361886806274
CPU times: user 4min 16s, sys: 1.62 s, total: 4min 18s
Wall time: 37.6 s


In [32]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

562383.1390865688


# Approach 2: LightGBM optimizado with log price + features desc

## Entrenamiento local

In [None]:
train['precio_log'] = np.log(train['precio'])

In [None]:
X = train.drop(['precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [None]:
import lightgbm as lgb

gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.05, max_depth=75,
                                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                                  n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
                                  random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
                                  subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
gbm_optimized.fit(X_train.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train, sample_weight=None, init_score=None, eval_set=[(X_test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1),y_test)], eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=10,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)

y_scores = gbm_optimized.predict(X_test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

In [None]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

## Entrenamiento con todos los datos para obtener predicciones a subir

In [None]:
gbm_optimized.fit(X.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y, sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

In [None]:
exp_predictions = np.exp(predictions)

In [None]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [None]:
description = "2nd approach. LightGBM previous grid search. Log(precio) y features descripcion"
save_submission(df, description=description)

In [None]:
beep()

# Approach 2.1: LightGBM with log price + features desc (grid search con nuevos features)

## Entrenamiento local

In [35]:
train['precio_log'] = np.log(train['precio'])

In [36]:
X = train.drop(['precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [39]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
gbm = lgb.LGBMRegressor(silent=False)

param_dist = {"boosting_type":['gbdt','dart'],
              "max_depth": [25,50,75],
              "learning_rate" : [0.001,0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [50,100,200],
             }

grid_search = GridSearchCV(gbm, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)
grid_search.fit(X.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y)
grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 64.5min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 67.3min finished


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [40]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

gbm_optimized = grid_search.best_estimator_

In [41]:
gbm_optimized.fit(X_train.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train, sample_weight=None, init_score=None, eval_set=[(X_test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1),y_test)], eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=10,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)

y_scores = gbm_optimized.predict(X_test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.21345376777287184


In [42]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

525219.9818594557


## Entrenamiento con todos los datos para obtener predicciones a subir

In [43]:
gbm_optimized.fit(X.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y, sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [44]:
exp_predictions = np.exp(predictions)

In [45]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [46]:
description = "2.1 approach. LightGBM with new grid search for new features of desc. Log(precio)"
save_submission(df, description=description)

In [47]:
beep()

# Approach 3: Test promedio LightGBM + XGBoost

In [46]:
predictions_xgboost = pd.read_csv('../predictions/2019.11.27 - 01:34:19 by fcozza.csv')
predictions_lightgbm = pd.read_csv('../predictions/2019.11.27 - 02:14:16 by fcozza.csv')

## Promedio las predicciones

In [47]:
pred = pd.merge(predictions_xgboost,predictions_lightgbm,on='id',how='inner')

In [50]:
pred['target'] = (pred['target_x'] + pred['target_y']) / 2

In [52]:
pred = pred [['id','target']]

In [53]:
description = "3rd approach. Promedio entre xgboost y lightgbm"
save_submission(pred,description=description)

## No mejoró