In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift the column limit so that displaying a DataFrame (e.g. with .head())
# shows every column and no column name is elided.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.options.display.max_columns = None
# Uncomment to lift the row limit as well:
# pd.options.display.max_rows = None

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 0.6, freq = 200):
    """Play a sine tone by invoking the external ``play`` command.

    Args:
        duration: Length of the tone in seconds.
        freq: Frequency of the tone in Hz.

    Returns:
        None. The call is best-effort: the exit status of ``play`` is ignored
        and a missing/unlaunchable ``play`` binary does not raise.
    """
    # Imported locally so this helper does not depend on other cells.
    import subprocess

    # Passing an argument list (no shell) means the values are never parsed
    # by a shell. The number formatting matches the original command string.
    command = ['play', '--no-show-progress', '--null', '--channels', '1',
               'synth', '%s' % duration, 'sine', '%f' % freq]
    try:
        subprocess.run(command, check=False)
    except OSError:
        # `play` is not installed or cannot be started: a missing
        # notification sound must never abort the notebook.
        pass

In [4]:
# Column dtypes shared by the train and test CSVs.
# NOTE(review): float16 keeps only about 3 significant decimal digits and
# cannot represent every integer above 2048, so 'idzona', 'lat', 'lng' and the
# surface columns may be rounded on load -- confirm this loss is acceptable.
_csv_dtypes = {
    # categoricals
    'tipodepropiedad': 'category',
    'ciudad': 'category',
    'provincia': 'category',
    # identifier
    'id': 'int32',
    # numerics
    'antiguedad': 'float16',
    'habitaciones': 'float16',
    'garages': 'float16',
    'banos': 'float16',
    'metroscubiertos': 'float16',
    'metrostotales': 'float16',
    'idzona': 'float16',
    'lat': 'float16',
    'lng': 'float16',
    # flags
    'gimnasio': 'bool',
    'usosmultiples': 'bool',
    'piscina': 'bool',
    'escuelascercanas': 'bool',
    'centroscomercialescercanos': 'bool',
}

# Both files are read with the same dtypes, and 'fecha' is parsed as a date.
train = pd.read_csv('../data/train_full_both_feat.csv', dtype=_csv_dtypes, parse_dates=['fecha'])
test = pd.read_csv('../data/test_full_both_feat.csv', dtype=_csv_dtypes, parse_dates=['fecha'])

In [36]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    """Return the submission CSV path: ``../predictions/<timestamp> by <my_name>.csv``."""
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    """Append ``<timestamp>: <submission_description>`` to ``../predictions/<authors_name>.txt``.

    The file is opened in append mode, so earlier entries are kept.
    """
    # `with` guarantees the handle is closed even if write() raises.
    with open("../predictions/" + authors_name + ".txt", "a") as log_file:
        log_file.write(timestamp + ": " + submission_description + '\n')

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    """Write a submission CSV and log its description under ``../predictions``.

    Args:
        submission_df: Object exposing ``to_csv`` (a pandas DataFrame in this
            notebook) holding the predictions to save.
        authors_name: Used in the CSV file name and as the log file name.
        description: Free text appended to the author's log file.
        index: Forwarded to ``to_csv``.
        header: Forwarded to ``to_csv``.

    The ``../predictions`` directory must already exist. The CSV name and the
    log entry share one timestamp so they can be matched later.
    """
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

## Entrenamiento local

In [21]:
# Random seed passed to train_test_split and KFold below so that the splits
# are the same on every run.
seed = 42

In [22]:
# The models are trained on log(precio); predictions are mapped back with
# np.exp before being scored in price units or submitted.
train['precio_log'] = np.log(train['precio'])

In [23]:
# Target: the log-transformed price.
y = train['precio_log']
# Data: every column except the raw price and its log.
X = train.drop(columns=['precio', 'precio_log'])

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for local validation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=seed,
)

In [25]:
import time

In [26]:
%%time
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
gbm = lgb.LGBMRegressor(silent=False)

param_dist = {"boosting_type":['gbdt','dart'],
              "max_depth": [25,50,75],
              "learning_rate" : [0.001,0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [50,100,200],
             }

grid_search = GridSearchCV(gbm, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)
grid_search.fit(X.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y)
grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 57.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 91.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 136.0min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 140.9min finished


CPU times: user 5min 15s, sys: 2.31 s, total: 5min 17s
Wall time: 2h 21min 42s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [27]:
# Audible signal that the long-running grid search cell has finished.
beep()

In [28]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Use the best model found by the grid search.
# NOTE(review): the following cell reassigns gbm_optimized with hard-coded
# hyperparameters, so on a top-to-bottom run this assignment is overwritten.
gbm_optimized = grid_search.best_estimator_

In [29]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Hard-coded LightGBM configuration (presumably so the model can be rebuilt
# without repeating the grid search -- confirm).
# NOTE(review): max_depth is 75 here, whereas the best estimator printed by the
# grid search cell reports max_depth=25 -- confirm which value is intended.
gbm_optimized = lgb.LGBMRegressor(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.05,
    max_depth=75,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=200,
    n_jobs=-1,
    num_leaves=1200,
    objective=None,
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=False,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

In [30]:
from sklearn.metrics import mean_absolute_error

# Columns removed before fitting / predicting.
non_feature_cols = ['id', 'fecha', 'titulo', 'descripcion', 'direccion']
X_train_feats = X_train.drop(columns=non_feature_cols)
X_test_feats = X_test.drop(columns=non_feature_cols)

# Fit on the training split, monitoring MAE on the hold-out split and stopping
# early after 10 rounds without improvement.
gbm_optimized.fit(
    X_train_feats,
    y_train,
    sample_weight=None,
    init_score=None,
    eval_set=[(X_test_feats, y_test)],
    eval_names=None,
    eval_sample_weight=None,
    eval_init_score=None,
    eval_metric='mae',
    early_stopping_rounds=10,
    verbose=False,
    feature_name='auto',
    categorical_feature=['tipodepropiedad', 'ciudad', 'provincia'],
    callbacks=None,
)

# Predict the hold-out rows using the iteration selected by early stopping.
y_scores = gbm_optimized.predict(X_test_feats, num_iteration=gbm_optimized.best_iteration_)

# MAE on the log-price scale.
print(mean_absolute_error(y_test, y_scores))

0.2133081546188964


In [31]:
# Same hold-out error, but with np.exp undoing the log transform on both the
# targets and the predictions, i.e. MAE in the units of 'precio'.
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

525463.7292527702


## Entrenamiento con todos los datos para obtener predicciones a subir

In [32]:
# Refit on the whole training set to produce the model used for the
# submission. There is no validation set here, hence no early stopping.
non_feature_cols = ['id', 'fecha', 'titulo', 'descripcion', 'direccion']
gbm_optimized.fit(
    train.drop(columns=non_feature_cols + ['precio', 'precio_log']),
    train['precio_log'],
    sample_weight=None,
    init_score=None,
    eval_set=None,
    eval_names=None,
    eval_sample_weight=None,
    eval_init_score=None,
    eval_metric='mae',
    early_stopping_rounds=None,
    verbose=False,
    feature_name='auto',
    categorical_feature=['tipodepropiedad', 'ciudad', 'provincia'],
    callbacks=None,
)


# Predict the real (submission) values.
# num_iteration is intentionally left at its default so that every tree of the
# refitted model is used. This fit ran without early stopping, so on LightGBM
# releases that do not reset best_iteration_ between fits, that attribute
# still holds the value from the earlier hold-out fit and passing it here
# could silently truncate the model.
predictions = gbm_optimized.predict(test.drop(columns=non_feature_cols))

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [33]:
# The model predicts log(precio); np.exp maps the predictions back to prices.
exp_predictions = np.exp(predictions)

In [34]:
# Submission frame: the test ids next to their predicted prices ('target').
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [37]:
# Write the submission CSV and append this description to the author's log
# file; both are handled by save_submission.
description = "Testing final approachs. LightGBM previous grid search. Log(precio). Todos los features"
save_submission(df, description=description)

In [None]:
# Audible signal once the preceding cells are done (this cell has no
# execution count in the saved notebook).
beep()

## Obtengamos predicciones para todas las propiedades en nuestro train set

In [38]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Out-of-fold predictions: every training row is predicted by a model that
# was fitted on the other folds, then all predictions are written to one CSV.
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

# UPDATE THIS VALUE
approach_numer = "fcozza_approach_6"

non_feature_cols = ['id', 'fecha', 'titulo', 'descripcion', 'direccion']

# Loop-invariant: computed once instead of twice per fold.
X_full = train.drop(columns=['precio', 'precio_log'])
y_full = train['precio_log']

# Per-fold frames are collected and concatenated once after the loop.
# DataFrame.append copied the accumulated frame on every call and no longer
# exists in pandas 2.x.
fold_frames = []

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    # kf.split yields positional indices, so select rows with .iloc; plain
    # [] on a Series looks up index labels and is only equivalent for a
    # default RangeIndex.
    X_train2, X_test2 = X_full.iloc[train_index], X_full.iloc[test_index]
    y_train2, y_test2 = y_full.iloc[train_index], y_full.iloc[test_index]

    gbm_optimized.fit(X_train2.drop(columns=non_feature_cols), y_train2)
    y_scores = gbm_optimized.predict(X_test2.drop(columns=non_feature_cols))

    # Fold error on the log scale, then on the price scale.
    print(mean_absolute_error(y_test2, y_scores))
    print(mean_absolute_error(np.exp(y_test2), np.exp(y_scores)))

    fold_frames.append(pd.DataFrame(data={'id': X_test2['id'], approach_numer: np.exp(y_scores)}))

df = pd.concat(fold_frames)

# The directory ../predictions/on_train_data must already exist.
df.to_csv("../predictions/on_train_data/" + approach_numer, index=False, header=True)

0.21344034886084887
526546.1138324518
0.21372965050456424
530663.7303829596
0.21377262890394244
526174.405023171


In [None]:
# Audible signal once the K-fold cell above is done (this cell has no
# execution count in the saved notebook).
beep()