In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import lightgbm as lgb
from funs import rmse, clip20, clip40
import pickle
from sklearn.metrics import mean_squared_error

In [8]:
X_train = pd.read_parquet('../data/X_train.parquet')
X_val = pd.read_parquet('../data/X_val.parquet')
y_train = np.load('../data/y_train.npy')
y_val = np.load('../data/y_val.npy')

In [9]:
X_train.shape

(2389620, 16)

In [10]:
def rmse(*args):
    
    """ Funcion that calculates the root mean squared error"""
    return np.sqrt(mean_squared_error(*args))

def clip20(x):
    return np.clip(x, 0, 20)

def clip40(x):
    return np.clip(x, 0, 20)

### 1) LGBM

In [14]:
learning_rates = [0.03, 0.04, 0.05, 0.06, 0.07]
best_rmse = 9999999999999
for lr in learning_rates:
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=clip40(y_train)), int(100 * (lr / 0.03)))
    pred_lgb_val = lgb_model.predict(X_val)
    score = rmse(clip20(y_val), clip20(pred_lgb_val))

    if score < best_rmse:
        best_rmse = score
        best_lr = lr
        best_lgb = lgb_model

In [15]:
best_lr

0.04

In [16]:
# We train the best model with all the data.

X = X_train.append(X_val)
y = np.append(y_train, y_val)

In [17]:
best_lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': best_lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }
best_lgb = lgb.train(lgb_params, lgb.Dataset(X, label=clip40(y)), int(100 * (lr / 0.03)))

In [18]:
filename = '../Example_Kernels/best_lgb.sav'
pickle.dump(best_lgb, open(filename, 'wb'))

### 3) Random Forest

In [19]:
X = X_train.append(X_val)
Y = np.concatenate([y_train, y_val])
train_ind=np.zeros(X.shape[0])
for i in range(0, len(X_train)):
    train_ind[i] = -1
ps = PredefinedSplit(test_fold=(train_ind))

In [25]:
param_grid={'bootstrap':[0.7, 0.8], 'max_features':[4, 6, 8], 
            'max_depth' : [2, 4, 6, 8]}
gs = GridSearchCV(cv = ps, 
                  estimator = RandomForestRegressor(n_estimators=300, n_jobs=-1), 
                  param_grid=param_grid, scoring='neg_mean_squared_error')


In [30]:
gs.fit(X, clip40(Y))
best_rf = gs.best_estimator_
best_rf

RandomForestRegressor(bootstrap=0.8, criterion='mse', max_depth=8,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [31]:
best_rf.fit(X, clip40(y))

RandomForestRegressor(bootstrap=0.8, criterion='mse', max_depth=8,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [32]:
filename = '../Example_Kernels/best_rf.sav'
pickle.dump(best_rf, open(filename, 'wb'))