In [1]:
import os
import sys
import warnings

import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn import tree

from utils import MySet

from utils import local_data
from utils import window
from utils import Scale, give_error
from utils import generate_and_avaliate_model

from utils import location_station, find_set_sunrise, find_set_sunset

%matplotlib inline
warnings.filterwarnings('ignore')

latter_size = 14
plt.rcParams['legend.fontsize'] = latter_size 
plt.rcParams['font.size'] = latter_size 
plt.rcParams['axes.labelsize'] = latter_size
plt.rcParams['xtick.labelsize'] = latter_size
plt.rcParams['ytick.labelsize'] = latter_size

In [2]:
df = pd.read_pickle('./data/sj2_analise_update.pkl')

In [3]:
df.columns

Index(['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2',
       'gvtec2_dt', 'state_night', 'state_dawn', 'vm1', 'vd1', 'vm2', 'vd2',
       'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20', 'vtec_dt_lag_3', 's4'],
      dtype='object')

In [4]:
original = MySet('original', ['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2', 'gvtec2_dt'])
tempo  = MySet('tempo', ['state_night', 'state_dawn'])
mdv1 = MySet('mdv1', ['vm1', 'vd1'])
mdv2 = MySet('mdv2', ['vm2', 'vd2'])
lag = MySet('lag', ['gvtec1_dt_lag_9', 'gvtec2_dt_lag_20'])

vtec = MySet('vtec', ['vtec'])
vtec_dt = MySet('vtec_dt', ['vtec_dt'])
vtec_dt2 = MySet('vtec_dt2', ['vtec_dt2'])
gvtec1 = MySet('gvtec1', ['gvtec1'])
gvtec1_dt = MySet('gvtec1_dt', ['gvtec1_dt'])
gvtec2 = MySet('gvtec2', ['gvtec2'])
gvtec2_dt = MySet('gvtec2_dt', ['gvtec2_dt'])
state_night = MySet('state_night', ['state_night'])
state_dawn = MySet('state_dawn', ['state_dawn'])
vm1 = MySet('vm1', ['vm1'])
vd1 = MySet('vd1', ['vd1'])
vm2 = MySet('vm2', ['vm2'])
vd2 = MySet('vd2', ['vd2'])
gvtec1_dt_lag_9 = MySet('gvtec1_dt_lag_9', ['gvtec1_dt_lag_9'])
gvtec2_dt_lag_20 = MySet('gvtec2_dt_lag_20', ['gvtec2_dt_lag_20'])

In [5]:
set_test = vtec+tempo+lag+mdv1+mdv2

In [6]:
instances_set = list(set_test.set)

In [7]:
model = RandomForestRegressor

# define parameters for grid_search
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 180, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = {'model__n_estimators': n_estimators,
              'model__max_features': max_features,
              'model__max_depth': max_depth,
              'model__min_samples_split': min_samples_split,
              'model__min_samples_leaf': min_samples_leaf,
              'model__bootstrap': bootstrap}

# select data
X = df[instances_set].values
y = df['s4'].values

size = len(X)
last_element = size - size//10

X_train = X[0:last_element]
y_train = y[0:last_element] 
       
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('model', model()))
pipeline = Pipeline(estimators)
    
clf = GridSearchCV(estimator=pipeline,
                   param_grid=param_grid,
                   cv=10,
                   verbose=2,
                   n_jobs=-1,
                   scoring='neg_mean_squared_error')
      
clf.fit(X_train, y_train)

Fitting 10 folds for each of 4320 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 27.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 66.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 101.6min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 122.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 156.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 192.5min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 229.2min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 269.8min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 320.9min
[Parallel(n_jobs=-1)]: Done 9097 tasks  

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impur...='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'model__max_features': ['auto', 'sqrt'], 'model__max_depth': [10, 27, 44, 61, 78, 95, 112, 129, 146, 163, 180, None], 'model__min_samples_split': [2, 5, 10], 'model__min_samples_leaf': [1, 2, 4], 'model__bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=2)

In [9]:
best_parameters = clf.best_estimator_.get_params()
print(type(clf.best_params_))
print(clf.best_params_)

<class 'dict'>
{'model__bootstrap': True, 'model__max_depth': 146, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 200}


In [18]:
best_parameters_model = best_parameters['model'].get_params()


In [19]:
RandomForestRegressor(**best_parameters['model'].get_params())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=146,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [21]:
import pickle
with open("./data/best_parameters_model_rf_complete_fase1.pkl","wb") as file:
    pickle.dump(best_parameters_model, file)