In [1]:
import os
import sys
import warnings

import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from utils import local_data
from utils import window
from utils import Scale, give_error
from utils import generate_and_avaliate_model

from utils import location_station, find_set_sunrise, find_set_sunset

%matplotlib inline
warnings.filterwarnings('ignore')

latter_size = 14
plt.rcParams['legend.fontsize'] = latter_size 
plt.rcParams['font.size'] = latter_size 
plt.rcParams['axes.labelsize'] = latter_size
plt.rcParams['xtick.labelsize'] = latter_size
plt.rcParams['ytick.labelsize'] = latter_size

In [None]:
# Read the pickled frame and discard every row that contains a missing value.
df = pd.read_pickle('./data/df_vtec_lags.pkl').dropna()

In [None]:
# Estimator *class* (not an instance): a fresh model is built for every
# lag configuration evaluated below.
model = RandomForestRegressor

# Hyper-parameter grid for the exhaustive search. It does not depend on the
# lag configuration, so it is built once instead of on every iteration.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 1.0 ("use every feature") is what 'auto' meant for regressors; the 'auto'
# alias was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 180, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
param_grid = {'model__n_estimators': n_estimators,
              'model__max_features': max_features,
              'model__max_depth': max_depth,
              'model__min_samples_split': min_samples_split,
              'model__min_samples_leaf': min_samples_leaf,
              'model__bootstrap': bootstrap}

# Target and train/validation split point are the same for every lag set:
# the first 90% of the rows are used for training, the last 10% for validation.
y = df['s4'].values
size = len(y)
last_element = size - size//10
y_train = y[0:last_element]
y_validate = y[last_element:]

errors = []
for n_lag in range(0, 24):
    # Features are the columns lag_0 .. lag_<n_lag>.
    lag_name = 'lag_' + str(n_lag)
    instances_set = ['lag_' + str(k) for k in range(0, n_lag + 1)]
    print(lag_name)

    # select data
    X = df[instances_set].values
    X_train = X[0:last_element]

    # Standardize the features and fit the model inside one pipeline so the
    # scaler is re-fitted on each cross-validation training fold.
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('model', model()))
    pipeline = Pipeline(estimators)

    clf = GridSearchCV(estimator=pipeline,
                       param_grid=param_grid,
                       cv=10,
                       verbose=2,
                       n_jobs=-1,
                       scoring='neg_mean_squared_error')

    clf.fit(X_train, y_train)

    # Scaler fitted on the training rows only; reused for the validation rows.
    X_scaler = StandardScaler()
    X_scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)

    # Rebuild a standalone model from the best hyper-parameters found and fit
    # it on the whole (scaled) training portion.
    best_model_params = clf.best_estimator_.get_params()['model'].get_params()
    print(best_model_params)
    mod = model(**best_model_params)
    mod.fit(X_train_scaled, y_train)

    # use the final model to evaluate the error on the held-out tail of the time series
    X_validate = X_scaler.transform(X[last_element:])

    index = df.index.values[last_element:]
    df_aux = pd.DataFrame(index=index)
    df_aux['predito'] = mod.predict(X_validate)
    df_aux['real'] = y_validate

    print('Error for the time series sample:')
    dict_error = give_error(df_aux['real'].values, df_aux['predito'].values, cut_value=0.2)
    dict_error['name'] = lag_name
    errors.append(dict_error)

    # plot the predicted time series against the real values
    ax = df_aux.plot(figsize=(18, 8))
    plt.xlabel('UT')

    # Mark the sunrise (yellow) and sunset (red) instants returned by the
    # helpers; the last sunset entry is skipped, as in the original analysis.
    lat, long = location_station('sj2')
    set_of_sunrise = find_set_sunrise(df_aux, lat, long)
    set_of_sunset = find_set_sunset(df_aux, lat, long)
    for sunrise in set_of_sunrise:
        ax.axvline(x=sunrise, color='y')
    for sunset in set_of_sunset[0:-1]:
        ax.axvline(x=sunset, color='r')

    plt.tight_layout()
    plt.show()

lag_0
Fitting 10 folds for each of 4320 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 41.8min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 88.5min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 120.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 156.7min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 197.2min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 240.6min


In [None]:
# Collect the per-lag error dictionaries into one table indexed by lag name.
# NOTE(review): this rebinds `df`, which earlier held the input dataset;
# re-running the training cell afterwards requires reloading the data first.
df = pd.DataFrame(errors).set_index('name')
# Fix the column order of the metrics shown in the table.
df = df[['tp', 'tn', 'fp', 'fn', 're', 'pod', 'far', 'acc', 'precission', 'recall', 'f1', 'kappa', 'me', 'tse', 'mse']]
# Use the fully-qualified option key: the bare 'precision' pattern is ambiguous
# (and raises OptionError) on pandas versions that also define
# 'styler.format.precision'.
pd.set_option('display.precision', 4)

In [None]:
# Display `df`; after the preceding cell it holds the error table built from `errors`.
df