In [1]:
import os
import sys
import warnings

import pdb
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from evolutionary_search import EvolutionaryAlgorithmSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import recall_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn import tree

from utils import MySet

from utils import local_data
from utils import window
from utils import Scale, give_error
from utils import generate_and_avaliate_model

from utils import location_station, find_set_sunrise, find_set_sunset

#%matplotlib inline
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# A single font size shared by the legend, base text, axis labels and ticks.
latter_size = 14
plt.rcParams.update({
    'legend.fontsize': latter_size,
    'font.size': latter_size,
    'axes.labelsize': latter_size,
    'xtick.labelsize': latter_size,
    'ytick.labelsize': latter_size,
})

In [2]:
# Load the dataset from a local pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('./data/sj2_analise_update2_drop.pkl')

In [3]:
# Show the column labels available in the loaded frame.
df.columns

Index(['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2',
       'gvtec2_dt', 'gvtec3', 'gvtec3_dt', 's4', 'state_night', 'state_dawn',
       'vm1', 'vd1', 'vm2', 'vd2', 'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20',
       'vtec_dt_lag_3', 'vtec_i/vtec_i-1', 'roti_3', 'roti_5', 'roti_7',
       'roti_9', 'roti_11', 'roti_13', 'gvtec1/gvtec2', 'gvtec1_dt/gvtec2_dt',
       'doy', 'ut', 'discretize_s4', 'discretize_s4_02', 'discretize_s4_03',
       'discretize_s4_04', 'discretize_s4_05', 'discretize_s4_06',
       'discretize_s4_07'],
      dtype='object')

In [4]:
# Feature columns used as model inputs: every column displayed by `df.columns`
# except 's4' and the 'discretize_s4*' columns.
original_features = [
    'vtec', 'vtec_dt', 'vtec_dt2',
    'gvtec1', 'gvtec1_dt', 'gvtec2', 'gvtec2_dt', 'gvtec3', 'gvtec3_dt',
    'state_night', 'state_dawn',
    'vm1', 'vd1', 'vm2', 'vd2',
    'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20', 'vtec_dt_lag_3',
    'vtec_i/vtec_i-1',
    'roti_3', 'roti_5', 'roti_7', 'roti_9', 'roti_11', 'roti_13',
    'gvtec1/gvtec2', 'gvtec1_dt/gvtec2_dt',
    'doy', 'ut',
]
original = MySet('original', original_features)

In [None]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
n_estimators = [int(x) for x in np.linspace(start=100, stop=3000, num=15)]
learning_rate = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3]
max_depth = [int(x) for x in np.linspace(5, 20, num=10)]
subsample = [0.3, 0.5, 0.75, 1.0]
colsample_bytree = [.3, .5, .7, .9, 1.0]
colsample_bylevel = [.3, .5, .7, .9, 1.0]
min_child_weight = list(range(20))
gamma = [.3, .5, .7, .9, 1.0]
# Plain integer: the previous trailing comma turned this into the tuple (7,).
num_class = 7

# Search space keyed by '<pipeline step>__<parameter>' so every value is
# routed to the 'model' step of the pipeline. Single-element lists pin a
# parameter to one fixed value.
param_grid = {
    'model__n_estimators': n_estimators,
    # 'model__max_depth' used to be listed twice; a dict literal keeps only
    # the last occurrence, so a single entry is equivalent.
    'model__max_depth': max_depth,
    'model__learning_rate': learning_rate,
    'model__subsample': subsample,
    'model__colsample_bytree': colsample_bytree,
    'model__colsample_bylevel': colsample_bylevel,
    'model__min_child_weight': min_child_weight,
    'model__gamma': gamma,
    'model__num_class': [num_class],
    'model__objective': ["multi:softmax"],
    # NOTE(review): XGBoost documents this parameter as `eval_metric`;
    # `metric` may be ignored by the booster - confirm before relying on it.
    'model__metric': ["mlogloss"],
}

In [None]:
# Select the feature matrix and the target vector.
# NOTE(review): if `original.set` is a plain Python set, the column order of X
# can differ between kernel sessions - confirm against MySet.
X = df[list(original.set)].values
y = df['discretize_s4'].values

# Macro-averaged recall, wrapped as a scorer object for the search.
recall_inbalanced_score = make_scorer(recall_score, average='macro')

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

# Shuffle the training rows. A dedicated seeded generator keeps the row order
# (and therefore the unshuffled CV folds built from it) identical across runs;
# the previous unseeded permutation produced a different order every time.
order = np.random.RandomState(42).permutation(len(X_train))
X_train = X_train[order]
y_train = y_train[order]

# Two-step pipeline: standardize the features, then fit the XGBoost classifier.
estimators = [
    ('standardize', StandardScaler()),
    ('model', XGBClassifier()),
]
pipeline = Pipeline(estimators)

In [None]:
# Settings for the evolutionary hyper-parameter search, collected in one place
# and unpacked into the constructor below.
search_settings = {
    'estimator': pipeline,
    'params': param_grid,
    'scoring': recall_inbalanced_score,
    'cv': StratifiedKFold(n_splits=10),   # 10 stratified folds
    'verbose': 1,
    'population_size': 500,
    'gene_mutation_prob': 0.10,
    'gene_crossover_prob': 0.5,
    'tournament_size': 3,
    'generations_number': 100,
    'n_jobs': 8,
}
clf = EvolutionaryAlgorithmSearchCV(**search_settings)

In [None]:
# Run the hyper-parameter search on the training split.
clf.fit(X_train, y_train)

Types [1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1] and maxint [14, 9, 8, 3, 4, 4, 19, 4, 0, 0, 0] detected
--- Evolve in 13500000 possible combinations ---


In [None]:
# Exhaustive grid-search alternative, kept for reference:
#clf = GridSearchCV(estimator=pipeline,
#                   param_grid=param_grid,
#                   cv=StratifiedKFold(n_splits=10),
#                   verbose=2,
#                   n_jobs=-1,
#                   scoring=recall_inbalanced_score)

# Fit only when `clf` has no fitted best estimator yet, so the search is not
# repeated in full when it has already been run in an earlier cell.
if getattr(clf, 'best_estimator_', None) is None:
    clf.fit(X_train, y_train)

# Parameters of the refitted pipeline, then of its 'model' step alone.
best_parameters_estimator = clf.best_estimator_.get_params()
best_parameters_model = best_parameters_estimator['model'].get_params()

# Persist both representations, one per line, so each line can be parsed on
# its own: line 1 is the Python repr, line 2 is JSON. Previously the two were
# written back-to-back with no separator.
with open('./data/xgboost_search_all_parameters.txt', 'w') as fh:
    fh.write(str(best_parameters_model) + '\n')
    # default=str keeps the dump from failing on values json cannot encode.
    fh.write(json.dumps(best_parameters_model, default=str) + '\n')