In [1]:
import os
import sys
import warnings
import pprint

import pdb
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
from evolutionary_search import EvolutionaryAlgorithmSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import recall_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn import tree

from utils import MySet

from utils import local_data
from utils import window
from utils import Scale, give_error
from utils import generate_and_avaliate_model

from utils import location_station, find_set_sunrise, find_set_sunset

#%matplotlib inline
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# A single size (in points) shared by all text elements of the figures.
latter_size = 14
plt.rcParams.update({
    'legend.fontsize': latter_size,
    'font.size': latter_size,
    'axes.labelsize': latter_size,
    'xtick.labelsize': latter_size,
    'ytick.labelsize': latter_size,
})

In [2]:
# Load the analysis DataFrame from a pickle file located relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('./data/sj2_analise_update2_drop.pkl')

In [3]:
# Show the column labels of the loaded frame (used to pick features and targets).
df.columns

Index(['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2',
       'gvtec2_dt', 'gvtec3', 'gvtec3_dt', 's4', 'state_night', 'state_dawn',
       'vm1', 'vd1', 'vm2', 'vd2', 'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20',
       'vtec_dt_lag_3', 'vtec_i/vtec_i-1', 'roti_3', 'roti_5', 'roti_7',
       'roti_9', 'roti_11', 'roti_13', 'gvtec1/gvtec2', 'gvtec1_dt/gvtec2_dt',
       'doy', 'ut', 'discretize_s4', 'discretize_s4_02', 'discretize_s4_03',
       'discretize_s4_04', 'discretize_s4_05', 'discretize_s4_06',
       'discretize_s4_07'],
      dtype='object')

In [4]:
# Predictor columns: per the column listing displayed above, this is every
# column of `df` except the raw 's4' value and the 'discretize_s4*' columns.
instance_set = [
    'vtec', 'vtec_dt', 'vtec_dt2',
    'gvtec1', 'gvtec1_dt',
    'gvtec2', 'gvtec2_dt',
    'gvtec3', 'gvtec3_dt',
    'state_night', 'state_dawn',
    'vm1', 'vd1', 'vm2', 'vd2',
    'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20', 'vtec_dt_lag_3',
    'vtec_i/vtec_i-1',
    'roti_3', 'roti_5', 'roti_7', 'roti_9', 'roti_11', 'roti_13',
    'gvtec1/gvtec2', 'gvtec1_dt/gvtec2_dt',
    'doy', 'ut',
]

# Feature matrix and the two target columns used by the ranking cells.
X = df[instance_set]
y, y_02 = df['discretize_s4'], df['discretize_s4_02']

In [5]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# Pair each feature name with its ANOVA F-statistic (first item returned by
# f_classif) against `y`, then rank from highest to lowest score.
list_values = sorted(
    zip(instance_set, f_classif(X, y)[0]),
    key=get_second,
    reverse=True,
)
list_values

[('roti_7', 325.7315317090369),
 ('roti_9', 323.60447094306215),
 ('roti_11', 309.2890513883844),
 ('roti_5', 309.21338059410175),
 ('roti_13', 291.8123150966424),
 ('roti_3', 273.9426519331049),
 ('vtec', 227.36301319964252),
 ('state_dawn', 211.04208045223467),
 ('gvtec3', 116.80024647265309),
 ('gvtec1', 83.61454229934513),
 ('vm2', 53.15573302833452),
 ('vm1', 46.841509939300956),
 ('gvtec2_dt', 43.9584035004517),
 ('gvtec3_dt', 32.84418631024842),
 ('gvtec2_dt_lag_20', 32.78575595976093),
 ('gvtec1_dt', 23.71147165585604),
 ('vtec_dt2', 20.891143490217956),
 ('gvtec2', 16.40297659519498),
 ('doy', 15.596739424227643),
 ('vd2', 15.066579158692512),
 ('vd1', 12.273279053429878),
 ('gvtec1_dt_lag_9', 11.711179219412795),
 ('state_night', 7.203374628928005),
 ('vtec_dt', 5.906751356809582),
 ('vtec_dt_lag_3', 5.133443211012448),
 ('ut', 2.857059356830323),
 ('gvtec1/gvtec2', 0.3699949328225962),
 ('vtec_i/vtec_i-1', 0.31086050526018355),
 ('gvtec1_dt/gvtec2_dt', 0.15942603638325822)]

In [6]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# mutual_info_classif is a randomised estimator; without a fixed
# `random_state` the scores (and therefore the ranking shown below) change on
# every run. Pin the seed so Restart & Run All reproduces the same output.
mi_scores = mutual_info_classif(X, y, random_state=42)

# Pair each feature name with its mutual-information score against `y` and
# rank from highest to lowest.
list_values = sorted(
    zip(instance_set, mi_scores),
    key=get_second,
    reverse=True,
)
list_values

[('ut', 0.12979874049186968),
 ('vm1', 0.09388257548241707),
 ('vtec', 0.09340969648486208),
 ('vm2', 0.08862631485431183),
 ('vd2', 0.08694139319567817),
 ('roti_13', 0.0829668911880832),
 ('vd1', 0.08157121504044396),
 ('state_dawn', 0.07966430018746995),
 ('roti_7', 0.07842818739059965),
 ('roti_11', 0.07799649817203425),
 ('roti_9', 0.07783080443672508),
 ('roti_5', 0.07374194022705316),
 ('vtec_dt2', 0.05765243451342439),
 ('roti_3', 0.05511304449435572),
 ('gvtec1', 0.04989688552147831),
 ('gvtec1_dt', 0.04510384261076572),
 ('gvtec3_dt', 0.04470271699894046),
 ('gvtec3', 0.04138274991805635),
 ('gvtec2_dt', 0.04094602137781811),
 ('gvtec2', 0.039294519900262115),
 ('vtec_dt', 0.03691028345242664),
 ('vtec_dt_lag_3', 0.03357827133457514),
 ('gvtec1_dt_lag_9', 0.03309170837160669),
 ('doy', 0.031132399110235687),
 ('gvtec1/gvtec2', 0.018121443187602004),
 ('gvtec2_dt_lag_20', 0.01712913136429961),
 ('vtec_i/vtec_i-1', 0.013393697475643895),
 ('gvtec1_dt/gvtec2_dt', 0.0024801282297

In [7]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# Extra-trees draws random splits, so an unseeded fit yields different
# importances on every run. Fix `random_state` to make the ranking below
# reproducible. `fit` returns the estimator itself, so chaining is equivalent
# to fitting and re-assigning.
clf = ExtraTreesClassifier(n_estimators=1500, random_state=42).fit(X, y)

# Pair each feature name with its importance for target `y` and rank from
# highest to lowest.
list_values = sorted(
    zip(instance_set, clf.feature_importances_),
    key=get_second,
    reverse=True,
)
list_values

[('ut', 0.06954003688076411),
 ('vtec', 0.05638201134366991),
 ('state_dawn', 0.054925581115404715),
 ('gvtec2', 0.045187656509418306),
 ('gvtec2_dt_lag_20', 0.04155247247804275),
 ('doy', 0.04133808644484073),
 ('vm1', 0.03897586048142016),
 ('vd1', 0.03874551073160718),
 ('vm2', 0.0377645808210562),
 ('gvtec3', 0.03773006511203873),
 ('vd2', 0.0372121501244612),
 ('roti_13', 0.03583426791960907),
 ('roti_11', 0.03510263907333308),
 ('gvtec2_dt', 0.034565237205532104),
 ('gvtec1', 0.033654050880267766),
 ('roti_9', 0.03267593422934672),
 ('vtec_dt2', 0.02981369509143521),
 ('roti_7', 0.029618643895448306),
 ('gvtec3_dt', 0.029478177467156367),
 ('gvtec1_dt_lag_9', 0.029452740976723058),
 ('gvtec1_dt', 0.028954152483726063),
 ('roti_5', 0.028143826130771547),
 ('vtec_dt', 0.025299161066362548),
 ('roti_3', 0.025244970099644784),
 ('vtec_dt_lag_3', 0.024718671319152424),
 ('vtec_i/vtec_i-1', 0.022417180960545464),
 ('gvtec1/gvtec2', 0.022085856391162426),
 ('gvtec1_dt/gvtec2_dt', 0.0190

In [8]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# Pair each feature name with its ANOVA F-statistic (first item returned by
# f_classif) against `y_02`, then rank from highest to lowest score.
list_values = sorted(
    zip(instance_set, f_classif(X, y_02)[0]),
    key=get_second,
    reverse=True,
)
list_values

[('roti_7', 977.6504209996854),
 ('roti_9', 976.0161728971974),
 ('roti_11', 935.5874450672201),
 ('roti_5', 926.279402911366),
 ('roti_13', 880.3120957474953),
 ('vtec', 876.9023583016897),
 ('state_dawn', 843.6958979700449),
 ('roti_3', 825.0253731079194),
 ('gvtec3', 406.14954546072596),
 ('gvtec1', 277.22028886272346),
 ('vm2', 199.44475844313595),
 ('vm1', 180.20749876266166),
 ('gvtec2_dt', 67.2810915180356),
 ('vtec_dt2', 45.52297564544421),
 ('gvtec1_dt', 31.180492461984638),
 ('vd1', 28.635350208482198),
 ('gvtec3_dt', 28.123204765055924),
 ('state_night', 25.483548153426707),
 ('gvtec2', 24.46640489027514),
 ('vtec_dt', 8.363988332198902),
 ('doy', 7.728248131174271),
 ('ut', 6.941002807259078),
 ('gvtec2_dt_lag_20', 1.5650025681322743),
 ('gvtec1_dt_lag_9', 1.1455521697029458),
 ('vtec_i/vtec_i-1', 1.0686292793496386),
 ('vd2', 0.5406384202579524),
 ('gvtec1/gvtec2', 0.3170342825058654),
 ('gvtec1_dt/gvtec2_dt', 0.047453311812210464),
 ('vtec_dt_lag_3', 0.0004739694169829366

In [9]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# mutual_info_classif is a randomised estimator; without a fixed
# `random_state` the scores (and therefore the ranking shown below) change on
# every run. Pin the seed so Restart & Run All reproduces the same output.
mi_scores = mutual_info_classif(X, y_02, random_state=42)

# Pair each feature name with its mutual-information score against `y_02` and
# rank from highest to lowest.
list_values = sorted(
    zip(instance_set, mi_scores),
    key=get_second,
    reverse=True,
)
list_values

[('ut', 0.12260575941353768),
 ('vtec', 0.08130622893895656),
 ('state_dawn', 0.08047500930304707),
 ('roti_7', 0.07357764092363084),
 ('roti_13', 0.07015456206494419),
 ('roti_9', 0.06992326655616443),
 ('vd1', 0.06987765572123505),
 ('vm1', 0.06819104132540654),
 ('roti_5', 0.06784387966953354),
 ('vm2', 0.06723925893183758),
 ('roti_11', 0.06487189069563115),
 ('vd2', 0.061888827458254525),
 ('vtec_dt2', 0.052488220370088934),
 ('roti_3', 0.05154671645514308),
 ('gvtec1', 0.04322840908698833),
 ('gvtec1_dt', 0.03980419288739623),
 ('gvtec2_dt', 0.03893003933510486),
 ('gvtec3', 0.0384954938423443),
 ('gvtec3_dt', 0.03845259355046404),
 ('vtec_dt', 0.02998284431012177),
 ('gvtec2', 0.02994557307190915),
 ('vtec_dt_lag_3', 0.026044911536033943),
 ('gvtec1_dt_lag_9', 0.025395702719103408),
 ('doy', 0.02106166392203268),
 ('gvtec1/gvtec2', 0.01956974674840839),
 ('vtec_i/vtec_i-1', 0.009608798524247009),
 ('gvtec2_dt_lag_20', 0.007193129494839523),
 ('gvtec1_dt/gvtec2_dt', 0.00070494751

In [10]:
def get_second(value):
    """Return the element at index 1 (the score of a ``(feature, score)`` pair)."""
    return value[1]


# Extra-trees draws random splits, so an unseeded fit yields different
# importances on every run. Fix `random_state` to make the ranking below
# reproducible. `fit` returns the estimator itself, so chaining is equivalent
# to fitting and re-assigning.
clf = ExtraTreesClassifier(n_estimators=1500, random_state=42).fit(X, y_02)

# Pair each feature name with its importance for target `y_02` and rank from
# highest to lowest.
list_values = sorted(
    zip(instance_set, clf.feature_importances_),
    key=get_second,
    reverse=True,
)
list_values

[('ut', 0.07691010722660797),
 ('state_dawn', 0.07033112104875351),
 ('vtec', 0.06176795913918204),
 ('gvtec2', 0.045976409185084395),
 ('doy', 0.04119562504145503),
 ('vm2', 0.0403340911694182),
 ('vm1', 0.04030963295869502),
 ('gvtec2_dt_lag_20', 0.04016289892113833),
 ('vd1', 0.037808714461720724),
 ('gvtec3', 0.03588675644306747),
 ('roti_13', 0.035276530468677854),
 ('vd2', 0.03522649015883022),
 ('roti_11', 0.03411560530176702),
 ('roti_9', 0.03318137967874845),
 ('gvtec1', 0.03156315892658043),
 ('gvtec2_dt', 0.030631935828002416),
 ('roti_7', 0.030614632685897106),
 ('roti_5', 0.0291413556565589),
 ('vtec_dt2', 0.02786915148517794),
 ('gvtec3_dt', 0.027795195207966382),
 ('gvtec1_dt', 0.02764942939664423),
 ('roti_3', 0.02552843891141347),
 ('gvtec1_dt_lag_9', 0.02478322506624003),
 ('vtec_dt', 0.022647133183193533),
 ('vtec_dt_lag_3', 0.022100591185604535),
 ('vtec_i/vtec_i-1', 0.019183486315710173),
 ('gvtec1/gvtec2', 0.019039418852771536),
 ('state_night', 0.0166142958592124