In [1]:
import os
import sys
import warnings

import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from evolutionary_search import EvolutionaryAlgorithmSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import recall_score, make_scorer, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn import tree

from utils import MySet

from utils import local_data
from utils import window
from utils import Scale, give_error
from utils import generate_and_avaliate_model

from utils import location_station, find_set_sunrise, find_set_sunset

#%matplotlib inline
warnings.filterwarnings('ignore')

latter_size = 14
plt.rcParams['legend.fontsize'] = latter_size 
plt.rcParams['font.size'] = latter_size 
plt.rcParams['axes.labelsize'] = latter_size
plt.rcParams['xtick.labelsize'] = latter_size
plt.rcParams['ytick.labelsize'] = latter_size

In [2]:
df = pd.read_pickle('./data/sj2_analise_update2_drop.pkl')

In [3]:
df.columns

Index(['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2',
       'gvtec2_dt', 'gvtec3', 'gvtec3_dt', 's4', 'state_night', 'state_dawn',
       'vm1', 'vd1', 'vm2', 'vd2', 'gvtec1_dt_lag_9', 'gvtec2_dt_lag_20',
       'vtec_dt_lag_3', 'vtec_i/vtec_i-1', 'roti_3', 'roti_5', 'roti_7',
       'roti_9', 'roti_11', 'roti_13', 'gvtec1/gvtec2', 'gvtec1_dt/gvtec2_dt',
       'doy', 'ut', 'discretize_s4', 'discretize_s4_02', 'discretize_s4_03',
       'discretize_s4_04', 'discretize_s4_05', 'discretize_s4_06',
       'discretize_s4_07'],
      dtype='object')

In [4]:
original = MySet('original', ['vtec', 'vtec_dt', 'vtec_dt2', 'gvtec1', 'gvtec1_dt', 'gvtec2',
       'gvtec2_dt', 'gvtec3', 'gvtec3_dt', 'state_night', 'state_dawn', 'vm1', 'vd1', 'vm2', 'vd2', 'gvtec1_dt_lag_9',
       'gvtec2_dt_lag_20', 'vtec_dt_lag_3', 'vtec_i/vtec_i-1', 'roti_3',
       'roti_5', 'roti_7', 'roti_9', 'roti_11', 'roti_13', 'gvtec1/gvtec2',
       'gvtec1_dt/gvtec2_dt', 'doy', 'ut'])

In [5]:
# select data
instance_set = list(original.set)
X = df[instance_set].values
y = df['discretize_s4'].values

recall_inbalanced_score = make_scorer(recall_score, average='macro')

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.30, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)
       
# suffle the train data
order = np.random.permutation(len(X_train))
X_train = np.array([X_train[i] for i in order])
y_train = np.array([y_train[i] for i in order])

mod = XGBClassifier(num_class=7, objective="multi:softmax", metric="mlogloss")
# implement Kfold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True)
errors = []
conf_matrix = np.zeros((7, 7))
for train_index, test_index in kf.split(X_train, y_train):
    # generate standardize transformation for (x,y)
    X_scaler = StandardScaler() # transformation for X
    X_scaler.fit(X_train[train_index])
        
    mod.fit(X_scaler.transform(X_train[train_index]),
            y_train[train_index])
    
    # use the final model to avaliate the error in a sample of the time series
    X_validate = X_scaler.transform(X_test)
    
    predito = mod.predict(X_validate)
    conf_matrix += confusion_matrix(y_test, predito, labels=[0, 1, 2, 3, 4, 5, 6] )

conf_matrix /= 10.0
print(conf_matrix)

list_values = []
for feature, feat_importance in zip(instance_set, mod.feature_importances_):
    list_values.append((feature, feat_importance))
    print((feature, feat_importance))
    
def get_second(value):
    return value[1]

list_values.sort(key=get_second, reverse=True)

list_values

[[1.4777e+03 1.6600e+01 7.0000e-01 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [6.0800e+01 8.2800e+01 1.3000e+00 1.0000e-01 0.0000e+00 0.0000e+00
  0.0000e+00]
 [1.1500e+01 1.4300e+01 1.3600e+01 6.0000e-01 0.0000e+00 0.0000e+00
  0.0000e+00]
 [1.6000e+00 1.0000e+00 2.4000e+00 4.9000e+00 1.0000e-01 0.0000e+00
  0.0000e+00]
 [0.0000e+00 8.0000e-01 1.0000e-01 1.0000e-01 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]]
('roti_11', 0.021871204)
('roti_5', 0.038072094)
('gvtec3', 0.051032808)
('gvtec1_dt_lag_9', 0.04981774)
('gvtec1_dt/gvtec2_dt', 0.009315512)
('vtec_dt2', 0.022681247)
('gvtec1', 0.02146618)
('vm2', 0.028756581)
('gvtec2_dt_lag_20', 0.07614419)
('state_dawn', 0.021871204)
('gvtec2', 0.05508303)
('gvtec3_dt', 0.05184285)
('vtec_i/vtec_i-1', 0.019036047)
('doy', 0.07857432)
('vm1', 0.03199676)
('vtec_dt', 0.0076954234)
('vt

[('ut', 0.080599435),
 ('doy', 0.07857432),
 ('gvtec2_dt_lag_20', 0.07614419),
 ('vtec', 0.07492912),
 ('gvtec2', 0.05508303),
 ('gvtec2_dt', 0.052652895),
 ('gvtec3_dt', 0.05184285),
 ('gvtec3', 0.051032808),
 ('gvtec1_dt_lag_9', 0.04981774),
 ('vd2', 0.03928716),
 ('roti_5', 0.038072094),
 ('vd1', 0.036452007),
 ('vm1', 0.03199676),
 ('vm2', 0.028756581),
 ('gvtec1_dt', 0.027946537),
 ('roti_7', 0.02308627),
 ('vtec_dt2', 0.022681247),
 ('roti_11', 0.021871204),
 ('state_dawn', 0.021871204),
 ('gvtec1/gvtec2', 0.021871204),
 ('gvtec1', 0.02146618),
 ('roti_9', 0.019846091),
 ('vtec_i/vtec_i-1', 0.019036047),
 ('roti_13', 0.01782098),
 ('vtec_dt_lag_3', 0.010530579),
 ('roti_3', 0.009720535),
 ('gvtec1_dt/gvtec2_dt', 0.009315512),
 ('vtec_dt', 0.0076954234),
 ('state_night', 0.0)]

In [6]:
# select data
instance_set = list(original.set)
X = df[instance_set].values
y = df['discretize_s4_02'].values

recall_inbalanced_score = make_scorer(recall_score, average='macro')

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.30, 
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)
       
# suffle the train data
order = np.random.permutation(len(X_train))
X_train = np.array([X_train[i] for i in order])
y_train = np.array([y_train[i] for i in order])

mod = XGBClassifier(num_class=2, objective="binary:hinge", metric="logloss")
# implement Kfold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True)
errors = []
conf_matrix = np.zeros((2, 2))
for train_index, test_index in kf.split(X_train, y_train):
    # generate standardize transformation for (x,y)
    X_scaler = StandardScaler() # transformation for X
    X_scaler.fit(X_train[train_index])
        
    mod.fit(X_scaler.transform(X_train[train_index]),
            y_train[train_index])
    
    # use the final model to avaliate the error in a sample of the time series
    X_validate = X_scaler.transform(X_test)
    
    predito = mod.predict(X_validate)
    conf_matrix += confusion_matrix(y_test, predito, labels=[0, 1] )

conf_matrix /= 10.0
print(conf_matrix)

list_values = []
for feature, feat_importance in zip(instance_set, mod.feature_importances_):
    list_values.append((feature, feat_importance))
    print((feature, feat_importance))
    
def get_second(value):
    return value[1]

list_values.sort(key=get_second, reverse=True)

list_values

XGBoostError: b'[19:39:19] src/objective/hinge.cu:50: Check failed: preds.Size() == info.labels_.Size() (7098 vs. 3549) labels are not correctly providedpreds.size=7098, label.size=3549\n\nStack trace returned 10 entries:\n[bt] (0) /var/lib/jupyterhub/anaconda/envs/dscience/lib/libxgboost.so(dmlc::StackTrace()+0x42) [0x7f988c066092]\n[bt] (1) /var/lib/jupyterhub/anaconda/envs/dscience/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f988c066698]\n[bt] (2) /var/lib/jupyterhub/anaconda/envs/dscience/lib/libxgboost.so(xgboost::obj::HingeObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x3d1) [0x7f988c0e9d11]\n[bt] (3) /var/lib/jupyterhub/anaconda/envs/dscience/lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f988c073be2]\n[bt] (4) /var/lib/jupyterhub/anaconda/envs/dscience/lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f988c1c79d5]\n[bt] (5) /var/lib/jupyterhub/anaconda/envs/dscience/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f98c4927b10]\n[bt] (6) /var/lib/jupyterhub/anaconda/envs/dscience/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x47e) [0x7f98c49274df]\n[bt] (7) /var/lib/jupyterhub/anaconda/envs/dscience/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x283) [0x7f98c4b3b7c3]\n[bt] (8) /var/lib/jupyterhub/anaconda/envs/dscience/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x8e6f) [0x7f98c4b32e6f]\n[bt] (9) /var/lib/jupyterhub/anaconda/envs/dscience/bin/../lib/libpython3.6m.so.1.0(_PyObject_FastCallDict+0x8b) [0x7f98cba9fa0b]\n\n'