# Selección final de variables

In [1]:
%matplotlib notebook

In [2]:
import os
import json
import numpy as np
import pandas as pd
import datetime as dt

In [3]:
### Realizamos el cambio de directoroi de trabajo al "Directorio Base" que se
current_dir = os.getcwd()
base_path = os.path.dirname(current_dir)

os.chdir(base_path)

In [4]:
import scripts.funciones as funciones



In [5]:
from sklearn import metrics
from sklearn.base import clone
from xgboost import XGBClassifier
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Carga y preparación de datos

In [6]:
d_ini = dt.datetime(2017,6,1)
d_fin = dt.datetime(2019,8,1) 

In [7]:
%%time

version = 'verFinal'

### params
freq1 = '1D'
freq2 = '3D'
freq3 = '7D'
freq4 = '14D'
freq5 = '30D'
freq6 = '60D'

### Realizamos la lectura de la informacion climatica en el rango de fechas
### especificado, incluye la etiqueta de si ocurre o no un accidente. 
### Posteriormente, en la organizacion de la informacion climatica, lo
### que se hace es agregar las variables con la informacion distribucional
### de las ultimas 5 horas de la info climatica
data = funciones.read_clima_accidentes(d_ini, d_fin, poblado = True)
data_org = funciones.organizar_data_infoClima(data)


### agregamos la informacion relacionada a la cantidad de accidentes ocurridas
### en las ultimas X horas
### Agregar senales
senales = [freq1, freq2, freq3, freq4, freq5, freq6]
d_ini_acc = d_ini - dt.timedelta(days = int(freq6.replace('D', '')))  ### freq mayor
raw_accidentes = funciones.read_accidentes(d_ini_acc, d_fin)
for fresen in senales:
    data_org = funciones.obtener_accidentes_acumulados(data_org, 
                                                        raw_accidentes, 
                                                        freq = fresen)


### Convertimos la bariable de Barrios en variable dummy para ser incluida
### en el modelo
data_org['poblado'] = data_org['BARRIO']
data_org= pd.get_dummies(data_org, columns=['poblado'])

### Relizamos la particion del conjunto de datos en las variables
### explicativas (X) y la variable respuesta (Y)
X = data_org.drop(columns = ['TW','BARRIO','Accidente','summary'])
Y = data_org['Accidente'] 

Wall time: 36.4 s


In [8]:
### Dividimos el conjunto de datos en entrenamiento y validacion
X_train, X_val, Y_train, Y_val = train_test_split(X, 
                                                  Y,
                                                  stratify = Y,
                                                  test_size = 0.2,
                                                  random_state = 42)

In [9]:
%%time
### Realizamos resampling combinando Tomek Links y Random Undersampling

### Tomek Link
tomeklinks = TomekLinks()
X_tom, y_tom = tomeklinks.fit_sample(X_train, Y_train)

### Random Undersampling
rus = RandomUnderSampler(sampling_strategy = 30/70,random_state = 42)
X_train_under, Y_train_under = rus.fit_sample(X_train, Y_train)

Wall time: 6min 22s


In [10]:
### Estandarizacion del conjunto de datos
scaler = StandardScaler()
scaler.fit(X_train_under)

X_train_under_z = pd.DataFrame(scaler.transform(X_train_under), columns = X_train_under.columns)
X_val_z = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)

# Entrenamiento de modelos con las variables seleccionadas por los 3 métodos del Notebook Feature Selection

In [11]:
Classifier1 = XGBClassifier(n_estimators = 300, 
                           max_depth = 2,
                           random_state = 42)

Classifier2 = RandomForestClassifier(bootstrap=False,  
                             criterion='entropy',
                             max_features='auto',
                             n_estimators=500, 
                             max_depth=10,
                             random_state=42,
                             warm_start=False)

Classifier3 = LogisticRegression()

clasificadores = [Classifier1, Classifier2, Classifier3]

In [12]:
file_name = 'analisis_var_relevantes.json'
path = os.path.join(base_path, f'models/verFinal/{file_name}')
with open(path, 'r') as f:
    info_vars = json.load(f)

vars_backward = info_vars['forward']['features']
vars_lasso = info_vars['lasso']['features']
vars_AG = info_vars['AG']['features']

In [15]:
def obtencion_metricas(clasificadores,variables,X,Y,X_val,Y_val):
    
    ROC = []
    PR = []
    precision = []
    recall = []
    fscore = []
    bAccuracY = []

    for clf in clasificadores:
        clf_mod = clone(clf)
        clf_mod.fit(X[variables],Y)

        ### Metricas en validation

        preds_val = clf_mod.predict_proba(X_val[variables])
        labels_val = clf_mod.predict(X_val[variables])

        ROC_mod = metrics.roc_auc_score(Y_val,preds_val[:,1])
        PR_mod = funciones.precision_recall_auc_score(Y_val,preds_val[:,1])
        precision_mod = metrics.precision_score(Y_val,labels_val)
        recall_mod = metrics.recall_score(Y_val,labels_val)
        f1_mod = metrics.f1_score(Y_val,labels_val)
        bAccuracY_mod = metrics.balanced_accuracy_score(Y_val,labels_val)

        ROC.append(ROC_mod)
        PR.append(PR_mod)
        precision.append(precision_mod) 
        recall.append(recall_mod)
        fscore.append(f1_mod)
        bAccuracY.append(bAccuracY_mod)


    print(f'Mean ROC: {np.mean(ROC)}')
    print(f'Mean PR: {np.mean(PR)}')
    print(f'Mean Precision: {np.mean(precision)}')
    print(f'Mean Recall: {np.mean(recall)}')
    print(f'Mean F Score: {np.mean(fscore)}')
    print(f'Mean Balanced Accuracy: {np.mean(bAccuracY)}')
    
    
    return None

### Modelo vars Lasso

In [16]:
obtencion_metricas(clasificadores,vars_lasso,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7757022987898026
Mean PR: 0.05939209824665229
Mean Precision: 0.06630422448416058
Mean Recall: 0.4176272814601345
Mean F Score: 0.11424587305043066
Mean Balanced Accuracy: 0.6536123142806112


### Modelo vars backward

In [17]:
obtencion_metricas(clasificadores,vars_backward,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7784974009479866
Mean PR: 0.05853722797815569
Mean Precision: 0.07134124808884765
Mean Recall: 0.32372718539865514
Mean F Score: 0.11234791913566178
Mean Balanced Accuracy: 0.6213084789940312


### Modelo vars Algoritmo Genetico

In [18]:
obtencion_metricas(clasificadores,vars_AG,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7699717666036646
Mean PR: 0.05848248924610744
Mean Precision: 0.06581857686046362
Mean Recall: 0.4068203650336215
Mean F Score: 0.11324722719044937
Mean Balanced Accuracy: 0.6493277107914829


### Union de las variables de los 3 metodos

In [19]:
vars_union = list(set.union(set(vars_AG),set(vars_backward),set(vars_lasso)))
obtencion_metricas(clasificadores,vars_union,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7754382893212016
Mean PR: 0.05929180649204935
Mean Precision: 0.06647186043821542
Mean Recall: 0.4166666666666667
Mean F Score: 0.11442794780618805
Mean Balanced Accuracy: 0.6534218306979587


### Interseccion de los 3 metodos

In [20]:
vars_intersec = list(set.intersection(set(vars_AG),set(vars_backward),set(vars_lasso)))
obtencion_metricas(clasificadores,vars_intersec,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.74808948430526
Mean PR: 0.04954900217414624
Mean Precision: 0.06816101388965916
Mean Recall: 0.21373679154658984
Mean F Score: 0.0955604698774314
Mean Balanced Accuracy: 0.5764683490420133


### Voto de los  3 metodos 

In [21]:
all_vars = pd.DataFrame(X_train.columns.values, columns = ['vars'])
all_vars['lasso'] = all_vars['vars'].apply(lambda x: 1 if x in vars_lasso else 0)
all_vars['backward'] = all_vars['vars'].apply(lambda x: 1 if x in vars_backward else 0)
all_vars['AG'] = all_vars['vars'].apply(lambda x: 1 if x in vars_AG else 0)

all_vars['Votos'] = all_vars['lasso'] + all_vars['backward'] + all_vars['AG']
vars_voto = list(all_vars[all_vars['Votos']>=2]['vars'].values)

obtencion_metricas(clasificadores,vars_voto,X_train_under_z,Y_train_under,X_val_z,Y_val)

Mean ROC: 0.7775714076139798
Mean PR: 0.0594891099789445
Mean Precision: 0.0681817573505847
Mean Recall: 0.4121037463976946
Mean F Score: 0.11652741769906107
Mean Balanced Accuracy: 0.6528119125609653


# Guardamos los resultados

In [22]:
Resultados = {'lasso':{'num_features':len(vars_lasso),
                       'features':list(vars_lasso)},
              'forward':{'num_features':len(vars_backward),
                       'features':vars_backward},
              'AG':{'num_features':len(vars_AG),
                    'features':vars_AG},
              'union':{'num_features':len(vars_union),
                    'features':vars_union},
              'interseccion':{'num_features':len(vars_intersec),
                    'features':vars_intersec},
              'voto':{'num_features':len(vars_voto),
                    'features':vars_voto}
              }


with open(f'{base_path}/models/{version}/vars_relevantes_final.json','w') as json_file:
    json.dump(Resultados, json_file)