# Solución tarea 3 - Aprendizaje Supervisado

In [1]:
%matplotlib notebook

In [2]:
import os
import time
import sqlite3
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [5]:
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV

In [6]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score
from sklearn.metrics import balanced_accuracy_score

In [7]:
import logging
from logging.handlers import RotatingFileHandler

# Root-logger setup: all records (DEBUG and above) go to a rotating log file.
file_name = 'proc_models'
logger = logging.getLogger()
dir_log = f'data/logs/{file_name}.log'

# RotatingFileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent folder is missing, so create it up front.
os.makedirs(os.path.dirname(dir_log), exist_ok=True)

# Roll over at roughly 2 MB per file and keep at most 10 old files.
handler = RotatingFileHandler(dir_log, maxBytes=2000000, backupCount=10)
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s - %(process)d - %(name)s - %(levelname)s - %(message)s",
                    handlers = [handler])

### Importación y preparación de los datos

In [8]:
# Open the SQLite file that stores the accident/weather records.
conn = sqlite3.connect('data/data_accidentes.sqlite3')

# Time window of the study: d_ini is inclusive and d_fin exclusive
# (see the >= / < comparisons in the query).
d_ini = dt.datetime(2017,6,1)
d_fin = dt.datetime(2018,1,1)

# The datetimes are interpolated through str(), i.e. as quoted literals such
# as '2017-06-01 00:00:00'. Only rows of the 'AguasFrias' BARRIO are kept.
# NOTE(review): presumably TW is stored in that same text format - confirm.
query = f""" SELECT * FROM
            info
            WHERE
            TW >= '{d_ini}' AND
            TW < '{d_fin}' AND
            BARRIO = 'AguasFrias'
            """

# Read the filtered rows into a DataFrame and parse TW as datetimes so the
# .dt accessors and date comparisons used later work.
data = pd.read_sql_query(query, conn)
data['TW'] = pd.to_datetime(data['TW'])

In [9]:
data['Accidente'].sum()

11.0

In [10]:
### Add extra features
# Hour of day and day of week, both derived from the TW timestamp.
data = data.assign(hora=data['TW'].dt.hour, dia_sem=data['TW'].dt.dayofweek)

# One-hot encode the categorical columns one at a time, in this order, so the
# resulting column layout is: remaining columns, hora_*, icon_*, dia_sem_*.
for columna in ('hora', 'icon', 'dia_sem'):
    data = pd.get_dummies(data, columns=[columna])

In [11]:
### Feature augmentation
# Adds, for every weather variable, a rolling mean over the preceding `freq`
# window as a new "<variable>_mean" column.
freq = '5H'
variables = ['temperature','precipIntensity','apparentTemperature','dewPoint',
             'humidity','windSpeed','cloudCover','visibility']

# Work on a copy indexed (and sorted) by timestamp; time-based rolling windows
# need a datetime-like index.
data_aux = data.copy()
data_aux.index = data_aux.TW
data_aux = data_aux.sort_index()
data_aux = data_aux.drop(columns = 'TW')
# NOTE(review): resample_data is not referenced again in the visible cells of
# this notebook - confirm whether it is still needed.
resample_data = data_aux[variables].rolling(freq, closed = 'left').mean()

# One column per (variable, BARRIO) pair, indexed by TW. closed='left' leaves
# the current timestamp out of its own window, so each mean only uses
# earlier observations.
data_pivot = data_aux.pivot_table(values=variables, index='TW',columns='BARRIO', aggfunc=sum)
# stack() moves BARRIO back into the index; reset_index turns TW and BARRIO
# into the first two regular columns.
data_mean = data_pivot.rolling(freq, closed = 'left').mean().stack().reset_index(drop = False)

# Keep the first two column names untouched (the merge keys) and suffix the
# remaining ones with '_mean' so they do not clash with the raw variables.
col_means = [*data_mean.columns[:2]]
for col in data_mean.columns[2:]:
    col_means.append(col + '_mean')
    
data_mean.columns = col_means

# Attach the rolling means to the main table, then drop every row that
# contains a missing value in any column.
data = data.merge(data_mean, how = 'left', on = ['TW','BARRIO'])
data = data.dropna().reset_index(drop = True)

In [12]:
# Chronological hold-out: rows strictly before the cut-off date are used for
# training, rows on or after it for testing.
fecha_corte = dt.datetime(2017,11,1)
data_train = data.loc[data['TW'] < fecha_corte].reset_index(drop = True)
data_test = data.loc[data['TW'] >= fecha_corte].reset_index(drop = True)

In [13]:
# Training feature matrix and target.
# `columns=` is used instead of a positional axis: passing the axis as the
# second positional argument of DataFrame.drop is rejected by current pandas.
X = data_train.drop(columns="Accidente").reset_index(drop=True)  # feature matrix
y = data_train['Accidente'].reset_index(drop=True)               # target feature
# Discard the two leading columns.
# NOTE(review): presumably these are the TW / BARRIO identifiers - confirm
# against the column order returned by the SELECT *.
X = X[X.columns[2:]]

In [14]:
# Test feature matrix and target.
# `columns=` is used instead of a positional axis: passing the axis as the
# second positional argument of DataFrame.drop is rejected by current pandas.
X_test = data_test.drop(columns="Accidente").reset_index(drop=True)  # feature matrix
y_test = data_test['Accidente'].reset_index(drop=True)               # target feature
# Discard the two leading columns.
# NOTE(review): presumably these are the TW / BARRIO identifiers - confirm
# against the column order returned by the SELECT *.
X_test = X_test[X_test.columns[2:]]

In [15]:
# Stratified 80/20 train/validation split. The seed is fixed (42, like every
# other random component in this notebook) so that the validation set, and
# therefore every score computed on it, is reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [16]:
# Class counts in the training split.
# NOTE(review): assumes y_train is a 0/1 indicator, so its sum is the number
# of positive rows - confirm.
tra_0 = int(len(y_train) - y_train.sum())
tra_1 = int(y_train.sum())

# Desired share of the positive class after undersampling.
prop_deseada_under = 0.5
# Fraction of the negative class that has to be discarded to reach that share.
mul_updown = (tra_0 * prop_deseada_under - tra_1 * (1 - prop_deseada_under)) / (tra_0 * prop_deseada_under)
# Number of negatives to keep; algebraically tra_1 * (1 - p) / p.
# round() before int() so floating-point error in the product (e.g.
# 6.999999...) cannot silently drop one sample through truncation.
fac_1 = int(round(tra_0 * (1 - mul_updown)))

# Explicit per-class targets: keep fac_1 negatives and every positive.
ratio_u = {0 : fac_1, 1 : tra_1}
rus = RandomUnderSampler(sampling_strategy = ratio_u, random_state=42)
# fit_resample is the supported sampler API; the old fit_sample alias was
# removed from imbalanced-learn.
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

In [17]:
X_train_under.shape

(14, 56)

### Entrenamiento y selección de modelos

In [18]:
# Random hidden-layer architectures to be tried by the MLP search.
np.random.seed(42)

# Depth limits: depths go from layer_lim_min to layer_lim_max in steps of 2.
layer_lim_max = 7
layer_lim_min = 4

# Width limits: each layer gets a size in [nodes_lim_min, nodes_lim_max).
nodes_lim_max = 128
nodes_lim_min = 6

# Number of passes over the depth range.
iter_max = 5

# One tuple of layer sizes per (pass, depth) pair. The loops are nested in
# the same order as the draws must happen, so the seeded sequence is stable.
layers_nn = [
    tuple(np.random.randint(nodes_lim_min, nodes_lim_max, n_capas))
    for _ in range(iter_max)
    for n_capas in range(layer_lim_min, layer_lim_max + 1, 2)
]

print(layers_nn)

[(108, 57, 98, 20), (112, 77, 66, 26, 108, 127), (88, 92, 80, 80), (93, 122, 105, 109, 29, 8), (27, 58, 7, 93), (113, 35, 43, 7, 69, 65), (26, 38, 81, 63), (27, 113, 94, 54, 96, 64), (47, 97, 65, 85), (20, 67, 67, 52, 67, 56)]


In [19]:
# Candidate values shared by several ensemble families below.
_N_ESTIMATORS = [10,20,30,40,50,60,70,80,90,100,200,300,400,500]
_MAX_DEPTH = [None, 2, 4, 6, 8, 10, 20, 30]

# Model families to search. Each entry holds the base estimator under 'mod'
# and its hyper-parameter candidates under 'par'.
models = {
    'logistic': {
        'mod': LogisticRegression(random_state = 42),
        'par': {
            # NOTE(review): scikit-learn's lbfgs solver does not support the
            # l1 penalty, so that combination cannot be fitted - confirm this
            # is intended.
            'penalty': ('l1','l2'),
            'solver': ('saga','lbfgs'),
        },
    },
    'ridge_log': {
        'mod': RidgeClassifier(random_state = 42),
        'par': {
            'alpha': [0.2, 0.4, 0.6, 0.8, 1],
            'solver': ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
        },
    },
    'naiveBayes': {
        # No hyper-parameters are searched for this family.
        'mod': GaussianNB(),
        'par': {},
    },
    'bernoulli': {
        'mod': BernoulliNB(),
        'par': {
            'fit_prior': [True, False],
            'alpha': [0,0.2,0.4,0.6,0.8,1],
        },
    },
    'qda': {
        'mod': QuadraticDiscriminantAnalysis(),
        'par': {
            'reg_param': [0,0.3,0.5,0.7,0.9],
        },
    },
    'nn': {
        'mod': MLPClassifier(solver = 'adam', shuffle = True, random_state = 42),
        'par': {
            # Architectures come from the randomly generated layers_nn list.
            'hidden_layer_sizes': layers_nn,
            'activation': ('logistic', 'relu','tanh','identity'),
            'learning_rate_init': [0.001,0.01,0.1,0.3,0.5,0.9],
            'alpha': [0.05, 0.1, 0.5 , 3, 5, 10, 20],
        },
    },
    'rforest': {
        'mod': RandomForestClassifier(random_state = 42),
        'par': {
            'n_estimators': _N_ESTIMATORS,
            'max_depth': _MAX_DEPTH,
            'criterion': ('gini','entropy'),
            'bootstrap': [True,False],
        },
    },
    'xtree': {
        'mod': ExtraTreesClassifier(random_state = 42),
        'par': {
            'n_estimators': _N_ESTIMATORS,
            'max_depth': _MAX_DEPTH,
            'criterion': ('gini','entropy'),
            'bootstrap': [True,False],
        },
    },
    'gradient': {
        'mod': GradientBoostingClassifier(random_state = 42),
        'par': {
            'loss': ('deviance', 'exponential'),
            'n_estimators': _N_ESTIMATORS,
            'max_depth': [3, 4, 5, 6, 7, 8, 9],
            'learning_rate': [0.1,0.3,0.5,0.7,0.9],
        },
    },
    'xgboost': {
        'mod': XGBClassifier(random_state = 42),
        'par': {
            'n_estimators': _N_ESTIMATORS,
            'max_depth': [ 2, 4, 6, 8, 10, 20, 30],
        },
    },
}

In [20]:
# Instance-based / kernel families, using the same 'mod' + 'par' layout.
# NOTE(review): this assignment rebinds `models`, so a dictionary bound to
# that name by an earlier cell is discarded when the notebook runs top to
# bottom - confirm that is intended.
models = {
    'knn': {
        'mod': KNeighborsClassifier(),
        'par': {
            'n_neighbors': [1,3,5,7,10,20,30,50,100,200,300],
            'weights': ('uniform','distance'),
        },
    },
    'SVM': {
        'mod': SVC(tol=0.0001),
        'par': {
            'kernel': ('linear','poly','rbf','sigmoid'),
        },
    },
}

In [21]:
def _report(message):
    """Print *message* and also record it at INFO level on the module logger."""
    print(message)
    logger.info(message)


def grid(base_path, now_date, path_file, X, Y, models, score = 'roc_auc', cv = 2, n_proc = 2, random = False, n_iter = 10, random_state = None):
    """Tune every model family in *models* and report the best one.

    Each family is wrapped in a ``StandardScaler`` + estimator pipeline and
    tuned with cross-validated search. The refitted best pipeline is dumped
    with joblib to ``<base_path>/<path_file>/<name>_<YYYYmmdd_HHMM>.sav``.

    Parameters
    ----------
    base_path, path_file : str
        Joined to form the output folder; it is created when missing.
    now_date : datetime
        Timestamp embedded in the saved file names.
    X, Y :
        Training features and target, passed to the search's ``fit``.
    models : dict
        ``{name: {'mod': estimator, 'par': {param: candidates}}}``.
        The dict is updated in place: each entry gains ``'bestModel'``,
        ``'roc'`` (the best cross-validated score for *score*) and
        ``'selection_time'`` (seconds).
    score : str
        Scoring name forwarded to the search.
    cv : int
        Number of cross-validation folds.
    n_proc : int
        Forwarded as ``n_jobs``.
    random : bool
        Use ``RandomizedSearchCV`` when true, ``GridSearchCV`` otherwise.
    n_iter : int
        Candidates sampled by the randomized search.
    random_state : int or None
        Seed for the randomized search; ``None`` keeps it unseeded.

    Returns
    -------
    (models, mod_name)
        The updated dict and the name of the family with the highest score
        (``None`` when there is no family with a valid score).
    """
    # Create the destination folder up front so a missing directory cannot
    # make joblib.dump fail after a (possibly hours-long) search.
    out_dir = os.path.join(base_path, path_file)
    os.makedirs(out_dir, exist_ok=True)
    stamp = now_date.strftime("%Y%m%d_%H%M")

    for name, spec in models.items():

        t_ini = time.time()

        pipeline = Pipeline([('scaler', StandardScaler()), (name, spec['mod'])])
        # Pipeline parameters are addressed as '<step name>__<parameter>'.
        parameters = {f'{name}__{par}': values for par, values in spec['par'].items()}

        if random:
            mod_aux = RandomizedSearchCV(pipeline, parameters, n_jobs = n_proc,
                                         scoring = score, verbose=1, cv = cv,
                                         n_iter = n_iter, random_state = random_state)
        else:
            mod_aux = GridSearchCV(pipeline, parameters, n_jobs = n_proc,
                                   scoring = score, verbose=1, cv = cv)

        mod_aux.fit(X, Y)
        spec['bestModel'] = mod_aux.best_estimator_
        spec['roc'] = mod_aux.best_score_

        selection_time = time.time() - t_ini
        spec['selection_time'] = selection_time

        sample_f_path = os.path.join(out_dir, f'{name}_{stamp}.sav')
        joblib.dump(spec['bestModel'], sample_f_path)

        _report(f"El tiempo de seleccion fue: {selection_time:0.3f} s")
        _report(f"El ROC de la familia {name} es: {spec['roc']:0.3f}")
        _report('*'*80)

    # Pick the family with the highest score. The first valid score seeds the
    # comparison (instead of a fixed 0) so scorers whose values are never
    # positive, such as 'neg_log_loss', still yield a winner. NaN scores
    # (failed fits) are skipped.
    mod_name = None
    best_roc = 0
    for name, spec in models.items():
        candidate = spec['roc']
        if np.isnan(candidate):
            continue
        if mod_name is None or candidate > best_roc:
            mod_name = name
            best_roc = candidate

    _report(f"El mejor modelo fue: {mod_name} con un ROC de: {best_roc}")

    return models, mod_name

In [None]:
# Randomized search over every family currently held in `models`; the best
# pipeline of each family is saved under <cwd>/data/models.
base_path = os.getcwd()
now_date = dt.datetime.now()
path_file = 'data/models'
models, mod_name = grid(
    base_path=base_path,
    now_date=now_date,
    path_file=path_file,
    X=X_train_under,
    Y=y_train_under,
    models=models,
    score='roc_auc',
    cv=3,
    n_proc=11,
    random=True,
)

### Resultados

El tiempo de seleccion fue: 93.452 s 

El ROC de la familia logistic es: 0.712
********************************************************************************

El tiempo de seleccion fue: 263.394 s

El ROC de la familia ridge_log es: 0.705
********************************************************************************
El tiempo de seleccion fue: 6.242 s

El ROC de la familia naiveBayes es: 0.665
********************************************************************************
El tiempo de seleccion fue: 23.902 s

El ROC de la familia bernoulli es: 0.655
********************************************************************************
El tiempo de seleccion fue: 40.451 s

El ROC de la familia qda es: 0.686
********************************************************************************
El tiempo de seleccion fue: 1523.359 s

El ROC de la familia nn es: 0.714
********************************************************************************
El tiempo de seleccion fue: 1263.119 s

El ROC de la familia rforest es: 0.725
********************************************************************************
El tiempo de seleccion fue: 642.026 s

El ROC de la familia xtree es: 0.707
********************************************************************************
El tiempo de seleccion fue: 6869.806 s

El ROC de la familia gradient es: 0.731
********************************************************************************
El tiempo de seleccion fue: 6037.792 s

El ROC de la familia xgboost es: 0.732
********************************************************************************

In [22]:
# Narrowed search for the family chosen as final model: a random forest with
# bootstrap fixed to True, using the same 'mod' + 'par' layout that grid()
# expects.
modelo_final = {
    'rforest_final': {
        'mod': RandomForestClassifier(random_state = 42, bootstrap = True),
        'par': {
            'n_estimators': [10,20,30,40,50,60,70,80,90,100,300],
            'max_depth': [None, 2, 4, 6, 8, 10, 20],
            'criterion': ('gini','entropy'),
        },
    },
}

In [23]:
# Randomized search (5 sampled candidates) restricted to `modelo_final`.
# The result is bound to `models`, replacing whatever that name held before.
base_path = os.getcwd()
now_date = dt.datetime.now()
path_file = 'data/models'
models, mod_name = grid(
    base_path=base_path,
    now_date=now_date,
    path_file=path_file,
    X=X_train_under,
    Y=y_train_under,
    models=modelo_final,
    score='roc_auc',
    cv=3,
    n_proc=11,
    random=True,
    n_iter=5,
)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.


El tiempo de seleccion fue: 1.823 s
El ROC de la familia rforest_final es: 0.375
********************************************************************************
El mejor modelo fue: rforest_final con un ROC de: 0.375


[Parallel(n_jobs=11)]: Done  10 out of  15 | elapsed:    1.7s remaining:    0.8s
[Parallel(n_jobs=11)]: Done  15 out of  15 | elapsed:    1.7s finished


El tiempo de seleccion fue: 8220.335 s

El ROC de la familia rforest_final es: 0.727

### Desempeño del modelo en el conjunto de prueba

In [None]:
# Reload a previously saved estimator from disk.
# NOTE(review): the file name embeds a fixed timestamp (20200225_2256), so a
# fresh grid() run will not be picked up unless this path is updated. joblib
# files are pickles - only load files you produced yourself.
model = joblib.load('data/models/rforest_final_20200225_2256.sav')

In [None]:
# Hard class predictions, and the second column of predict_proba (the
# probability assigned to the second class in the model's class order), for
# the test set.
predicciones = model.predict(X_test)
probabilidades = model.predict_proba(X_test)[:,1]

In [None]:
X_test

In [None]:
# Test-set metrics. ROC AUC is computed from the predicted probabilities;
# the remaining metrics use the hard class predictions.
fscore = f1_score(y_test, predicciones)
roc = roc_auc_score(y_test, probabilidades)
sensibilidad = recall_score(y_test, predicciones)
precision =precision_score(y_test, predicciones)
b_accuracy = balanced_accuracy_score(y_test, predicciones)

# Report every metric.
print(f'El roc es {roc}')
print(f'La sensibilidad es {sensibilidad}')
print(f'La precision es {precision}')
print(f'El fscore es {fscore}')
print(f'El balanced accuracy es {b_accuracy}')

In [None]:
# Distribution of the predicted probabilities, one violin per true class.
ax = sns.violinplot(x=y_test , y = probabilidades)

In [None]:
# ROC curve of the model on the test set.
fpr, tpr, _ = roc_curve(y_test, probabilidades, drop_intermediate=False)
roc_auc = roc_auc_score(y_test, probabilidades)

fig, ax = plt.subplots()

# Model curve, labelled with its AUC, plus the chance diagonal as reference.
ax.plot(fpr, tpr, color='red', label=f'auc {roc_auc:0.5f}')
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.legend(loc="lower right")

# Axis limits, labels and title in a single call.
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver operating characteristic (ROC)',
)

## Análisis de Sensibilidad 

Modelo: Random Forest

In [None]:
# Sensitivity of the random forest to n_estimators: for each value, record
# the fit time and the ROC AUC on the validation split.
times = []
rocs = []
n_estimators = [10,20,30,40,50,60,70,80,90,100,200,300,]
for n in n_estimators:

    clf = RandomForestClassifier(n_estimators = n, random_state= 42, bootstrap = True)

    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() can be too coarse for fits this short and is
    # affected by system clock adjustments.
    start = time.perf_counter()
    clf.fit(X_train_under, y_train_under)
    times.append(time.perf_counter() - start)

    # Second column of predict_proba, scored on the validation split.
    preds = clf.predict_proba(X_val)[:,1]

    rocs.append(roc_auc_score(y_val, preds))

In [None]:
# Validation ROC AUC as a function of n_estimators.
# NOTE(review): plt.figure returns a Figure, although it is bound to `ax`;
# the string argument is only the figure's label.
ax = plt.figure('roc vs Tiempo Computo - n_estimators')
plt.plot(n_estimators,rocs)
plt.xlabel('n_estimators')
plt.ylabel('ROC_AUC')

In [None]:
# Fit time (seconds) as a function of n_estimators.
# NOTE(review): plt.figure returns a Figure, although it is bound to `ax`.
ax = plt.figure('n_estimators vs Tiempo Computo')
plt.plot(n_estimators,times)
plt.xlabel('n_estimators')
plt.ylabel('Tiempo Computo (seg)')

In [None]:
# Sensitivity of the random forest to max_depth: for each value, record the
# fit time and the ROC AUC on the validation split.
times2 = []
rocs2 = []
max_depth = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
for max_d in max_depth:

    clf = RandomForestClassifier(max_depth = max_d, random_state= 42, bootstrap = True)

    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() can be too coarse for fits this short and is
    # affected by system clock adjustments.
    start = time.perf_counter()
    clf.fit(X_train_under, y_train_under)
    times2.append(time.perf_counter() - start)

    # Second column of predict_proba, scored on the validation split.
    preds = clf.predict_proba(X_val)[:,1]

    rocs2.append(roc_auc_score(y_val, preds))

In [None]:
# Validation ROC AUC as a function of max_depth.
# NOTE(review): plt.figure returns a Figure, although it is bound to `ax`;
# the string argument is only the figure's label.
ax = plt.figure('roc vs Tiempo Computo - max_depth')
plt.plot(max_depth,rocs2)
plt.xlabel('max_depth')
plt.ylabel('ROC_AUC')

In [None]:
# Fit time (seconds) as a function of max_depth.
# NOTE(review): plt.figure returns a Figure, although it is bound to `ax`.
ax = plt.figure('max_depth vs Tiempo Computo')
plt.plot(max_depth,times2)
plt.xlabel('max_depth')
plt.ylabel('Tiempo Computo (seg)')