In [1]:
import os
import json
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
### Switch the working directory to the project's "base directory" (the parent
### of the folder the notebook is launched from) and remember it in `base_path`.
current_dir = os.getcwd()

# Re-running this cell used to climb one more directory level on every
# execution. If the current directory already contains the `scripts` folder we
# are already at the base directory and must stay where we are.
# NOTE(review): assumes `scripts/` sits directly under the base directory, as
# the `import scripts.funciones` statement in the next cell suggests - confirm.
if os.path.isdir(os.path.join(current_dir, 'scripts')):
    base_path = current_dir
else:
    base_path = os.path.dirname(current_dir)

os.chdir(base_path)

In [3]:
import scripts.funciones as funciones
from scripts.clase_model.modelo import Modelo



In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Preparación de datos

In [5]:
# Date window passed to the data readers: start (d_ini) and end (d_fin).
d_ini = dt.datetime(year=2017, month=6, day=1)
d_fin = dt.datetime(year=2019, month=8, day=1)

In [15]:
# ---- Run configuration -------------------------------------------------------
now_date = dt.datetime.now()
version = 'Analisis'

cv = 3  # not used in this cell; presumably cross-validation folds - confirm
freq1, freq2 = '96H', '336H'  # short-term / long-term look-back windows

mod = Modelo(now_date, version, base_path, "")

# ---- Load and organise the weather/accident data -----------------------------
data = funciones.read_clima_accidentes(d_ini, d_fin, poblado=True)
data_org = funciones.organizar_data_infoClima(data)

# ---- Number of accidents that occurred in the last X hours -------------------
# The accident history is read starting `freq2` hours before d_ini.
horas_largo_plazo = int(freq2.replace('H', ''))
d_ini_acc = d_ini - dt.timedelta(hours=horas_largo_plazo)
raw_accidentes = funciones.read_accidentes(d_ini_acc, d_fin)

# First pass adds the short-term signal (freq1), second the long-term one (freq2).
for freq in (freq1, freq2):
    data_org = funciones.obtener_accidentes_acumulados(data_org,
                                                       raw_accidentes,
                                                       freq=freq)

# One-hot encode a copy of BARRIO under the name 'poblado'; BARRIO itself is
# kept in data_org and only removed from the feature matrix below.
data_org = pd.get_dummies(data_org.assign(poblado=data_org['BARRIO']),
                          columns=['poblado'])

# ---- Target vector and feature matrix ----------------------------------------
Y = data_org['Accidente']
X = data_org.drop(columns=['TW', 'BARRIO', 'Accidente', 'summary'])

In [7]:
### Split the data set into training and validation subsets: 20% is held out
### for validation, stratified on Y, with a fixed seed.
split_opts = {'test_size': 0.2, 'stratify': Y, 'random_state': 42}
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, **split_opts)

# Análisis de la proporción del Random Undersampler

In [11]:
# Sweep over the desired share of the positive class (label 1) in the
# undersampled training set, train one random forest per share and keep the
# one with the best ROC-AUC on the validation split.
proporciones = np.arange(0.05, 0.55, 0.05)

# Class counts of the training split. They do not depend on `prop`, so they
# are computed once instead of on every iteration.
tra_0 = int(len(Y_train) - Y_train.sum())
tra_1 = int(Y_train.sum())

best_roc = 0
best_prop = 0
best_mod = None
roc_val = []
for prop in proporciones:

    print('***'*20)
    print(f'Procesando proporcion {round(prop,2)}')

    # Number of class-0 rows to keep so that class 1 makes up `prop` of the
    # resampled set. Algebraically this is tra_1 * (1 - prop) / prop; the
    # original expression is kept as is so the truncated count (and therefore
    # the sampled rows) stays exactly the same.
    prop_deseada_under = prop
    mul_updown = (tra_0 * prop_deseada_under - tra_1 * (1 - prop_deseada_under)) / (tra_0 * prop_deseada_under)
    # An under-sampler cannot return more rows than the class has: clamp the
    # target to the available class-0 rows instead of letting it raise.
    fac_1 = min(int(tra_0 * (1 - mul_updown)), tra_0)

    ratio_u = {0 : fac_1, 1 : tra_1}
    rus = RandomUnderSampler(sampling_strategy = ratio_u, random_state=42)
    # `fit_sample` was deprecated and later removed from imbalanced-learn;
    # `fit_resample` is the supported name for the same operation.
    X_train_set, y_train_set = rus.fit_resample(X_train, Y_train)

    ### Train the model

    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no
    # longer accepted by recent scikit-learn releases.
    # NOTE(review): warm_start=True has no effect here because every iteration
    # builds a fresh estimator and fits it once - confirm whether it is needed.
    clf = RandomForestClassifier(bootstrap=True,
                                 criterion='entropy',
                                 max_features='sqrt',
                                 n_estimators=500,
                                 random_state=42,
                                 warm_start=True)

    clf.fit(X_train_set,y_train_set)


    ### Metrics on the validation set

    preds = clf.predict_proba(X_val)
    ROC = roc_auc_score(Y_val,preds[:,1])

    roc_val.append(ROC)

    # Strict comparison: on ties the earliest (smallest) proportion is kept.
    if ROC > best_roc:
        best_roc = ROC
        best_prop = prop
        best_mod = clf

print(f'La mejor proporcion de undersampling es {best_prop}')
print(f'El mejor ROC-AUC en validation es {best_roc}')

************************************************************
Procesando proporcion 0.05
************************************************************
Procesando proporcion 0.1
************************************************************
Procesando proporcion 0.15
************************************************************
Procesando proporcion 0.2
************************************************************
Procesando proporcion 0.25
************************************************************
Procesando proporcion 0.3
************************************************************
Procesando proporcion 0.35
************************************************************
Procesando proporcion 0.4
************************************************************
Procesando proporcion 0.45
************************************************************
Procesando proporcion 0.5
La mejor proporcion de undersampling es 0.5
El mejor ROC-AUC en validation es 0.7457376633973676


In [17]:
# Summary of the undersampling sweep: the winning proportion and its ROC-AUC,
# plus every proportion tried and the validation ROC-AUC obtained for each.
res = {'best_prop':best_prop,
       'best_roc': best_roc,
       'props': list(proporciones),
       'roc_val':roc_val}

out_path = f'{base_path}/models/{version}/analisis_prop_res.json'

# Create the output folder if it is not there yet, so the write below cannot
# fail with FileNotFoundError after the long training loop has finished.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

with open(out_path,'w') as json_file:
    json.dump(res, json_file)