# Ejemplo de modelos utilizando 80/20 un año completo

In [2]:
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
df = pd.read_csv('export_TRAINSET_FILTERED_2018_20211016.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452117 entries, 0 to 1452116
Data columns (total 54 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   IDPACIENTE               1452117 non-null  int64  
 1   EDAD                     1452117 non-null  int64  
 2   SEXO                     1452117 non-null  object 
 3   PUEBLO_ORIGINARIO        785218 non-null   object 
 4   PREVISION                1449888 non-null  object 
 5   NACIONALIDAD             1452114 non-null  object 
 6   IDCITA                   1452117 non-null  object 
 7   TIPO_PROFESIONAL         1220199 non-null  object 
 8   PRESTACION               1452117 non-null  object 
 9   DATE_CITA                1452117 non-null  object 
 10  DATE_OF_WEEK             1452117 non-null  int64  
 11  DATE_OF_MONTH            1452117 non-null  int64  
 12  DAYOFYEAR                1452117 non-null  int64  
 13  DAYOFMONTH               1452117 non-null 

In [5]:
# Solo columnas númerica y el TARGET
df = df[['EDAD', 'NSP_30D', 'MINUTE_DAY', 'DATE_OF_WEEK', 
         'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'TARGET']]

# Nos quedamos con 10k ejemplo para poder probar más rápido
df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [5]:
# Probamos pero no tiene mejor resultado
#scaler = StandardScaler()
#df[df.columns[:-1]] = scaler.fit_transform(df[df.columns[:-1]])

In [6]:
df.sample(5)

Unnamed: 0,EDAD,NSP_30D,MINUTE_DAY,DATE_OF_WEEK,NSP_60D,NSP_90D,NSP_120D,NSP_365D,PREVIOUS_APP_30D,PREVIOUS_APP_60D,PREVIOUS_APP_90D,PREVIOUS_APP_120D,PREVIOUS_APP_365D,TARGET
1065778,74,-1.0,840,5,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0
180813,4,-1.0,695,5,0.0,0.0,0.0,0.0,0,1,1,1,8,0
79959,5,0.0,1200,5,0.166667,0.142857,0.125,0.125,4,6,7,8,8,0
1250552,13,0.0,570,2,0.0,0.0,0.0,0.0,1,2,2,4,4,1
1051480,46,0.0,645,5,0.0,0.0,0.0,0.0,7,18,19,19,19,0


In [7]:
# Separamos un 80-20
X_train, X_test, y_train, y_test = \
    train_test_split(df.drop(['TARGET'], axis='columns'),
                     df['TARGET'], 
                     test_size=0.2, 
                     random_state=42)

print(f"len(X_train): {len(X_train)} - len(X_test): {len(X_test)}")

len(X_train): 80000 - len(X_test): 20000


## Misc functions

In [22]:
def basic_metrics(y_pred, y):
    """
    """
    # Calculo ROC
    roc = roc_auc_score(y, y_pred[:, 1])
    
    # Cambio las probabilidades a 1/0
    predictions = [1 if x[1] >= 0.5 else 0 for x in y_pred]
    counter = Counter(predictions)
    perc_postivie = counter[1]/len(predictions)

    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    return f"ROC: {roc} - Precision: {precision} - % Positivos: {perc_postivie} - % f1score: {f1score}"

## Xgboost

In [24]:
import xgboost as xgb

### Todos los parámetros por defecto

In [10]:
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0, 
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=42, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=0)


'ROC: 0.6706812753139082 - Precision: 0.3815789473684211 - % Positivos: 0.0038 - % f1score: 0.021129326047358832'

In [11]:
classifier._Booster.get_score(importance_type='gain')

{'NSP_365D': 15.277752713903626,
 'MINUTE_DAY': 4.518742304902786,
 'EDAD': 3.9962387071068775,
 'DATE_OF_WEEK': 2.9589736774755546,
 'PREVIOUS_APP_120D': 3.7023144652,
 'NSP_30D': 3.5507099270418943,
 'NSP_60D': 3.386514626041885,
 'PREVIOUS_APP_60D': 3.641642121348938,
 'PREVIOUS_APP_365D': 6.296601197211078,
 'PREVIOUS_APP_30D': 3.194221166021302,
 'PREVIOUS_APP_90D': 3.147657079385964,
 'NSP_120D': 3.9218731543951684,
 'NSP_90D': 3.8896090429386514}

### Utilizando el peso de la clase dado el desbalance

In [12]:
# Utilizando el peso de la clase postivia
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0,
                               scale_pos_weight=df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(),
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=6.624275693809088, seed=42, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)


'ROC: 0.6655149135020965 - Precision: 0.20752200586823152 - % Positivos: 0.3749 - % f1score: 0.3060883249729517'

In [13]:
classifier._Booster.get_score(importance_type='gain')

{'NSP_365D': 53.9627955662045,
 'PREVIOUS_APP_365D': 29.363313520531005,
 'MINUTE_DAY': 19.423126082108226,
 'EDAD': 15.362211423522707,
 'DATE_OF_WEEK': 13.684064221485157,
 'PREVIOUS_APP_60D': 15.017289811324883,
 'NSP_30D': 13.287485224880186,
 'PREVIOUS_APP_120D': 14.335378116461527,
 'NSP_90D': 14.80039466185102,
 'NSP_60D': 12.196711582321859,
 'PREVIOUS_APP_90D': 12.097701567605617,
 'NSP_120D': 14.304550086290856,
 'PREVIOUS_APP_30D': 11.997551911428241}

### Probando Hyper-parámetros

In [14]:
# Probando hyper-parámetros
# https://xgboost.readthedocs.io/en/latest/parameter.html

distributions_xgb = {'max_depth': [int(x) for x in np.linspace(2, 20, 10)],
                    'min_child_weight': [int(x) for x in np.linspace(1, 10, 10)],
                    'gamma': [x for x in np.linspace(0, 0.5, 6)],
                    'eta': [x for x in np.linspace(0.1, 0.6, 6)],
                    'n_estimators': [int(x) for x in np.linspace(200, 2000, 10)],
                    'tree_method': ['auto', 'exact', 'approx', 'hist'],
                    'alpha': [x for x in np.linspace(0, 0.5, 10)],
                    'lambda':[x for x in np.linspace(0, 0.5, 10)],
                    'subsample': [x for x in np.linspace(0.4, 1, 7)],
                    'colsample_bytree': [x for x in np.linspace(0.6, 1, 5)],
                    'scale_pos_weight': [df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(), 1],
                    'eval_metric': ['logloss', 'auc', 'aucpr']
                    }

classifier = xgb.XGBClassifier(verbosity=0, seed=42)

grid_search_cv_clf_xgb = RandomizedSearchCV(classifier, 
                                            distributions_xgb, 
                                            n_iter=50, 
                                            scoring='f1', 
                                            n_jobs=-1,
                                            verbose=10,
                                            cv=3, 
                                            random_state=0)  

grid_search_cv_clf_xgb.fit(X_train, y_train)
print(grid_search_cv_clf_xgb.best_params_)

y_pred = grid_search_cv_clf_xgb.predict_proba(X_test)
basic_metrics(y_pred, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed: 81.1min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 81.6min finished


{'tree_method': 'auto', 'subsample': 1.0, 'scale_pos_weight': 6.624275693809088, 'n_estimators': 2000, 'min_child_weight': 10, 'max_depth': 2, 'lambda': 0.05555555555555555, 'gamma': 0.0, 'eval_metric': 'logloss', 'eta': 0.30000000000000004, 'colsample_bytree': 0.7, 'alpha': 0.0}


'ROC: 0.6690182311699351 - Precision: 0.20576233461110424 - % Positivos: 0.40435 - % f1score: 0.3094087021197471'

In [15]:
grid_search_cv_clf_xgb.best_estimator_._Booster.get_score(importance_type='gain')

{'NSP_365D': 33.472023401929846,
 'PREVIOUS_APP_365D': 8.079973574394778,
 'PREVIOUS_APP_120D': 7.251809213843507,
 'EDAD': 8.958400968626323,
 'MINUTE_DAY': 10.56304993550288,
 'NSP_30D': 7.781779807265562,
 'PREVIOUS_APP_60D': 5.7276777008601645,
 'NSP_60D': 5.823028758065934,
 'PREVIOUS_APP_30D': 5.023899639262432,
 'NSP_90D': 6.816893198585755,
 'NSP_120D': 6.970883682948812,
 'DATE_OF_WEEK': 7.008241409089823,
 'PREVIOUS_APP_90D': 3.327888087333334}

In [6]:
# Caso especial para la variable DIFF Agendamiento

In [16]:
df = pd.read_csv('export_TRAINSET_FILTERED_2018_20211016.csv')

In [17]:
# Solo columnas númerica y el TARGET
df = df[['EDAD', 'NSP_30D', 'MINUTE_DAY', 'DATE_OF_WEEK', 
         'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'DIFF_FECHA_CITA_AG',
         'TARGET']]

# Nos quedamos con 10k ejemplo para poder probar más rápido
df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [18]:
len(df[df['DIFF_FECHA_CITA_AG'] > 0])

4019

In [19]:
df['DIFF_FECHA_CITA_AG'] = df['DIFF_FECHA_CITA_AG'].apply(lambda x: 0 if x > 0 else x)

In [20]:
# Separamos un 80-20
X_train, X_test, y_train, y_test = \
    train_test_split(df.drop(['TARGET'], axis='columns'),
                     df['TARGET'], 
                     test_size=0.2, 
                     random_state=42)

print(f"len(X_train): {len(X_train)} - len(X_test): {len(X_test)}")

len(X_train): 80000 - len(X_test): 20000


In [25]:
# Utilizando el peso de la clase postivia
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0,
                               scale_pos_weight=df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(),
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=6.624275693809088, seed=42, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)


'ROC: 0.7191199802475068 - Precision: 0.22531969309462915 - % Positivos: 0.391 - % f1score: 0.3359710172561732'

In [27]:
classifier.get_booster().get_score(importance_type='gain')

{'DIFF_FECHA_CITA_AG': 75.85803413489998,
 'NSP_365D': 42.62306914775828,
 'PREVIOUS_APP_365D': 22.00686830756428,
 'EDAD': 15.96158606651231,
 'DATE_OF_WEEK': 11.776131957269227,
 'PREVIOUS_APP_120D': 15.332583479293014,
 'NSP_90D': 15.92213115928273,
 'NSP_30D': 12.51155123052712,
 'MINUTE_DAY': 13.588530218480884,
 'PREVIOUS_APP_90D': 12.703682773552634,
 'PREVIOUS_APP_60D': 12.637106353245093,
 'NSP_120D': 12.919015118260875,
 'PREVIOUS_APP_30D': 11.15941163689532,
 'NSP_60D': 14.088530551585356}

## Random forest

In [16]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
basic_metrics(y_pred, y_test)

'ROC: 0.6142182064641856 - Precision: 0.25834542815674894 - % Positivos: 0.03445 - % f1score: 0.10601548540798095'

In [17]:
clf = RandomForestClassifier(random_state=42, class_weight=Counter(df['TARGET']))
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
basic_metrics(y_pred, y_test)

'ROC: 0.6099344569087993 - Precision: 0.251896813353566 - % Positivos: 0.03295 - % f1score: 0.09975961538461539'

In [18]:
distributions_forest = {'n_estimators': [100, 300, 500, 800, 1200],
                        'max_depth': [5, 8, 15, 25, 30],
                        'min_samples_split': [2, 5, 10, 15, 100],
                        'min_samples_leaf': [1, 2, 5, 10] ,
                        }

clf = RandomForestClassifier(random_state=42, class_weight=Counter(df['TARGET']))

gridF = RandomizedSearchCV(clf, 
                           distributions_forest, 
                           cv = 3, 
                           scoring='f1',
                           n_iter=50,
                           verbose = 10, 
                           random_state=0,
                           n_jobs = -1)
                                            

bestF = gridF.fit(X_train, y_train)
print(bestF.best_params_)

y_pred = bestF.predict_proba(X_test)
basic_metrics(y_pred, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed: 12.6min remaining:   36.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 12.9min finished


{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}


'ROC: 0.6176191491956395 - Precision: 0.25811965811965815 - % Positivos: 0.02925 - % f1score: 0.09280885064535956'

## Agregando variables categorícas

In [19]:
df = pd.read_csv('export_TRAINSET_FILTERED_2018_20211016.csv')

In [20]:
# Solo columnas númerica y el TARGET
df = df[['EDAD', 'SEXO',
         'MINUTE_DAY', 'DATE_OF_WEEK', 
         'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD',
         'NSP_30D', 'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'TARGET']]

# Nos quedamos con 10k ejemplo para poder probar más rápido
df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [21]:
# https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
    df[cat] = df[cat].str.replace(' ', '_')
    df[cat] = df[cat].mask(df[cat].map(df[cat].value_counts(normalize=True)) < 0.01, 'Other')

In [22]:
# One hot encoder
for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
    df_dummies = pd.get_dummies(df[cat].str.upper(), prefix=cat[:3] + '_')
    df = pd.concat([df, df_dummies], axis=1)

In [23]:
df.sample(5)

Unnamed: 0,EDAD,SEXO,MINUTE_DAY,DATE_OF_WEEK,NACIONALIDAD,TIPO_PROFESIONAL,PRESTACION,ESPECIALIDAD,NSP_30D,NSP_60D,...,ESP__OTORRINOLARINGOLOGÍA,ESP__PERIODONCIA,ESP__PSIQUIATRÍA_ADULTO,ESP__PSIQUIATRÍA_INFANTIL,ESP__REHABILITACIÓN_PRÓTESIS_REMOVIBLE,ESP__REUMATOLOGÍA,ESP__TRAUMATOLOGÍA_Y_ORTOPEDIA_ADULTO,ESP__TRAUMATOLOGÍA_Y_ORTOPEDIA_PEDIÁTRICA,ESP__UNIDAD_DE_PATOLOGÍA_CERVICAL_,ESP__UROLOGÍA_ADULTO
683478,78,Mujer,680,5,Chile,Médico,Consulta_Integral_De_Especialidades_En_Medic_I...,Neurología_Adulto,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
1214723,29,Mujer,720,5,Chile,Médico,Other,,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1114896,91,Mujer,930,5,Chile,Enfermera_(o),"Consulta_O_Control_Por_Enfermera,_Matrona_O_Nu...",,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
589534,11,Hombre,645,6,Chile,Psicólogo_(a),Consulta_O_Control_Por_Psicologo_Clinico,Psiquiatría_Infantil,0.166667,0.125,...,0,0,0,1,0,0,0,0,0,0
1394536,87,Hombre,795,2,Chile,Médico,"Consulta_Integral_Especialidad_Urologia,_Orl,_...",Dermatología,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0


## Eliminamos las variables transformadas

In [24]:
# Separamos un 80-20
X_train, X_test, y_train, y_test = \
    train_test_split(df.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD'],
                             axis='columns'),
                     df['TARGET'], 
                     test_size=0.2, 
                     random_state=42)

print(f"len(X_train): {len(X_train)} - len(X_test): {len(X_test)}")

len(X_train): 80000 - len(X_test): 20000


### Todos los parámetros por defecto

In [25]:
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0, 
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=42, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=0)


'ROC: 0.7250153973158201 - Precision: 0.5317460317460317 - % Positivos: 0.0126 - % f1score: 0.09174940089010612'

In [26]:
classifier._Booster.get_score(importance_type='gain')

{'NSP_365D': 17.09082959547211,
 'PRE__ODONTOLOGIA_ESPECIALIDAD_PRIMERAS_CONSULTAS': 29.235071431,
 'PRE__OTHER': 10.985361336799999,
 'EDAD': 5.023758932756615,
 'TIP__ODONTÓLOGO/DENTISTA': 19.988267645833332,
 'ESP__REHABILITACIÓN_PRÓTESIS_REMOVIBLE': 14.206308927,
 'ESP__OTHER': 5.226194516864865,
 'MINUTE_DAY': 4.701476783680663,
 'ESP__ORTODONCIA': 5.579380864545454,
 'DATE_OF_WEEK': 3.2138874061030562,
 'PREVIOUS_APP_365D': 6.636684643784978,
 'ESP__HEMATOLOGÍA_ADULTO': 33.78441331995652,
 'PREVIOUS_APP_60D': 4.304209944871036,
 'NSP_90D': 4.800574286678833,
 'TIP__PSICÓLOGO_(A)': 12.6660531465,
 'TIP__TÉCNICO_PARAMÉDICO': 6.7857076354285715,
 'NSP_120D': 5.739297938664384,
 'NSP_60D': 3.9990282767575756,
 'ESP__GINECOLOGÍA_ADULTO': 6.5553170755,
 'NSP_30D': 5.093685061518518,
 'PRE__VENOSA_EN_ADULTOS': 6.682030754,
 'PRE__CONSULTA_O_CONTROL_POR_ENFERMERA,_MATRONA_O_NUTRICIONISTA': 8.431841537333334,
 'TIP__TERAPEUTA_OCUPACIONAL': 5.945416338205883,
 'ESP__MED._INTERNA': 13.35665

### Utilizando el peso de la clase dado el desbalance

In [27]:
# Utilizando el peso de la clase postivia
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0,
                               scale_pos_weight=df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(),
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=6.624275693809088, seed=42, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)


'ROC: 0.7212861651542177 - Precision: 0.240081979212414 - % Positivos: 0.34155 - % f1score: 0.3452631578947368'

In [28]:
classifier._Booster.get_score(importance_type='gain')

{'NSP_365D': 64.20198569706244,
 'PREVIOUS_APP_365D': 27.988003869157538,
 'PRE__OTHER': 51.42097788205882,
 'EDAD': 18.210367317621255,
 'PRE__ODONTOLOGIA_ESPECIALIDAD_PRIMERAS_CONSULTAS': 54.005853726153845,
 'TIP__ENFERMERA_(O)': 94.52199345938462,
 'ESP__ORTODONCIA': 11.2083595175,
 'ESP__HEMATOLOGÍA_ADULTO': 205.53984992571426,
 'ESP__REHABILITACIÓN_PRÓTESIS_REMOVIBLE': 65.56690746166667,
 'MINUTE_DAY': 18.360949590085415,
 'ESP__OTHER': 20.248443490731717,
 'ESP__GASTROENTEROLOGÍA_ADULTO': 15.586717104000002,
 'TIP__ODONTÓLOGO/DENTISTA': 117.79052605000004,
 'PREVIOUS_APP_120D': 16.39939546153109,
 'TIP__MÉDICO': 21.57253842757576,
 'PRE__ODONTOLOGIA_ESPECIALIDAD_CONSULTAS_REPETIDAS': 32.07140697142857,
 'ESP__MED._INTERNA': 79.0895107005263,
 'NSP_30D': 15.4247557262549,
 'TIP__PSICÓLOGO_(A)': 54.442822590526305,
 'PREVIOUS_APP_60D': 17.164459087772453,
 'TIP__TERAPEUTA_OCUPACIONAL': 19.173351751666672,
 'DATE_OF_WEEK': 11.540821960576762,
 'PREVIOUS_APP_30D': 14.018957322305726

### Probando hyper-parámetros

In [29]:
# Probando hyper-parámetros
# https://xgboost.readthedocs.io/en/latest/parameter.html

distributions_xgb = {'max_depth': [int(x) for x in np.linspace(2, 20, 10)],
                    'min_child_weight': [int(x) for x in np.linspace(1, 10, 10)],
                    'gamma': [x for x in np.linspace(0, 0.5, 6)],
                    'eta': [x for x in np.linspace(0.1, 0.6, 6)],
                    'n_estimators': [int(x) for x in np.linspace(200, 2000, 10)],
                    'tree_method': ['auto', 'exact', 'approx', 'hist'],
                    'alpha': [x for x in np.linspace(0, 0.5, 10)],
                    'lambda':[x for x in np.linspace(0, 0.5, 10)],
                    'subsample': [x for x in np.linspace(0.4, 1, 7)],
                    'colsample_bytree': [x for x in np.linspace(0.6, 1, 5)],
                    'scale_pos_weight': [df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(), 1],
                    'eval_metric': ['logloss', 'auc', 'aucpr']
                    }

classifier = xgb.XGBClassifier(verbosity=0, seed=42)

grid_search_cv_clf_xgb = RandomizedSearchCV(classifier, 
                                            distributions_xgb, 
                                            n_iter=50, 
                                            scoring='f1', 
                                            n_jobs=-1,
                                            verbose=10,
                                            cv=3, 
                                            random_state=0)  

grid_search_cv_clf_xgb.fit(X_train, y_train)
print(grid_search_cv_clf_xgb.best_params_)

y_pred = grid_search_cv_clf_xgb.predict_proba(X_test)
basic_metrics(y_pred, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 33.3min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 59.0min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 71.7min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 75.1min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 85.6min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 180.4min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 197.1min
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed: 219.1min remaining: 10.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 222.5min finished


{'tree_method': 'auto', 'subsample': 1.0, 'scale_pos_weight': 6.624275693809088, 'n_estimators': 2000, 'min_child_weight': 10, 'max_depth': 2, 'lambda': 0.05555555555555555, 'gamma': 0.0, 'eval_metric': 'logloss', 'eta': 0.30000000000000004, 'colsample_bytree': 0.7, 'alpha': 0.0}


'ROC: 0.7198216339134968 - Precision: 0.23097752504738694 - % Positivos: 0.3693 - % f1score: 0.33933366484336147'

In [30]:
grid_search_cv_clf_xgb.best_estimator_._Booster.get_score(importance_type='gain')

{'NSP_120D': 29.029190651153847,
 'PREVIOUS_APP_365D': 12.635491592610501,
 'NSP_365D': 32.68892364498757,
 'ESP__HEMATOLOGÍA_ADULTO': 197.8370626921818,
 'TIP__ENFERMERA_(O)': 40.80778966822369,
 'EDAD': 13.031184361287048,
 'TIP__ODONTÓLOGO/DENTISTA': 43.097186306117635,
 'PRE__ODONTOLOGIA_ESPECIALIDAD_PRIMERAS_CONSULTAS': 31.77905596384209,
 'PREVIOUS_APP_120D': 8.110553262997167,
 'MINUTE_DAY': 11.507389653135737,
 'PREVIOUS_APP_30D': 5.586187015917907,
 'ESP__REHABILITACIÓN_PRÓTESIS_REMOVIBLE': 28.139299159703686,
 'ESP__MED._INTERNA': 24.9926379413889,
 'TIP__PSICÓLOGO_(A)': 20.09392937110638,
 'ESP__ONCOLOGÍA_ADULTO': 17.687963074529407,
 'PRE__CURACION_SIMPLE_AMBULATORIA': 53.857104913,
 'TIP__CIRUJANO_DENTISTA': 36.239840494307686,
 'NSP_30D': 11.394344006142859,
 'NSP_90D': 9.776622472715738,
 'TIP__NUTRICIONISTA': 19.88525851548215,
 'PRE__OTHER': 18.765417711656244,
 'ESP__OTHER': 11.639766354545456,
 'PRE__ODONTOLOGIA_ESPECIALIDAD_CONSULTAS_REPETIDAS': 29.824631238823525,


In [31]:
## Random forest

In [32]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
basic_metrics(y_pred, y_test)

'ROC: 0.6985621072127928 - Precision: 0.412 - % Positivos: 0.025 - % f1score: 0.13000946670874095'

In [33]:
clf = RandomForestClassifier(random_state=42, class_weight=Counter(df['TARGET']))
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
basic_metrics(y_pred, y_test)

'ROC: 0.7018779698973372 - Precision: 0.40409683426443205 - % Positivos: 0.02685 - % f1score: 0.1353711790393013'

In [34]:
distributions_forest = {'n_estimators': [100, 300, 500, 800, 1200],
                        'max_depth': [5, 8, 15, 25, 30],
                        'min_samples_split': [2, 5, 10, 15, 100],
                        'min_samples_leaf': [1, 2, 5, 10] ,
                        }

clf = RandomForestClassifier(random_state=42, class_weight=Counter(df['TARGET']))

gridF = RandomizedSearchCV(clf, 
                           distributions_forest, 
                           cv = 3, 
                           scoring='f1',
                           n_iter=50,
                           verbose = 10, 
                           random_state=0,
                           n_jobs = -1)
                                            

bestF = gridF.fit(X_train, y_train)
print(bestF.best_params_)

y_pred = bestF.predict_proba(X_test)
basic_metrics(y_pred, y_test)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 143 out of 150 | elapsed: 21.5min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 22.3min finished


{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}


'ROC: 0.7130248396336778 - Precision: 0.508 - % Positivos: 0.0125 - % f1score: 0.0870161014045906'

## Prueba utilizando el año de la pandemia

In [54]:
df = pd.read_csv('export_TRAINSET_FILTERED_2020_20211016.csv')

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 897006 entries, 0 to 897005
Data columns (total 57 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   IDPACIENTE               897006 non-null  int64  
 1   EDAD                     896997 non-null  float64
 2   SEXO                     897006 non-null  object 
 3   PUEBLO_ORIGINARIO        694218 non-null  object 
 4   PREVISION                892882 non-null  object 
 5   NACIONALIDAD             896974 non-null  object 
 6   IDCITA                   897006 non-null  object 
 7   TIPO_PROFESIONAL         820510 non-null  object 
 8   PRESTACION               897006 non-null  object 
 9   DATE_CITA                897006 non-null  object 
 10  DATE_OF_WEEK             897006 non-null  int64  
 11  DATE_OF_MONTH            897006 non-null  int64  
 12  DAYOFYEAR                897006 non-null  int64  
 13  DAYOFMONTH               897006 non-null  int64  
 14  COMU

In [56]:
# Me quedo solo con los datos de atención en pandemia
df = df.loc[df['ATENCION_EN_PANDEMIA'] == 'Si']

In [57]:
# Solo columnas númerica y el TARGET
df = df[['EDAD', 'SEXO',
         'MINUTE_DAY', 'DATE_OF_WEEK', 
         'APP_DISTANCE', 'APP_CALL', 'APP_VIDEOCALL',
         'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD',
         'NSP_30D', 'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'TARGET']]

# Nos quedamos con 10k ejemplo para poder probar más rápido
#df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [58]:
# https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
    df[cat] = df[cat].str.replace(' ', '_')
    df[cat] = df[cat].mask(df[cat].map(df[cat].value_counts(normalize=True)) < 0.01, 'Other')

In [59]:
# One hot encoder
for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
    df_dummies = pd.get_dummies(df[cat].str.upper(), prefix=cat[:3] + '_')
    df = pd.concat([df, df_dummies], axis=1)

In [60]:
# Generamos el 80-20
X_train, X_test, y_train, y_test = \
    train_test_split(df.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD'],
                             axis='columns'),
                     df['TARGET'], 
                     test_size=0.2, 
                     random_state=42)

print(f"len(X_train): {len(X_train)} - len(X_test): {len(X_test)}")

len(X_train): 498352 - len(X_test): 124588


In [61]:
# Utilizando el peso de la clase postivia
classifier = xgb.XGBClassifier(eval_metric='logloss', 
                               verbosity=0,
                               scale_pos_weight=df['TARGET'].value_counts().max()/df['TARGET'].value_counts().min(),
                               seed=42)

parameters_used = classifier.fit(X_train, y_train)
print(parameters_used)

y_pred = classifier.predict_proba(X_test)
basic_metrics(y_pred, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=9.262264834766565, seed=42, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)


'ROC: 0.7480747663262045 - Precision: 0.173921871193048 - % Positivos: 0.39531897132950206 - % f1score: 0.27962394724815565'

# Creando 24 puntos de tiempo

In [87]:
import dateutil.relativedelta
import dateutil.parser as dparser

In [93]:
df_2018 = pd.read_csv('export_TRAINSET_FILTERED_2018_20211016.csv')
df_2019 = pd.read_csv('export_TRAINSET_FILTERED_2019_20211016.csv')
df_2020 = pd.read_csv('export_TRAINSET_FILTERED_2020_20211016.csv')

In [121]:
df = pd.concat([df_2018, df_2019, df_2020])

In [154]:
df = df[['EDAD', 'SEXO',
         'DATE_CITA', 'MINUTE_DAY', 'DATE_OF_WEEK', 
         'APP_DISTANCE', 'APP_CALL', 'APP_VIDEOCALL',
         'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD',
         'NSP_30D', 'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'TARGET']]

In [155]:
#Pareaseamos la fecha
df['DATE_CITA'] = pd.to_datetime(df['DATE_CITA'])

# Nos quedamos con 10k ejemplo para poder probar más rápido
#df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [124]:
lst_aux_data = []

for x in range(24):
    
    pit = dparser.parse('2019-01-01', fuzzy=True) +  dateutil.relativedelta.relativedelta(months=x)
    
    start_pred_data = pit
    end_pred_data = pit +  dateutil.relativedelta.relativedelta(months=1) -  dateutil.relativedelta.relativedelta(days=1)

    
    start_train_data = pit -  dateutil.relativedelta.relativedelta(months=12) 
    end_train_data = pit - dateutil.relativedelta.relativedelta(days=1)
    
    lst_aux_data.append({'pit':pit, 
                         'start_pred_data': start_pred_data,
                         'end_pred_data': end_pred_data,
                         'start_train_data': start_train_data,
                         'end_train_data': end_train_data})
    
df_aux_dt_data = pd.DataFrame(lst_aux_data)    

In [125]:
df_aux_dt_data.sample(5)

Unnamed: 0,pit,start_pred_data,end_pred_data,start_train_data,end_train_data
13,2020-02-01,2020-02-01,2020-02-29,2019-02-01,2020-01-31
14,2020-03-01,2020-03-01,2020-03-31,2019-03-01,2020-02-29
23,2020-12-01,2020-12-01,2020-12-31,2019-12-01,2020-11-30
10,2019-11-01,2019-11-01,2019-11-30,2018-11-01,2019-10-31
19,2020-08-01,2020-08-01,2020-08-31,2019-08-01,2020-07-31


In [137]:
list_df_pit_fi = []

for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():
    
    print('PIT: ', pit)
    
    df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data)]
    df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data)]

    print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')
    
    # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
    for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
        df_pit_train_data[cat] = df_pit_train_data[cat].\
                                    mask(df_pit_train_data[cat].\
                                    map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
        df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                    mask(df_pit_predict_data[cat].\
                                    map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        
    # One hot encoder
    for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

        df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)

    
    # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
    for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
        df_pit_predict_data[falta] = 0
        
    for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
        df_pit_train_data[falta] = 0
        
    # Utilizando el peso de la clase postivia
    weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()
    
    classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                   verbosity=0,
                                   scale_pos_weight=weight,
                                   seed=42)

    X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                      'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_train = df_pit_train_data['TARGET']
    
    X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                       'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_test = df_pit_predict_data['TARGET']
    
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict_proba(X_test)
    print(basic_metrics(y_pred, y_test))
    
    # Creo un dataframe con las features importance
    df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
    df_fi['PIT'] = pit

    list_df_pit_fi.append(df_fi)

    print()

PIT:  2019-01-01 00:00:00
[TRAIN] Start date: 2018-01-01 00:00:00 - End date: 2018-12-31 00:00:00 - N rows: 1452117
ROC: 0.703629665298877 - Precision: 0.21831133459040436 - % Positivos: 0.33818551117129314 - % f1score: 0.3203689469638739

PIT:  2019-02-01 00:00:00
[TRAIN] Start date: 2018-02-01 00:00:00 - End date: 2019-01-31 00:00:00 - N rows: 1467292
ROC: 0.7458506226326574 - Precision: 0.24012373708707005 - % Positivos: 0.35581855636789594 - % f1score: 0.3524976044661084

PIT:  2019-03-01 00:00:00
[TRAIN] Start date: 2018-03-01 00:00:00 - End date: 2019-02-28 00:00:00 - N rows: 1474018
ROC: 0.6199735285887523 - Precision: 0.17401680407084533 - % Positivos: 0.40201714253998205 - % f1score: 0.26528157791875884

PIT:  2019-04-01 00:00:00
[TRAIN] Start date: 2018-04-01 00:00:00 - End date: 2019-03-31 00:00:00 - N rows: 1480680
ROC: 0.7282223753002907 - Precision: 0.22245258857247394 - % Positivos: 0.3890813324475657 - % f1score: 0.3347966410205424

PIT:  2019-05-01 00:00:00
[TRAIN] Sta

In [141]:
df_fi = pd.concat(list_df_pit_fi)

In [144]:
df_fi.sample(5)

Unnamed: 0,NSP_365D,PRE__OTHER,EDAD,PRE__ODONTOLOGIA_ESPECIALIDAD_PRIMERAS_CONSULTAS,TIP__ENFERMERA_(O),ESP__ORTODONCIA,ESP__HEMATOLOGÍA_ADULTO,MINUTE_DAY,ESP__CARDIOLOGÍA_ADULTO,TIP__ODONTÓLOGO/DENTISTA,...,SEX__INTERSEX (INDETERMINADO),APP_DISTANCE,APP_VIDEOCALL,APP_CALL,ESP__INMUNOLOGÍA_ADULTO,PRE__CONSULTA_DE_PSIQUIATRIA_(A_DISTANCIA_POR_CONTINGENCIA_COVID-19),ESP__DIABETES,PRE__DESPACHO_DE_RECETAS_A_CRONICOS,TIP__PSIQUIATRÍA,PRE__CONSULTA_MEDICA_ABREVIADA
0,989.201386,113.545882,135.042898,599.800159,598.611095,132.899848,3127.267482,82.560135,244.257142,1773.457437,...,,,,,,,,,,
0,809.807938,171.709827,132.620323,784.247871,658.547411,89.679814,3213.533119,94.514865,394.56248,1286.936666,...,,,,,,,,,,
0,594.679347,132.628377,128.760637,,603.549004,86.165308,2399.66593,72.136728,163.812555,1724.802255,...,30.834919,78.335372,34.272102,63.091041,,,,,,
0,799.204282,347.113341,138.545032,799.308983,718.28561,96.652725,4169.919348,127.590085,454.800072,1614.447268,...,,,,,,,,,,
0,807.340394,76.228079,147.865124,541.328132,655.77023,125.468774,2484.55749,77.428055,240.633043,1879.400669,...,18.269051,43.786395,,,,,,,,


# Solo desde la pandemia

In [156]:
lst_aux_data = []

for x in range(9):
    
    pit = dparser.parse('2020-04-01', fuzzy=True) +  dateutil.relativedelta.relativedelta(months=x)
    
    start_pred_data = pit
    end_pred_data = pit +  dateutil.relativedelta.relativedelta(months=1) -  dateutil.relativedelta.relativedelta(days=1)

    
    start_train_data = dparser.parse('2020-03-01', fuzzy=True)
    end_train_data = dparser.parse('2020-03-01', fuzzy=True) + dateutil.relativedelta.relativedelta(months=x + 1) -  dateutil.relativedelta.relativedelta(days=1)
    
    lst_aux_data.append({'pit':pit, 
                         'start_pred_data': start_pred_data,
                         'end_pred_data': end_pred_data,
                         'start_train_data': start_train_data,
                         'end_train_data': end_train_data})
    
df_aux_dt_data = pd.DataFrame(lst_aux_data)    

In [157]:
df_aux_dt_data

Unnamed: 0,pit,start_pred_data,end_pred_data,start_train_data,end_train_data
0,2020-04-01,2020-04-01,2020-04-30,2020-03-01,2020-03-31
1,2020-05-01,2020-05-01,2020-05-31,2020-03-01,2020-04-30
2,2020-06-01,2020-06-01,2020-06-30,2020-03-01,2020-05-31
3,2020-07-01,2020-07-01,2020-07-31,2020-03-01,2020-06-30
4,2020-08-01,2020-08-01,2020-08-31,2020-03-01,2020-07-31
5,2020-09-01,2020-09-01,2020-09-30,2020-03-01,2020-08-31
6,2020-10-01,2020-10-01,2020-10-31,2020-03-01,2020-09-30
7,2020-11-01,2020-11-01,2020-11-30,2020-03-01,2020-10-31
8,2020-12-01,2020-12-01,2020-12-31,2020-03-01,2020-11-30


In [158]:
list_df_pit_fi = []

for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():
    
    print('PIT: ', pit)
    
    df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data)]
    df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data)]

    print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')
    
    # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
    for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
        df_pit_train_data[cat] = df_pit_train_data[cat].\
                                    mask(df_pit_train_data[cat].\
                                    map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
        df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                    mask(df_pit_predict_data[cat].\
                                    map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        
    # One hot encoder
    for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

        df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)

    
    # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
    for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
        df_pit_predict_data[falta] = 0
        
    for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
        df_pit_train_data[falta] = 0
        
    # Utilizando el peso de la clase postivia
    weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()
    
    classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                   verbosity=0,
                                   scale_pos_weight=weight,
                                   seed=42)

    X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                      'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_train = df_pit_train_data['TARGET']
    
    X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                       'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_test = df_pit_predict_data['TARGET']
    
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict_proba(X_test)
    print(basic_metrics(y_pred, y_test))
    
    # Creo un dataframe con las features importance
    df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
    df_fi['PIT'] = pit

    list_df_pit_fi.append(df_fi)

    print()

PIT:  2020-04-01 00:00:00
[TRAIN] Start date: 2020-03-01 00:00:00 - End date: 2020-03-31 00:00:00 - N rows: 100805
ROC: 0.6079071097202934 - Precision: 0.09149310251555315 - % Positivos: 0.3303768905967248 - % f1score: 0.153645241880536

PIT:  2020-05-01 00:00:00
[TRAIN] Start date: 2020-03-01 00:00:00 - End date: 2020-04-30 00:00:00 - N rows: 145566
ROC: 0.619079297631633 - Precision: 0.13517216184126404 - % Positivos: 0.2752941768078045 - % f1score: 0.20566981656475822

PIT:  2020-06-01 00:00:00
[TRAIN] Start date: 2020-03-01 00:00:00 - End date: 2020-05-31 00:00:00 - N rows: 185338
ROC: 0.5863652069014583 - Precision: 0.11681921070760752 - % Positivos: 0.2442313239111624 - % f1score: 0.17479016345162715

PIT:  2020-07-01 00:00:00
[TRAIN] Start date: 2020-03-01 00:00:00 - End date: 2020-06-30 00:00:00 - N rows: 226942
ROC: 0.645813920351078 - Precision: 0.1496592696138389 - % Positivos: 0.21422822811581727 - % f1score: 0.21799440061084246

PIT:  2020-08-01 00:00:00
[TRAIN] Start date

In [159]:
df_fi = pd.concat(list_df_pit_fi)

In [163]:
df_fi.sample()

Unnamed: 0,PREVIOUS_APP_365D,TIP__ODONTÓLOGO/DENTISTA,TIP__ENFERMERA_(O),TIP__CIRUJANO_DENTISTA,EDAD,ESP__OTHER,ESP__NEUROLOGÍA_ADULTO,PRE__ODONTOLOGIA_ESPECIALIDAD_CONSULTAS_REPETIDAS,MINUTE_DAY,ESP__GINECOLOGÍA_ADULTO,...,PRE__CONSULTA_MEDICA_ABREVIADA,TIP__PSIQUIATRÍA,APP_CALL,ESP__DIABETES,ESP__INMUNOLOGÍA_ADULTO,PRE__VENOSA_EN_ADULTOS,PRE__CONSULTA_DE_PSIQUIATRIA_(A_DISTANCIA_POR_CONTINGENCIA_COVID-19),PRE__CONSULTA_O_CONTROL_MÉDICO_(HOSP.TIPO_3)_(A_DISTANCIA_POR_CONTINGENCIA_COVID-19),PRE__CONSULTA_DE_SALUD_MENTAL_POR_OTROS_PROFESIONALES_(VÍA_LLAMADA_TELEFÓNICA_POR_CONTINGENCIA_COVID-19),"PRE__CONSULTA_INTEGRAL_DE_ESPECIALIDADES_EN_MEDIC_INTERNA_Y_SUBESP,_OFTALMO,_NEUROLO,_ONCOLOGIA_EN_CDT_(A_DISTANCIA_POR_CONTINGENCIA_COVID-19)"
0,26.46167,267.476734,112.094873,61.951623,18.718742,37.236471,79.158881,68.612841,18.120722,28.297498,...,,,,,,,,,,


### Modelos especificos (24 PIT)

In [199]:
## Especialidad

In [193]:
lst_aux_data = []

for x in range(24):
    
    pit = dparser.parse('2019-01-01', fuzzy=True) +  dateutil.relativedelta.relativedelta(months=x)
    
    start_pred_data = pit
    end_pred_data = pit +  dateutil.relativedelta.relativedelta(months=1) -  dateutil.relativedelta.relativedelta(days=1)

    
    start_train_data = pit -  dateutil.relativedelta.relativedelta(months=12) 
    end_train_data = pit - dateutil.relativedelta.relativedelta(days=1)
    
    lst_aux_data.append({'pit':pit, 
                         'start_pred_data': start_pred_data,
                         'end_pred_data': end_pred_data,
                         'start_train_data': start_train_data,
                         'end_train_data': end_train_data})
    
df_aux_dt_data = pd.DataFrame(lst_aux_data)    

In [194]:
df_type = pd.DataFrame(df.loc[df['ESPECIALIDAD'] != -1]['ESPECIALIDAD'].value_counts(normalize=True)).reset_index()
df_type = df_type.loc[df_type['ESPECIALIDAD'] >= 0.05]
df_type

Unnamed: 0,index,ESPECIALIDAD
0,Med. Interna,0.089182
1,Psiquiatría Adulto,0.08288
2,Hematología Adulto,0.056699


In [195]:
for tp in  df_type['index']:
    print(tp)

Med. Interna
Psiquiatría Adulto
Hematología Adulto


In [196]:
list_df_pit_fi = []

for tp in df_type['index']:
    
    print('-'*50)
    print('TP: ', tp)

    for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():

        print('PIT: ', pit)

        df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data) & (df['ESPECIALIDAD'] == tp)]
        df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data) & (df['ESPECIALIDAD'] == tp)]

        print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')

        # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
        for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
            df_pit_train_data[cat] = df_pit_train_data[cat].\
                                        mask(df_pit_train_data[cat].\
                                        map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')

            df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
            df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                        mask(df_pit_predict_data[cat].\
                                        map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')


        # One hot encoder
        for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

            df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)


        # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
        for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
            df_pit_predict_data[falta] = 0

        for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
            df_pit_train_data[falta] = 0

        # Utilizando el peso de la clase postivia
        weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()

        classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                       verbosity=0,
                                       scale_pos_weight=weight,
                                       seed=42)

        X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                          'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_train = df_pit_train_data['TARGET']

        X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                           'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_test = df_pit_predict_data['TARGET']

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict_proba(X_test)
        print(basic_metrics(y_pred, y_test))

        # Creo un dataframe con las features importance
        df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
        df_fi['PIT'] = pit
        df_fi['TP'] = tp

        list_df_pit_fi.append(df_fi)

        print()

--------------------------------------------------
TP:  Med. Interna
PIT:  2019-01-01 00:00:00
[TRAIN] Start date: 2018-01-01 00:00:00 - End date: 2018-12-31 00:00:00 - N rows: 76841
ROC: 0.6438792136547745 - Precision: 0.1707933299646286 - % Positivos: 0.22868037901548416 - % f1score: 0.24671532846715327

PIT:  2019-02-01 00:00:00
[TRAIN] Start date: 2018-02-01 00:00:00 - End date: 2019-01-31 00:00:00 - N rows: 78797
ROC: 0.5757680280129901 - Precision: 0.17452830188679244 - % Positivos: 0.22110203372153658 - % f1score: 0.22379032258064518

PIT:  2019-03-01 00:00:00
[TRAIN] Start date: 2018-03-01 00:00:00 - End date: 2019-02-28 00:00:00 - N rows: 78628
ROC: 0.7150294706157252 - Precision: 0.21390086206896552 - % Positivos: 0.285275130648632 - % f1score: 0.30882924931933103

PIT:  2019-04-01 00:00:00
[TRAIN] Start date: 2018-04-01 00:00:00 - End date: 2019-03-31 00:00:00 - N rows: 78703
ROC: 0.7133615431246337 - Precision: 0.20937363118703461 - % Positivos: 0.3417153120790301 - % f1sco

[TRAIN] Start date: 2018-11-01 00:00:00 - End date: 2019-10-31 00:00:00 - N rows: 75862
ROC: 0.6277513650248725 - Precision: 0.2656918687589158 - % Positivos: 0.4939228465738947 - % f1score: 0.3738083291520321

PIT:  2019-12-01 00:00:00
[TRAIN] Start date: 2018-12-01 00:00:00 - End date: 2019-11-30 00:00:00 - N rows: 75129
ROC: 0.61642478862186 - Precision: 0.25187865293626494 - % Positivos: 0.5461316309469524 - % f1score: 0.36743808363784003

PIT:  2020-01-01 00:00:00
[TRAIN] Start date: 2019-01-01 00:00:00 - End date: 2019-12-31 00:00:00 - N rows: 76223
ROC: 0.5569720236051247 - Precision: 0.2514124293785311 - % Positivos: 0.252821025567776 - % f1score: 0.28709677419354845

PIT:  2020-02-01 00:00:00
[TRAIN] Start date: 2019-02-01 00:00:00 - End date: 2020-01-31 00:00:00 - N rows: 77096
ROC: 0.5022374549437548 - Precision: 0.2094240837696335 - % Positivos: 0.034639100471527025 - % f1score: 0.06557377049180327

PIT:  2020-03-01 00:00:00
[TRAIN] Start date: 2019-03-01 00:00:00 - End dat

[TRAIN] Start date: 2019-10-01 00:00:00 - End date: 2020-09-30 00:00:00 - N rows: 45269
ROC: 0.6875759700794764 - Precision: 0.0 - % Positivos: 0.0033435497353023124 - % f1score: 0.0

PIT:  2020-11-01 00:00:00
[TRAIN] Start date: 2019-11-01 00:00:00 - End date: 2020-10-31 00:00:00 - N rows: 44044
ROC: 0.5946958684627063 - Precision: 0.125 - % Positivos: 0.002176278563656148 - % f1score: 0.047619047619047616

PIT:  2020-12-01 00:00:00
[TRAIN] Start date: 2019-12-01 00:00:00 - End date: 2020-11-30 00:00:00 - N rows: 43328
ROC: 0.8278996817468898 - Precision: 0.045454545454545456 - % Positivos: 0.0056453682319733125 - % f1score: 0.03571428571428571



In [197]:
df_fi = pd.concat(list_df_pit_fi)

In [198]:
df_fi.sample()

Unnamed: 0,NSP_365D,PREVIOUS_APP_90D,EDAD,MINUTE_DAY,"PRE__CONSULTA_INTEGRAL_DE_ESPECIALIDADES_EN_MEDIC_INTERNA_Y_SUBESP,_OFTALMO,_NEUROLO,_ONCOLOGIA_EN_CDT",NSP_30D,TIP__MÉDICO,PREVIOUS_APP_365D,TIP__KINESIÓLOGO_(A),TIP__ENFERMERA_(O),...,TIP__PSIQUIATRÍA_ADULTOS,PRE__CONSULTA_DE_SALUD_MENTAL_POR_OTROS_PROFESIONALES,PRE__CONSULTA_SM_POR__ENFERMERA,TIP__ASISTENTE_SOCIAL,PRE__INTERCONSULTA_DE_ENLACE_PSIQUIATRIA_,PRE__CONSULTA_DE_SALUD_MENTAL_POR_OTROS_PROFESIONALES_(VÍA_LLAMADA_TELEFÓNICA_POR_CONTINGENCIA_COVID-19),APP_VIDEOCALL,PRE__CONSULTA_O_CONTROL_POR_PSICOLOGO_CLINICO_(VÍA_LLAMADA_TELEFÓNICA_POR_CONTINGENCIA_COVID-19),PRE__CONSULTA_SM_POR_TRABAJADOR_SOCIAL_(VÍA_LLAMADA_TELEFÓNICA_POR_CONTINGENCIA_COVID-19),PRE__CONSULTA_SM_POR__TERAPEUTA_OCUPACIONAL_(VÍA_LLAMADA_TELEFÓNICA_POR_CONTINGENCIA_COVID-19)
0,19.422307,12.417398,16.014704,15.371288,,23.053983,14.32934,17.478815,14.141326,28.478526,...,11.800848,20.126839,11.309922,20.089227,36.118449,9.037653,5.58945,10.991445,,


In [201]:
df_fi.to_csv('tipo_especialidad.csv', index=False)

## Tipo de profesional

In [202]:
df_type = pd.DataFrame(df.loc[df['TIPO_PROFESIONAL'] != -1]['TIPO_PROFESIONAL'].value_counts(normalize=True)).reset_index()
df_type = df_type.loc[df_type['TIPO_PROFESIONAL'] >= 0.05]
df_type

Unnamed: 0,index,TIPO_PROFESIONAL
0,Médico,0.557169
1,Enfermera (o),0.099498
2,Kinesiólogo (a),0.057399


In [203]:
list_df_pit_fi = []

for tp in df_type['index']:
    
    print('-'*50)
    print('TP: ', tp)

    for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():

        print('PIT: ', pit)

        df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data) & (df['TIPO_PROFESIONAL'] == tp)]
        df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data) & (df['TIPO_PROFESIONAL'] == tp)]

        print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')

        # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
        for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
            df_pit_train_data[cat] = df_pit_train_data[cat].\
                                        mask(df_pit_train_data[cat].\
                                        map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')

            df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
            df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                        mask(df_pit_predict_data[cat].\
                                        map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')


        # One hot encoder
        for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

            df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)


        # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
        for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
            df_pit_predict_data[falta] = 0

        for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
            df_pit_train_data[falta] = 0

        # Utilizando el peso de la clase postivia
        weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()

        classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                       verbosity=0,
                                       scale_pos_weight=weight,
                                       seed=42)

        X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                          'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_train = df_pit_train_data['TARGET']

        X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                           'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_test = df_pit_predict_data['TARGET']

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict_proba(X_test)
        print(basic_metrics(y_pred, y_test))

        # Creo un dataframe con las features importance
        df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
        df_fi['PIT'] = pit
        df_fi['TP'] = tp

        list_df_pit_fi.append(df_fi)

        print()

--------------------------------------------------
TP:  Médico
PIT:  2019-01-01 00:00:00
[TRAIN] Start date: 2018-01-01 00:00:00 - End date: 2018-12-31 00:00:00 - N rows: 653011
ROC: 0.6795057749366786 - Precision: 0.19741885400999468 - % Positivos: 0.34023903067120076 - % f1score: 0.2921138590760616

PIT:  2019-02-01 00:00:00
[TRAIN] Start date: 2018-02-01 00:00:00 - End date: 2019-01-31 00:00:00 - N rows: 662775
ROC: 0.6504589743468135 - Precision: 0.21658245291685807 - % Positivos: 0.29064773814556866 - % f1score: 0.29963459196102316

PIT:  2019-03-01 00:00:00
[TRAIN] Start date: 2018-03-01 00:00:00 - End date: 2019-02-28 00:00:00 - N rows: 665146
ROC: 0.694052217686576 - Precision: 0.21047040075547424 - % Positivos: 0.29165303909248963 - % f1score: 0.3001809840481502

PIT:  2019-04-01 00:00:00
[TRAIN] Start date: 2018-04-01 00:00:00 - End date: 2019-03-31 00:00:00 - N rows: 667508
ROC: 0.6353340246804979 - Precision: 0.1668195578387304 - % Positivos: 0.3652415734101722 - % f1score:

[TRAIN] Start date: 2018-11-01 00:00:00 - End date: 2019-10-31 00:00:00 - N rows: 115199
ROC: 0.7038795185307695 - Precision: 0.11372798642053894 - % Positivos: 0.47257595507871253 - % f1score: 0.19658903355950852

PIT:  2019-12-01 00:00:00
[TRAIN] Start date: 2018-12-01 00:00:00 - End date: 2019-11-30 00:00:00 - N rows: 116087
ROC: 0.5729163797055878 - Precision: 0.08903757076685538 - % Positivos: 0.18116550116550117 - % f1score: 0.12518089725036177

PIT:  2020-01-01 00:00:00
[TRAIN] Start date: 2019-01-01 00:00:00 - End date: 2019-12-31 00:00:00 - N rows: 118123
ROC: 0.7321679273556446 - Precision: 0.13620414673046252 - % Positivos: 0.25267993874425726 - % f1score: 0.21713704551233154

PIT:  2020-02-01 00:00:00
[TRAIN] Start date: 2019-02-01 00:00:00 - End date: 2020-01-31 00:00:00 - N rows: 120684
ROC: 0.5465543990390985 - Precision: 0.05341614906832298 - % Positivos: 0.1454775458570525 - % f1score: 0.07682000893255918

PIT:  2020-03-01 00:00:00
[TRAIN] Start date: 2019-03-01 00:00:

ROC: 0.6280832135331533 - Precision: 0.2375 - % Positivos: 0.16137165910237014 - % f1score: 0.2753623188405797

PIT:  2020-10-01 00:00:00
[TRAIN] Start date: 2019-10-01 00:00:00 - End date: 2020-09-30 00:00:00 - N rows: 41305
ROC: 0.6063105266762551 - Precision: 0.24607329842931938 - % Positivos: 0.14475179992421372 - % f1score: 0.27011494252873564

PIT:  2020-11-01 00:00:00
[TRAIN] Start date: 2019-11-01 00:00:00 - End date: 2020-10-31 00:00:00 - N rows: 36730
ROC: 0.5905315659572625 - Precision: 0.17857142857142858 - % Positivos: 0.28672 - % f1score: 0.245398773006135

PIT:  2020-12-01 00:00:00
[TRAIN] Start date: 2019-12-01 00:00:00 - End date: 2020-11-30 00:00:00 - N rows: 33105
ROC: 0.654940146884739 - Precision: 0.2038765254845657 - % Positivos: 0.503433321286592 - % f1score: 0.31260319207484866



In [204]:
df_fi = pd.concat(list_df_pit_fi)

In [205]:
df_fi.sample()

Unnamed: 0,ESP__HEMATOLOGÍA_ADULTO,NSP_365D,PREVIOUS_APP_365D,EDAD,ESP__OBSTETRICIA,PRE__OTHER,ESP__CARDIOLOGÍA_ADULTO,ESP__REUMATOLOGÍA,ESP__INMUNOLOGÍA_ADULTO,MINUTE_DAY,...,"PRE__CITODIAGNOSTICO_CORRIENTE,_EXFOLIATIVA_(PAPANICOLAU_Y_SIMILARES)_(POR_CADA_ÓRGANO)",ESP__CIRUGÍA_ABDOMINAL,PRE__CONSULTA_SM_POR__ENFERMERA,PRE__REACCION_CUTANEA_16_ALERGENOS_POR_ESCARIFICACION_(INCLUYE_EL_VALOR_DE_LOS_ANTIGENOS),PRE__VENOSA_EN_ADULTOS,ESP__SALUD_MENTAL,PRE__CONSULTAS_Y_CONTROLES_POR_OTROS_PROFESIONALES_EN_ESPECIALIDAD_(NIVEL_SECUNDARIO)_-_PROFESIONAL_-_KINESIÓLOGO,PRE__ATENCIÓN_KINESIOLÓGICA_INTEGRAL_AMBULATORIA,"PRE__INTERVENCION_PSICOSOCIAL_GRUPAL_(4_A_8_PACIENTES,_FAMILIARES_O_CUIDADORES)",NAC__VENEZUELA
0,,28.912444,14.379688,43.808694,,16.137822,,,,17.614209,...,,,,,,,21.794667,14.276705,18.702757,


In [206]:
df_fi.to_csv('tipo_profesional.csv', index=False)

## Prestación

In [208]:
df_type = pd.DataFrame(df.loc[df['PRESTACION'] != -1]['PRESTACION'].value_counts(normalize=True)).reset_index()
df_type = df_type.loc[df_type['PRESTACION'] >= 0.05]
df_type

Unnamed: 0,index,PRESTACION
0,Consulta Integral De Especialidades En Medic I...,0.161245
1,"Consulta O Control Por Enfermera, Matrona O Nu...",0.120949
2,Consulta Medica Integral En C.R.S.,0.10805
3,Consulta Integral De Especialidades En Cirugia...,0.060034


In [209]:
list_df_pit_fi = []

for tp in df_type['index']:
    
    print('-'*50)
    print('TP: ', tp)

    for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():

        print('PIT: ', pit)

        df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data) & (df['PRESTACION'] == tp)]
        df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data) & (df['PRESTACION'] == tp)]

        print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')

        # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
        for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
            df_pit_train_data[cat] = df_pit_train_data[cat].\
                                        mask(df_pit_train_data[cat].\
                                        map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')

            df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
            df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                        mask(df_pit_predict_data[cat].\
                                        map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')


        # One hot encoder
        for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
            df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

            df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
            df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)


        # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
        for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
            df_pit_predict_data[falta] = 0

        for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
            df_pit_train_data[falta] = 0

        # Utilizando el peso de la clase postivia
        weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()

        classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                       verbosity=0,
                                       scale_pos_weight=weight,
                                       seed=42)

        X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                          'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_train = df_pit_train_data['TARGET']

        X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                           'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
        y_test = df_pit_predict_data['TARGET']

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict_proba(X_test)
        print(basic_metrics(y_pred, y_test))

        # Creo un dataframe con las features importance
        df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
        df_fi['PIT'] = pit
        df_fi['TP'] = tp

        list_df_pit_fi.append(df_fi)

        print()

--------------------------------------------------
TP:  Consulta Integral De Especialidades En Medic Interna Y Subesp, Oftalmo, Neurolo, Oncologia En Cdt
PIT:  2019-01-01 00:00:00
[TRAIN] Start date: 2018-01-01 00:00:00 - End date: 2018-12-31 00:00:00 - N rows: 228770
ROC: 0.6286584339943468 - Precision: 0.14182257091128544 - % Positivos: 0.07727823897024531 - % f1score: 0.1380323054331865

PIT:  2019-02-01 00:00:00
[TRAIN] Start date: 2018-02-01 00:00:00 - End date: 2019-01-31 00:00:00 - N rows: 232690
ROC: 0.5870729464529956 - Precision: 0.17293233082706766 - % Positivos: 0.05744785881772183 - % f1score: 0.13993915688830944

PIT:  2019-03-01 00:00:00
[TRAIN] Start date: 2018-03-01 00:00:00 - End date: 2019-02-28 00:00:00 - N rows: 232745
ROC: 0.5310936512677263 - Precision: 0.18099547511312217 - % Positivos: 0.033392092671871064 - % f1score: 0.10864644635581709

PIT:  2019-04-01 00:00:00
[TRAIN] Start date: 2018-04-01 00:00:00 - End date: 2019-03-31 00:00:00 - N rows: 232283
ROC: 0.5

[TRAIN] Start date: 2018-10-01 00:00:00 - End date: 2019-09-30 00:00:00 - N rows: 170926
ROC: 0.8032004284875299 - Precision: 0.24145633236542327 - % Positivos: 0.30026827632461434 - % f1score: 0.357473544973545

PIT:  2019-11-01 00:00:00
[TRAIN] Start date: 2018-11-01 00:00:00 - End date: 2019-10-31 00:00:00 - N rows: 169672
ROC: 0.6129657244479791 - Precision: 0.14566719829877725 - % Positivos: 0.2624712202609363 - % f1score: 0.21232080588919025

PIT:  2019-12-01 00:00:00
[TRAIN] Start date: 2018-12-01 00:00:00 - End date: 2019-11-30 00:00:00 - N rows: 169742
ROC: 0.5646427471335511 - Precision: 0.10398104265402844 - % Positivos: 0.7160795493110704 - % f1score: 0.18361369152230314

PIT:  2020-01-01 00:00:00
[TRAIN] Start date: 2019-01-01 00:00:00 - End date: 2019-12-31 00:00:00 - N rows: 171586
ROC: 0.7376051919524687 - Precision: 0.20107763899093803 - % Positivos: 0.23266282979087127 - % f1score: 0.2947935368043088

PIT:  2020-02-01 00:00:00
[TRAIN] Start date: 2019-02-01 00:00:00 -

[TRAIN] Start date: 2019-08-01 00:00:00 - End date: 2020-07-31 00:00:00 - N rows: 127208
ROC: 0.6112129709696823 - Precision: 0.19682726204465334 - % Positivos: 0.20827214880078315 - % f1score: 0.2489780750650316

PIT:  2020-09-01 00:00:00
[TRAIN] Start date: 2019-09-01 00:00:00 - End date: 2020-08-31 00:00:00 - N rows: 121883
ROC: 0.63551570531002 - Precision: 0.18549406115744252 - % Positivos: 0.3980484860678 - % f1score: 0.27993897787948135

PIT:  2020-10-01 00:00:00
[TRAIN] Start date: 2019-10-01 00:00:00 - End date: 2020-09-30 00:00:00 - N rows: 121212
ROC: 0.7291434249278195 - Precision: 0.2214828897338403 - % Positivos: 0.3940074906367041 - % f1score: 0.334529791816224

PIT:  2020-11-01 00:00:00
[TRAIN] Start date: 2019-11-01 00:00:00 - End date: 2020-10-31 00:00:00 - N rows: 119109
ROC: 0.5790243110599723 - Precision: 0.16638434029738378 - % Positivos: 0.46785840084536806 - % f1score: 0.25851732709460445

PIT:  2020-12-01 00:00:00
[TRAIN] Start date: 2019-12-01 00:00:00 - End d

In [210]:
df_fi = pd.concat(list_df_pit_fi)

In [211]:
df_fi.sample()

Unnamed: 0,ESP__HEMATOLOGÍA_ADULTO,NSP_365D,PREVIOUS_APP_365D,ESP__CARDIOLOGÍA_ADULTO,ESP__ENFERMEDAD_RESPIRATORIA_DE_ADULTO_(BRONCOPULMONAR),EDAD,MINUTE_DAY,TIP__MÉDICO_CIRUJANO,ESP__ONCOLOGÍA_ADULTO,ESP__INMUNOLOGÍA_ADULTO,...,ESP__ANESTESIOLOGÍA,ESP__CIRUGÍA_VASCULAR_PERIFÉRICA,ESP__UNIDAD_DE_PATOLOGÍA_CERVICAL_,ESP__UNIDAD_DE_PATOLOGÍA_MAMARIA,ESP__CIRUGÍA_ABDOMINAL_ADULTO_,ESP__CIRUGÍA_MÁXILO_FACIAL,ESP__ANESTESIOLOGÍA_ADULTO,ESP__COLOPROCTOLOGÍA,ESP__CIRUGÍA_PLÁSTICA_Y_REPARADORA,ESP__CIRUGÍA_TÓRAX_
0,2818.21984,106.52081,54.060921,89.190901,135.405992,30.296699,53.841548,42.519914,120.956796,108.452364,...,,,,,,,,,,


In [2]:
### Versus una heuristica (TOP NSP365)

In [3]:
import dateutil.relativedelta
import dateutil.parser as dparser

In [7]:
df_2018 = pd.read_csv('export_TRAINSET_FILTERED_2018_20211016.csv')
df_2019 = pd.read_csv('export_TRAINSET_FILTERED_2019_20211016.csv')
df_2020 = pd.read_csv('export_TRAINSET_FILTERED_2020_20211016.csv')

In [8]:
df = pd.concat([df_2018, df_2019, df_2020])

In [9]:
df = df[['EDAD', 'SEXO',
         'DATE_CITA', 'MINUTE_DAY', 'DATE_OF_WEEK', 
         'APP_DISTANCE', 'APP_CALL', 'APP_VIDEOCALL',
         'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD',
         'NSP_30D', 'NSP_60D', 'NSP_90D', 'NSP_120D', 'NSP_365D', 
         'PREVIOUS_APP_30D', 'PREVIOUS_APP_60D', 'PREVIOUS_APP_90D', 'PREVIOUS_APP_120D', 'PREVIOUS_APP_365D', 
         'TARGET']]


In [10]:
#Pareaseamos la fecha
df['DATE_CITA'] = pd.to_datetime(df['DATE_CITA'])

# Nos quedamos con 10k ejemplo para poder probar más rápido
#df = df.sample(100000, random_state=45)

# Imputamos -1 a los pacientes que no tienen NSP pasado
df = df.fillna(-1)

In [11]:
lst_aux_data = []

for x in range(24):
    
    pit = dparser.parse('2019-01-01', fuzzy=True) +  dateutil.relativedelta.relativedelta(months=x)
    
    start_pred_data = pit
    end_pred_data = pit +  dateutil.relativedelta.relativedelta(months=1) -  dateutil.relativedelta.relativedelta(days=1)

    
    start_train_data = pit -  dateutil.relativedelta.relativedelta(months=12) 
    end_train_data = pit - dateutil.relativedelta.relativedelta(days=1)
    
    lst_aux_data.append({'pit':pit, 
                         'start_pred_data': start_pred_data,
                         'end_pred_data': end_pred_data,
                         'start_train_data': start_train_data,
                         'end_train_data': end_train_data})
    
df_aux_dt_data = pd.DataFrame(lst_aux_data)    

In [22]:
df_aux_dt_data.sample(5)

Unnamed: 0,pit,start_pred_data,end_pred_data,start_train_data,end_train_data
0,2019-01-01,2019-01-01,2019-01-31,2018-01-01,2018-12-31
3,2019-04-01,2019-04-01,2019-04-30,2018-04-01,2019-03-31
16,2020-05-01,2020-05-01,2020-05-31,2019-05-01,2020-04-30
13,2020-02-01,2020-02-01,2020-02-29,2019-02-01,2020-01-31
2,2019-03-01,2019-03-01,2019-03-31,2018-03-01,2019-02-28


In [133]:
list_df_pit_fi = []

for _, pit, start_pred_data, end_pred_data, start_train_data, end_train_data in df_aux_dt_data.itertuples():
    
    print('PIT: ', pit)
    
    df_pit_train_data = df[(df['DATE_CITA'] >= start_train_data) & (df['DATE_CITA'] <= end_train_data)]
    df_pit_predict_data = df[(df['DATE_CITA'] >= start_pred_data) & (df['DATE_CITA'] <= end_pred_data)]

    print(f'[TRAIN] Start date: {start_train_data} - End date: {end_train_data} - N rows: {len(df_pit_train_data)}')
    
    # https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0
    for cat in ['NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_pit_train_data[cat] = df_pit_train_data[cat].str.replace(' ', '_')
        df_pit_train_data[cat] = df_pit_train_data[cat].\
                                    mask(df_pit_train_data[cat].\
                                    map(df_pit_train_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        df_pit_predict_data[cat] = df_pit_predict_data[cat].str.replace(' ', '_')
        df_pit_predict_data[cat] = df_pit_predict_data[cat].\
                                    mask(df_pit_predict_data[cat].\
                                    map(df_pit_predict_data[cat].value_counts(normalize=True)) < 0.01, 'Other')
        
        
    # One hot encoder
    for cat in ['SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL', 'PRESTACION', 'ESPECIALIDAD']:
        df_dummies = pd.get_dummies(df_pit_train_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_train_data = pd.concat([df_pit_train_data, df_dummies], axis=1)

        df_dummies = pd.get_dummies(df_pit_predict_data[cat].str.upper(), prefix=cat[:3] + '_')
        df_pit_predict_data = pd.concat([df_pit_predict_data, df_dummies], axis=1)

    
    # Agregamos las columnas como 0 para que el predict y el train tengan las mismas columnas
    for falta in set(df_pit_train_data.columns) - set(df_pit_predict_data.columns):
        df_pit_predict_data[falta] = 0
        
    for falta in set(df_pit_predict_data.columns) - set(df_pit_train_data.columns):
        df_pit_train_data[falta] = 0
        
    # Utilizando el peso de la clase postivia
    weight = df_pit_train_data['TARGET'].value_counts().max()/df_pit_train_data['TARGET'].value_counts().min()
    
    classifier = xgb.XGBClassifier(eval_metric='logloss', 
                                   verbosity=0,
                                   scale_pos_weight=weight,
                                   seed=42)

    X_train = df_pit_train_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                      'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_train = df_pit_train_data['TARGET']
    
    X_test = df_pit_predict_data.drop(['TARGET', 'SEXO', 'NACIONALIDAD', 'TIPO_PROFESIONAL',
                                       'PRESTACION', 'ESPECIALIDAD', 'DATE_CITA'], axis='columns')
    y_test = df_pit_predict_data['TARGET']
    
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict_proba(X_test)
    print(basic_metrics(y_pred, y_test))
    
    # Creo un dataframe con las features importance
    df_fi = pd.DataFrame([classifier._Booster.get_score(importance_type='gain')])
    df_fi['PIT'] = pit

    list_df_pit_fi.append(df_fi)

    print()
    break

PIT:  2019-01-01 00:00:00
[TRAIN] Start date: 2018-01-01 00:00:00 - End date: 2018-12-31 00:00:00 - N rows: 1452117
ROC: 0.6921464843257872 - Precision: 0.20799326337608498 - % Positivos: 0.36461400387334475 - % f1score: 0.3112338858195211



In [135]:
for prob in [round(int(x)*0.1, 1) for x in np.linspace(1, 9, 9)]:
    
    print(prob)
    predictions = [1 if x[1] >= prob else 0 for x in y_pred]
    
    n_top = sum(predictions)
    prec_modelo = precision_score(y_test, predictions)
    rec_modelo = recall_score(y_test, predictions)
    f1_modelo = f1_score(y_test, predictions)
    

    df_team = df_pit_predict_data[['TARGET', 'NSP_365D']].sort_values('NSP_365D', ascending=False).reset_index(drop=True).reset_index()
    df_team['PRED'] = df_team['index'].apply(lambda x: 1 if x <= n_top else 0)
    
    prec_heur = precision_score(df_team['TARGET'], df_team['PRED'])
    rec_heur = recall_score(df_team['TARGET'], df_team['PRED'])
    f1_heur = f1_score(df_team['TARGET'], df_team['PRED'])
    
    print('N Seleccionado: ', n_top)
    print(f'- Precisión M: {round(prec_modelo, 3)} - Precisión H: {round(prec_heur, 3)}')
    print(f'- Recall M: {round(rec_modelo, 3)} - Recall H: {round(rec_heur, 3)}')
    print(f'- F1 M: {round(f1_modelo, 3)} - F1 H: {round(f1_heur, 3)} - Lift: {round(f1_modelo/f1_heur - 1, 3)}')
    print()

    

0.1
N Seleccionado:  121472
- Precisión M: 0.126 - Precisión H: 0.118
- Recall M: 0.981 - Recall H: 0.918
- F1 M: 0.223 - F1 H: 0.209 - Lift: 0.068

0.2
N Seleccionado:  109833
- Precisión M: 0.135 - Precisión H: 0.116
- Recall M: 0.953 - Recall H: 0.816
- F1 M: 0.237 - F1 H: 0.203 - Lift: 0.168

0.3
N Seleccionado:  90917
- Precisión M: 0.151 - Precisión H: 0.125
- Recall M: 0.882 - Recall H: 0.731
- F1 M: 0.258 - F1 H: 0.214 - Lift: 0.206

0.4
N Seleccionado:  68454
- Precisión M: 0.176 - Precisión H: 0.142
- Recall M: 0.774 - Recall H: 0.623
- F1 M: 0.287 - F1 H: 0.231 - Lift: 0.242

0.5
N Seleccionado:  46314
- Precisión M: 0.208 - Precisión H: 0.174
- Recall M: 0.618 - Recall H: 0.516
- F1 M: 0.311 - F1 H: 0.26 - Lift: 0.197

0.6
N Seleccionado:  27021
- Precisión M: 0.247 - Precisión H: 0.214
- Recall M: 0.428 - Recall H: 0.371
- F1 M: 0.313 - F1 H: 0.271 - Lift: 0.154

0.7
N Seleccionado:  11790
- Precisión M: 0.295 - Precisión H: 0.265
- Recall M: 0.223 - Recall H: 0.2
- F1 M: 