# Importamos Librerías

In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import chi2_contingency
import UTILS_LT as LT
import time

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import matplotlib.font_manager
from matplotlib import style
import seaborn as sns

%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# Preprocesado y modelado
# ==============================================================================
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn import metrics

import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV,train_test_split, ParameterGrid,RepeatedKFold

# Warning configuration
# ==============================================================================
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including any emitted by the model fits further down the notebook.
warnings.filterwarnings('ignore')


# Display every column, but at most 10 rows, when a DataFrame is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)


print('Importaciones finalizadas!')



Importaciones finalizadas!


# Lectura de Datos

In [4]:
# Read the workbook into `data` (pd.read_excel loads the first sheet by default).
# The ExcelFile handle is opened in a context manager so the underlying file is
# closed as soon as the sheet has been parsed instead of staying open.
with pd.ExcelFile('PRESEEA_ACT.xlsx') as tmp:
    data = pd.read_excel(tmp)

## Preprocesado Inicial

In [30]:
# Keep only the observations that belong to context 3
es_contexto_3 = data['CONTEXTOS OBLIGATORIOS'] == 3
data = data.loc[es_contexto_3].reset_index(drop=True)

# Discard the leftover columns whose name contains 'Unnamed'
data = data.drop(columns=[nombre for nombre in data.columns if 'Unnamed' in nombre])

# Discard other columns that could mislead the analysis
data = data.drop(
    columns=[
        'INFORMANTE',
        'Minuto',
        'CONTEXTO ',
        'Forma verbal analizada',
        'Infinitivo',
        'CONTEXTOS OBLIGATORIOS',
    ]
)

# Discard the social variables and any other column from position 25 onwards
data = data.drop(columns=data.columns[25:])

# Discard the ADESSE subtype to avoid collinearity
data = data.drop(columns='Subtipo ADESSE')

# Store every column as int8 to reduce memory usage
data = data.astype('int8')

# Flip the coding of the target so that 1 means presence
data['Presencia'] = data['Presencia'].map({0: 1, 1: 0})

In [36]:
# data.columns = [x.upper().strip().replace(' ','_') for x in data.columns]

['PRESENCIA',
 'PERSONA_DEL_SUJETO',
 'REFERENTE_DE_SUJETO',
 'AMBIGÜEDAD_FONÉTICA_POTENCIAL',
 'ESPECIFICIDAD',
 'MODO',
 'TIEMPO_VERBAL',
 'PROGRESIVIDAD',
 'PERFECTIVIDAD',
 'AMBIGÜEDAD_MORFOLOGICA',
 'CLASE_SEMANTICA_DEL_VB',
 'ADESSE',
 'TIPO_DE_CLAUSULA',
 'CORREFERENCIALIDAD',
 'DISTANCIA_DE_REFERENTE',
 'TURNO_DE_HABLA',
 'PERÍFRASIS',
 'PRONOMINAL',
 'TIPO_DE_DISCURSO',
 'LONGUITUD_DE_LA_FORMAL_VERBAL',
 'EDAD',
 'SEXO',
 'NIVEL_DE_ESTUDIOS',
 'CORPUS']

In [40]:
# Split the observations by corpus code: CORPUS == 1 -> data_95, CORPUS == 2 -> data_15
data_95, data_15 = (
    data.loc[data['CORPUS'] == codigo].reset_index(drop=True)
    for codigo in (1, 2)
)

In [44]:
# Number of distinct categories of a single variable
data['clase semantica del vb '].nunique(dropna=False)

7

In [48]:
# Number of distinct categories of every variable, as (column, count) pairs
[(nombre, data[nombre].nunique(dropna=False)) for nombre in data.columns]

[('Presencia', 2),
 ('Persona del sujeto', 9),
 ('Referente de sujeto', 4),
 ('Ambigüedad fonética potencial', 2),
 ('Especificidad', 2),
 ('Modo', 3),
 ('Tiempo verbal', 9),
 ('Progresividad', 2),
 ('Perfectividad', 2),
 ('Ambigüedad morfologica', 2),
 ('clase semantica del vb ', 7),
 ('ADESSE', 6),
 ('Tipo de clausula ', 6),
 ('Correferencialidad', 4),
 ('Distancia de referente', 5),
 ('Turno de habla', 2),
 ('Perífrasis', 3),
 ('Pronominal ', 2),
 ('Tipo de discurso', 4),
 ('Longuitud de la formal verbal', 10),
 ('Edad', 3),
 ('Sexo', 2),
 ('Nivel de estudios', 3),
 ('CORPUS', 1)]

# Tablas de Contingencia

In [57]:
# Contingency table: subject person (rows) against the target (columns)
pd.crosstab(index=data['Persona del sujeto'], columns=data['Presencia'])

Presencia,0,1
Persona del sujeto,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,0
1,1638,903
2,637,227
3,1636,133
4,864,79
5,28,1
6,1127,60
7,14,36
8,1,4
9,0,1


In [72]:
def contingency_plus_chi2(df:pd.DataFrame,
                          var_independiente:str,
                          var_objetivo:str='Presencia') -> None:
    """Print the contingency table of two columns and their chi-squared test.

    The table and the test are computed on ``df`` (the frame passed in), not on
    any notebook-level DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing both columns.
    var_independiente : str
        Name of the column used as rows of the contingency table.
    var_objetivo : str, default 'Presencia'
        Name of the column used as columns of the contingency table.

    Returns
    -------
    None
        Everything is reported through ``print``: the table, the chi-squared
        statistic, the p-value and a verdict at the 0.05 significance level.
    """
    # Build the table from the frame that was passed in
    contingency_table = pd.crosstab(df[var_independiente], df[var_objetivo])
    print(contingency_table)

    # Chi-squared test of independence; dof and expected counts are not reported
    stat, p, _dof, _expected = chi2_contingency(contingency_table)
    print('Chi-squared statistic:', stat)
    print('p-value:', p)

    # Report whether the association is significant
    alpha = 0.05 # significance level
    if p < alpha:
        print(f'{var_independiente} and {var_objetivo} are related (p={p:.3f})')
    else:
        print(f'{var_independiente} and {var_objetivo} are not related (p={p:.3f})')


In [76]:
# Write the education-level vs. target contingency table to an Excel file
pd.crosstab(index=data['Nivel de estudios'], columns=data['Presencia']).to_excel('prueba.xlsx')

In [74]:
# Contingency table and chi-squared test for one chosen variable
contingency_plus_chi2(data_15, 'Nivel de estudios')


Presencia             0    1
Nivel de estudios           
1                  1727  452
2                  2149  544
3                  2070  448
Chi-squared statistic: 7.648888311577869
p-value: 0.02183056655329412
Nivel de estudios and Presencia are related (p=0.022)


In [77]:
# Target column and the candidate columns tested against it
target_var = 'Presencia'
cat_var = [x for x in data.columns if x != 'Presencia']

alpha = 0.05 # significance level
related_vars, non_related_vars = [], []

for var in cat_var:
    # Contingency table of the candidate against the target
    contingency_table = pd.crosstab(data[var], data[target_var])
    print(contingency_table)

    # Chi-squared test of independence
    stat, p, dof, expected = chi2_contingency(contingency_table)
    print('Chi-squared statistic:', stat)
    print('p-value:', p)

    # File the variable under the list that matches the test outcome
    es_significativa = p < alpha
    veredicto = 'are related' if es_significativa else 'are not related'
    print(f'{var} and {target_var} {veredicto} (p={p:.3f})')
    (related_vars if es_significativa else non_related_vars).append(var)

    # Visual separator between variables
    print()
    print('*' * 70)
    print()


Presencia              0    1
Persona del sujeto           
0                      1    0
1                   1638  903
2                    637  227
3                   1636  133
4                    864   79
5                     28    1
6                   1127   60
7                     14   36
8                      1    4
9                      0    1
Chi-squared statistic: 942.5710329307617
p-value: 4.1436329440134015e-197
Persona del sujeto and Presencia are related (p=0.000)

**********************************************************************

Presencia               0     1
Referente de sujeto            
1                    4864  1425
2                      18     0
3                     747    11
4                     317     8
Chi-squared statistic: 261.32651601773443
p-value: 2.3219470350455994e-56
Referente de sujeto and Presencia are related (p=0.000)

**********************************************************************

Presencia                         0    1
Am

# Análisis Descriptivo de los datos

In [None]:
# 'Sexo' does not appear to be significant, but it is kept for future analyses.
# Guarded so that it is never listed twice (e.g. when the cell is re-run or the
# variable was already classified as related).
if 'Sexo' not in related_vars:
    related_vars.append('Sexo')

In [None]:
# Total number of missing values in the whole frame
data.isna().to_numpy().sum()

In [None]:
# Spearman correlation of every column with the target, highest value first
corr_presencia = (
    data.corr(method='spearman')[['Presencia']]
    .sort_values(by='Presencia', ascending=False)
)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(corr_presencia, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Presencia', fontdict={'fontsize':18}, pad=16);

In [None]:
# NOTE(review): this changes the default figure size globally, so every later
# figure created without an explicit figsize is affected as well.
plt.rcParams.update({"figure.figsize": (20, 20)})

# Annotated heatmap of the full Spearman correlation matrix
matriz_corr = data.corr(method='spearman')
hm = sns.heatmap(matriz_corr, annot=True)
hm.set_title("Correlation matrix of ASL data\n")

plt.show()

In [None]:
# Target and ordinal variables: these keep their integer coding
var_ord_y_target = ['Presencia',
                    'Longuitud de la formal verbal',
                    'Edad',
                    'Nivel de estudios']

# Every other column is one-hot encoded in a single call. The kept columns stay
# first and the dummies follow in column order, named '<column>_<value>'.
# NOTE(review): running this cell a second time would encode the dummy columns again.
var_nominales = [var for var in data.columns if var not in var_ord_y_target]
data = pd.get_dummies(data, columns=var_nominales)
        

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

In [None]:
# Shuffled 80/20 split, stratified on the target so both partitions keep its
# class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Presencia'),
    data['Presencia'],
    stratify=data['Presencia'],
    shuffle=True,
    random_state=1234,
    train_size=0.8,
)

In [None]:
# LogisticRegression is otherwise only imported further down in the notebook, so
# it is imported here to keep this cell runnable on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Pipeline: recursive feature elimination down to 20 features (step 's'),
# followed by a logistic regression fitted on the selected features (step 'm').
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=20)
model = LogisticRegression()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])



In [None]:
# Fit the feature-selection + logistic-regression pipeline on the training partition
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Metrics collected for every model compared in this notebook
scoring = ["accuracy", "balanced_accuracy", "f1", "roc_auc", 'precision','recall']

# Baseline that always predicts the most frequent class
Model = DummyClassifier(strategy='most_frequent')
Model.fit(X_train, y_train)

# Row labels and per-metric columns of the comparison table
index = ['Dummy Classifier']
scores = {
    etiqueta: []
    for etiqueta in ("Accuracy", "Balanced accuracy", "F1-Score", "AUROC", "Precision", "Recall")
}

# NOTE(review): cross_validate refits clones of the estimator on folds of the
# data it receives, so these scores come from cross-validating on the test
# partition, not from the model fitted above on the training partition.
cv_result = cross_validate(Model, X_test, y_test, scoring=scoring)
for etiqueta, metrica in zip(scores, scoring):
    scores[etiqueta].append(cv_result[f"test_{metrica}"].mean())

df_scores = pd.DataFrame(scores, index=index)
df_scores

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# The grid below tries both 'l1' and 'l2' penalties. The default solver does not
# support 'l1', so those candidates would fail to fit and be scored as NaN (with
# the warning hidden by the global warnings filter). 'liblinear' supports both.
logistic_regression = LogisticRegression(solver='liblinear', random_state=123)

# Hyper-parameter values to try
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'penalty': ['l1', 'l2'],
              'class_weight': [None, 'balanced']}

# 5-fold cross-validated grid search (default scoring of the estimator)
grid_search = GridSearchCV(logistic_regression,
                           param_grid,
                           cv=5)

# Run the search on the training partition
grid_search.fit(X_train, y_train)

In [None]:
# Keep the estimator selected by the grid search above
logreg = grid_search.best_estimator_

In [None]:


# Hyper-parameter grid to evaluate
# ==============================================================================
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 5, 7],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced_subsample']
}

# Grid search with 5-fold cross-validation, optimising F1
# ==============================================================================
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=123),
    param_grid=param_grid,
    scoring='f1',
    # Leave one core free, but never pass 0 (an invalid n_jobs) on a single-core machine
    n_jobs=max(1, multiprocessing.cpu_count() - 1),
    cv=5,
    refit=True,
    verbose=0,
    return_train_score=True
)

grid.fit(X=X_train, y=y_train)

# Best configuration, refitted on the whole training partition (refit=True)
modelo_final = grid.best_estimator_

# Add the random forest row to the comparison table.
# NOTE(review): cross_validate refits clones of the estimator on folds of the
# test partition; the model fitted above on the training data is not what is scored.
index += ['Random Forest']
cv_result = cross_validate(modelo_final, X_test, y_test, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())
scores["F1-Score"].append(cv_result["test_f1"].mean())
scores["AUROC"].append(cv_result["test_roc_auc"].mean())
scores["Precision"].append(cv_result["test_precision"].mean())
scores["Recall"].append(cv_result["test_recall"].mean())

df_scores = pd.DataFrame(scores, index=index)
df_scores

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid to evaluate
# ==============================================================================
# 'balanced_subsample' is specific to forest ensembles and is not an accepted
# class_weight for DecisionTreeClassifier (those candidates would fail to fit),
# so the single-tree equivalent 'balanced' is used instead.
param_grid = {
    'max_depth': [None, 5, 7],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}

# Grid search with 5-fold cross-validation, optimising F1
# ==============================================================================
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=123),
    param_grid=param_grid,
    scoring='f1',
    # Leave one core free, but never pass 0 (an invalid n_jobs) on a single-core machine
    n_jobs=max(1, multiprocessing.cpu_count() - 1),
    cv=5,
    refit=True,
    verbose=0,
    return_train_score=True
)

grid.fit(X=X_train, y=y_train)

# NOTE(review): this rebinds `grid` and `modelo_final`, so cells that run after
# this one see the decision tree rather than a previously fitted model — confirm
# that this is intended.
modelo_final = grid.best_estimator_

# Add the decision tree row to the comparison table.
# NOTE(review): cross_validate refits clones of the estimator on folds of the
# test partition; the model fitted above on the training data is not what is scored.
index += ['Decision Tree']
cv_result = cross_validate(modelo_final, X_test, y_test, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())
scores["F1-Score"].append(cv_result["test_f1"].mean())
scores["AUROC"].append(cv_result["test_roc_auc"].mean())
scores["Precision"].append(cv_result["test_precision"].mean())
scores["Recall"].append(cv_result["test_recall"].mean())

df_scores = pd.DataFrame(scores, index=index)
df_scores

In [None]:
# metrics.plot_roc_curve was removed in scikit-learn 1.2. Use the display API
# when this scikit-learn version provides it and fall back to the old helper
# otherwise; both take (estimator, X, y).
if hasattr(getattr(metrics, 'RocCurveDisplay', None), 'from_estimator'):
    plot_roc = metrics.RocCurveDisplay.from_estimator
else:
    plot_roc = metrics.plot_roc_curve

# Report and ROC curve on the training partition
print('TRAIN:')
print()
y_new_pred = modelo_final.predict(X_train)
LT.show_results_1(y_train, y_new_pred)
print()
plot_roc(modelo_final, X_train, y_train)
plt.show()
print("=================================")
print()

# Same report on the test partition
print('TEST:')
y_new_pred = modelo_final.predict(X_test)
LT.show_results_1(y_test, y_new_pred)
print()
plot_roc(modelo_final, X_test, y_test)
plt.show()
print("=================================")

In [None]:
sns.set_style("white")

# Impurity-based importances of the fitted model, largest first
feature_scores = pd.Series(modelo_final.feature_importances_,
                           index=X_train.columns).sort_values(ascending=False)

# Horizontal bar plot, one bar per predictor. The figure is created once with
# plt.subplots; a separate plt.figure call beforehand would only leave an
# empty extra figure in the output.
f, ax = plt.subplots(figsize=(10, 16))
ax = sns.barplot(x=feature_scores, y=feature_scores.index)
ax.set_yticklabels(feature_scores.index)
ax.set_xlabel("Predictors relevance", fontsize = 6)
ax.set_ylabel("Predictors", fontsize = 6)

plt.show()

In [None]:
from sklearn.inspection import permutation_importance


# Permutation importance of every predictor on the training partition
# (5 shuffles per predictor, estimator's default scoring).
importancia = permutation_importance(
                estimator    = modelo_final,
                X            = X_train,
                y            = y_train,
                n_repeats    = 5,
                # Leave one core free, but never pass 0 (an invalid n_jobs) on a single-core machine
                n_jobs       = max(1, multiprocessing.cpu_count() - 1),
                random_state = 123
             )

# Store the results (mean and standard deviation) in a DataFrame
df_importancia = pd.DataFrame(
                    {k: importancia[k] for k in ['importances_mean', 'importances_std']}
                 )
df_importancia['feature'] = X_train.columns
print(df_importancia.sort_values('importances_mean', ascending=False))

In [None]:
# Save the permutation-importance table to an Excel file
df_importancia.to_excel('importancia.xlsx')

In [None]:
# Plot: sort ascending so the largest mean importance ends up at the top
df_importancia = df_importancia.sort_values('importances_mean', ascending=True)
nombres = df_importancia['feature']
medias = df_importancia['importances_mean']

fig, ax = plt.subplots(figsize=(5, 14))
# Bars are drawn fully transparent (alpha=0); xerr carries the standard deviation
ax.barh(y=nombres, width=medias, xerr=df_importancia['importances_std'],
        align='center', alpha=0)
# Mean importance of each predictor as red diamonds joined by a line
ax.plot(medias, nombres, marker="D", alpha=0.8, color="r")

# Reference line at zero importance
ax.axvline(x=0, color='k', linestyle='--')
ax.set_ylabel('Importance of predictors')
ax.set_xlabel('Increase in error after permutation');

In [None]:
# Same chart as before, but using the absolute value of the mean importance
df_importancia['importances_mean'] = df_importancia['importances_mean'].abs()

# Plot: sort ascending so the largest value ends up at the top
df_importancia = df_importancia.sort_values('importances_mean', ascending=True)
nombres = df_importancia['feature']
medias = df_importancia['importances_mean']

fig, ax = plt.subplots(figsize=(5, 14))
# Bars are drawn fully transparent (alpha=0); xerr carries the standard deviation
ax.barh(y=nombres, width=medias, xerr=df_importancia['importances_std'],
        align='center', alpha=0)
# Absolute mean importance of each predictor as red diamonds joined by a line
ax.plot(medias, nombres, marker="D", alpha=0.8, color="r")

# Reference line at zero importance
ax.axvline(x=0, color='k', linestyle='--')
ax.set_ylabel('Importance of predictors')
ax.set_xlabel('Increase in error after permutation');

In [None]:
import shap

In [None]:
# Build the explainer around the model's predict function; X_test is passed as
# the second argument (presumably the background/masker data — confirm against shap docs)
explainer = shap.Explainer(modelo_final.predict, X_test)
# Compute the SHAP values for the rows of X_test (this takes some time)
shap_values = explainer(X_test)

In [None]:
# Bar plot of the SHAP values computed above
shap.plots.bar(shap_values)

In [None]:
# Summary plot of the same SHAP values
shap.summary_plot(shap_values)


In [None]:
# Alternative view: beeswarm plot of the same SHAP values
shap.plots.beeswarm(shap_values)

In [None]:
# Waterfall plot for the first explained row (index 0)
shap.plots.waterfall(shap_values[0])

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed; use whichever this version provides.
estilo_grid = ('seaborn-v0_8-whitegrid'
               if 'seaborn-v0_8-whitegrid' in plt.style.available
               else 'seaborn-whitegrid')
plt.style.use(estilo_grid)

# Predicted probability of the positive class on the test partition.
# NOTE(review): the curve is labelled 'Random Forest', but it is drawn for
# whatever estimator `modelo_final` currently holds — confirm it is the forest.
y_prob_tree = modelo_final.predict_proba(X_test)[:, 1]

# The thresholds are returned by roc_curve but not used in the plot
fpr_tree, tpr_tree, _thresholds = roc_curve(y_test, y_prob_tree)

plt.figure(figsize=(8, 6))
plt.title('Comparación Modelos mediante ROC')

plt.plot(fpr_tree, tpr_tree,  label='Random Forest')

# Chance diagonal plus the bottom and right edges of the unit square
plt.plot([0, 1], [0, 1], 'g--')
plt.plot([0, 1], [0, 0], 'k')
plt.plot([1, 1], [0, 1], 'k')
plt.axis([-0.05, 1.05, -0.05, 1.05])

plt.grid(True)  # show the grid lines

plt.xlabel('False Positive Rate (Fall-Out)', fontsize=14)
plt.ylabel('True Positive Rate (Recall)', fontsize=14)

# Annotate the area under the curve, rounded to two decimals
plt.text(x=0.6, y=0.05,
         s=f'AUC RF: {round(roc_auc_score(y_test, y_prob_tree),2)}', fontsize=12)


plt.legend()
plt.show()

# Eliminación Recursiva de Predictores

In [None]:
# Keep the predictors whose importance score is above 0.01, in feature_scores order
predictores_definitivos = feature_scores[feature_scores > 0.01].index.tolist()

In [None]:
# Restrict both partitions to the selected predictors
X_train, X_test = X_train[predictores_definitivos], X_test[predictores_definitivos]

In [None]:
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

# ROC AUC on train / test and the selected estimator for every elimination step
ROC_AUC_train_log = []
ROC_AUC_test_log = []
PARAMETROS_log = []

# Hyper-parameter grid evaluated at every step (it does not change between steps)
# ==============================================================================
param_grid = {
          'n_estimators': [100, 150],
          'max_depth': [None, 5, 7],
          'criterion': ['gini', 'entropy'],
          'class_weight': ['balanced_subsample']
          }

for i in tqdm(range(0,8)):

    # Step i drops the last i predictors of the list. The subsets are kept in
    # step-local variables: overwriting X_train / X_test or popping from
    # predictores_definitivos would permanently remove columns, and a later
    # selection of the full predictor list from X_train would raise KeyError.
    predictores_paso = predictores_definitivos[:len(predictores_definitivos) - i]
    X_train_paso = X_train[predictores_paso]
    X_test_paso = X_test[predictores_paso]

    grid = GridSearchCV(
        estimator=RandomForestClassifier(random_state=123),
        param_grid=param_grid,
        scoring='precision',
        # Leave one core free, but never pass 0 (an invalid n_jobs) on a single-core machine
        n_jobs=max(1, multiprocessing.cpu_count() - 1),
        cv=5,
        refit=True,
        verbose=0,
        return_train_score=True
    )

    grid.fit(X=X_train_paso, y=y_train)

    modelo_final = grid.best_estimator_

    # Predicted probability of the positive class on each partition
    y_pred = modelo_final.predict_proba(X_train_paso)[:,1]
    y_pred_test = modelo_final.predict_proba(X_test_paso)[:,1]
    # NOTE(review): despite its name this stores the fitted estimator, not
    # grid.best_params_ — confirm which one is wanted.
    parametros = grid.best_estimator_

    ROC_AUC_train_log.append(roc_auc_score(y_train, y_pred))
    ROC_AUC_test_log.append(roc_auc_score(y_test, y_pred_test))
    PARAMETROS_log.append(parametros)


In [None]:
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed; use whichever this version provides.
estilo_grid = ('seaborn-v0_8-whitegrid'
               if 'seaborn-v0_8-whitegrid' in plt.style.available
               else 'seaborn-whitegrid')
plt.style.use(estilo_grid)


plt.figure(figsize=(8, 6))


plt.title('Eliminación de Predictores', fontsize=13, fontweight='bold')

# One x position per logged elimination step (derived from the log length
# rather than hard-coded, so the lines always match the logs)
xticks = np.arange(len(ROC_AUC_train_log))

plt.plot(xticks, ROC_AUC_train_log,'-ok',  label='ROC AUC Train Log')
plt.plot(xticks, ROC_AUC_test_log,color = 'red'  ,marker='o',label='ROC AUC Test Log')

plt.axis([-0.05, 7.05, 0.7, 0.83])

plt.grid(True)  # show the grid lines

plt.xlabel('Predictores Eliminados', fontsize=13, labelpad=20)
plt.ylabel('ROC AUC', fontsize=13, labelpad=20)

# Annotate the train ROC AUC of the first and last step
plt.text(x=2, y=0.72,
         s=f'AUC INICIAL: {round(ROC_AUC_train_log[0],3)}', fontsize=12)
plt.text(x=2, y=0.715,
         s=f'AUC FINAL: {round(ROC_AUC_train_log[-1],3)}', fontsize=12)


# Tick labels are written in reverse and flipped below, so 'SIN_ELIMINAR'
# labels step 0.
# NOTE(review): these names are hard-coded; presumably they match the order in
# which predictors were removed — verify against predictores_definitivos.
Nombres_definitivos_voltear = [
    'CLASE_SEMANTICA_DEL_VB',
    'DISTANCIA_DE_REFERENTE',
    'ESPECIFICIDAD',
    'LONGUITUD_DE_LA_FORMAL_VERBAL',
    'TIPO_DE_CLAUSULA',
    'TIPO_DE_DISCURSO',
    'TIEMPO_VERBAL',
    'SIN_ELIMINAR'
]

Nombres_definitivos = Nombres_definitivos_voltear[::-1]

plt.xticks(xticks, Nombres_definitivos, rotation=70, fontsize=10)

plt.legend()
plt.show()

# MODELO FINAL

In [None]:
# Rebuild the list of predictors whose importance score is above 0.01 ...
predictores_definitivos = feature_scores[feature_scores > 0.01].index.tolist()

# ... and restrict both partitions to them
X_train, X_test = X_train[predictores_definitivos], X_test[predictores_definitivos]

In [None]:
nombre_modelo = 'Random_forest_gscv_definitivo_v1'

import os
import pickle

# Location of the cached model. os.path.join is used instead of a hard-coded
# '\\' so the path is also correct on non-Windows systems.
ruta_modelo = os.path.join(os.getcwd(), nombre_modelo + '.pkl')

# If the model has not been trained and cached yet, train it and store it in the file.
if not os.path.isfile(ruta_modelo):

    # Hyper-parameter grid to evaluate
    # ==============================================================================
    param_grid = {
                  'n_estimators': [100, 150, 200],
                  'max_depth': [None, 5, 7],
                  'criterion': ['gini', 'entropy'],
                  'class_weight': [None, 'balanced_subsample']
                  }

    # Grid search with 5-fold cross-validation, optimising precision
    # ==============================================================================
    grid = GridSearchCV(
        estimator=RandomForestClassifier(random_state=123),
        param_grid=param_grid,
        scoring='precision',
        # Leave one core free, but never pass 0 (an invalid n_jobs) on a single-core machine
        n_jobs=max(1, multiprocessing.cpu_count() - 1),
        cv=5,
        refit=True,
        verbose=0,
        return_train_score=True
    )

    grid.fit(X=X_train, y=y_train)

    modelo_final = grid.best_estimator_

    with open(ruta_modelo, "wb") as model_data_file:
        pickle.dump(modelo_final, model_data_file)


# If the model was cached by an earlier run, load it instead of retraining.
# NOTE(review): the cached model is reused even if the data or the predictor
# list changed since it was saved — delete the file to force retraining.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
else:
    with open(ruta_modelo, 'rb') as model_data_file:
        modelo_final = pickle.load(model_data_file)


# Add the final random forest row to the comparison table.
# NOTE(review): cross_validate refits clones of the estimator on folds of the
# test partition; the trained / loaded model itself is not what is scored.
index += ['Random Forest Definitivo']
cv_result = cross_validate(modelo_final, X_test, y_test, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())
scores["F1-Score"].append(cv_result["test_f1"].mean())
scores["AUROC"].append(cv_result["test_roc_auc"].mean())
scores["Precision"].append(cv_result["test_precision"].mean())
scores["Recall"].append(cv_result["test_recall"].mean())

df_scores = pd.DataFrame(scores, index=index)
df_scores

In [None]:
# metrics.plot_roc_curve was removed in scikit-learn 1.2. Use the display API
# when this scikit-learn version provides it and fall back to the old helper
# otherwise; both take (estimator, X, y).
if hasattr(getattr(metrics, 'RocCurveDisplay', None), 'from_estimator'):
    plot_roc = metrics.RocCurveDisplay.from_estimator
else:
    plot_roc = metrics.plot_roc_curve

# Report and ROC curve of the final model on the training partition
print('TRAIN:')
print()
y_new_pred = modelo_final.predict(X_train)
LT.show_results_1(y_train, y_new_pred)
print()
plot_roc(modelo_final, X_train, y_train)
plt.show()
print("=================================")
print()

# Same report on the test partition
print('TEST:')
y_new_pred = modelo_final.predict(X_test)
LT.show_results_1(y_test, y_new_pred)
print()
plot_roc(modelo_final, X_test, y_test)
plt.show()
print("=================================")

In [None]:
# Display the feature_scores Series (importance score per predictor)
feature_scores