In [1]:
import time
import joblib
import numpy as np
import pandas as pd
import mlflow.sklearn
from classification_model.preprocessing import ColumnSelector, ConvertDtypes, \
                                               GetDummies, GetDataFrame, Mapper
from classification_model.config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, \
                                        FEATURES, TARGET, MAP, SEED
from classification_model.evaluation import confusion_matrix, metrics_summary
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, Trials
from xgboost import XGBClassifier
import warnings

# Silence every warning raised from this point on so cell outputs stay compact.
warnings.filterwarnings(action='ignore')

# All MLflow runs started below are grouped under this experiment name.
experiment_name = 'churn'
mlflow.set_experiment(experiment_name)

# Show up to 500 columns / rows when a DataFrame is displayed.
# Fully-qualified keys are used on purpose: pandas resolves option names by pattern
# matching, and the bare 'max_columns' / 'max_rows' patterns match more than one
# option on newer pandas releases (e.g. the styler.render.* options), which raises
# an OptionError. The 'display.*' keys are unambiguous on every version.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

  from collections import Mapping
  from collections import Mapping, Set, Iterable


In [2]:
# Read the training set; the file is ';'-separated and the listed columns get
# explicit dtypes instead of the ones pandas would infer.
dtypes = {'state': 'category', 'area code': int, 'international plan': 'category',
          'voice mail plan': 'category'}
data = pd.read_csv('data/train.csv', sep=';', dtype=dtypes)

# Preview the first and last three rows. pd.concat is used instead of
# DataFrame.append, which is deprecated and no longer exists in pandas 2.x.
pd.concat([data.head(3), data.tail(3)])

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,number customer service calls,target
0,12,102.0,510,234,0,0,0.0,171.5,77.0,29.16,261.0,107.0,22.19,328.8,72.0,14.8,10.2,1.0,2.75,0.0,0
1,25,70.0,510,2558,0,0,0.0,147.1,105.0,25.01,200.0,135.0,17.0,234.9,65.0,10.57,12.5,9.0,3.38,3.0,0
2,7,88.0,408,753,0,0,0.0,204.8,100.0,34.82,193.9,98.0,16.48,190.9,62.0,8.59,2.0,11.0,0.54,1.0,0
3997,24,138.0,415,4239,0,1,29.0,190.1,87.0,32.32,223.2,123.0,18.97,256.2,130.0,11.53,14.2,6.0,3.83,0.0,0
3998,15,117.0,415,1492,1,1,22.0,196.0,82.0,33.32,322.7,82.0,27.43,225.6,120.0,10.15,3.7,5.0,1.0,1.0,0
3999,1,85.0,415,2163,0,0,0.0,210.3,66.0,35.75,195.8,76.0,16.64,221.6,82.0,9.97,11.2,7.0,3.02,1.0,0


In [3]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 21 columns):
state                            4000 non-null category
account length                   4000 non-null float64
area code                        4000 non-null int64
phone number                     4000 non-null int64
international plan               4000 non-null category
voice mail plan                  4000 non-null category
number vmail messages            4000 non-null float64
total day minutes                4000 non-null float64
total day calls                  4000 non-null float64
total day charge                 4000 non-null float64
total eve minutes                4000 non-null float64
total eve calls                  4000 non-null float64
total eve charge                 4000 non-null float64
total night minutes              4000 non-null float64
total night calls                4000 non-null float64
total night charge               4000 non-null float64
total intl min

In [4]:
# Stage applied to the whole frame before the numeric / categorical branches:
# column selection, mapping of the two plan columns through MAP, then dtype conversion.
global_transformation = Pipeline(steps=[
    ('selector', ColumnSelector(columns=NUMERICAL_FEATURES + CATEGORICAL_FEATURES)),
    ('map_1', Mapper(column='international plan', map=MAP)),
    ('map_2', Mapper(column='voice mail plan', map=MAP)),
    ('convert_dtypes', ConvertDtypes(numerical=NUMERICAL_FEATURES,
                                     categorical=CATEGORICAL_FEATURES)),
])

# Numeric branch: select the numerical columns, standardise them, and wrap the
# scaler output back into a DataFrame with the numerical column names.
numerical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=NUMERICAL_FEATURES)),
    ('standar', StandardScaler()),
    ('dataframe', GetDataFrame(columns=NUMERICAL_FEATURES)),
])

# Categorical branch: select the categorical columns and pass them to GetDummies.
categorical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=CATEGORICAL_FEATURES)),
    ('ohe', GetDummies(columns=CATEGORICAL_FEATURES)),
])

In [5]:
# Full preprocessing: the global stage feeds both branches in parallel, and the
# concatenated result is wrapped into a DataFrame whose columns are FEATURES.
preprocessing = Pipeline(steps=[
    ('global', global_transformation),
    ('features', FeatureUnion(transformer_list=[
        ('numeric', numerical_transformations),
        ('category', categorical_transformations),
    ])),
    ('dataframe', GetDataFrame(columns=FEATURES)),
])

## Construcción de los modelos

In [6]:
# Separate the target from the predictors WITHOUT mutating `data`.
# Selecting and dropping (instead of popping the column in place) keeps this cell
# idempotent: it can be re-run without a KeyError for the already-removed target.
label = data[TARGET]

X = data.drop(columns=TARGET)
y = label

# 80/20 split, stratified on the target so both splits keep the same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                      random_state=SEED, stratify=label)

print('El conjunto de entranamiento tiene {} observaciones, y {} variables'.format(X_train.shape[0],
                                                                                            X_train.shape[1]))
print('El conjunto de validación tiene {} observaciones, y {} variables'.format(X_valid.shape[0],
                                                                            X_valid.shape[1]))

El conjunto de entranamiento tiene 3200 observaciones, y 20 variables
El conjunto de validación tiene 800 observaciones, y 20 variables


### Regresión Logística

In [7]:
params = {
    'penalty': 'l2',
    'random_state': SEED
}

with mlflow.start_run(run_name='logistic regression'):
    # fit: preprocessing and estimator are trained together on the raw training split
    start_fit = time.time()
    lr = Pipeline([('preprocessing', preprocessing),
                   ('estimator', LogisticRegression(**params))]).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = lr.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = lr.score(X_valid, y_valid)
    # NOTE(review): the AUC is computed from the hard labels returned by predict(),
    # i.e. from the same y_pred that is passed to metrics_summary below. Scoring
    # with predict_proba would give a threshold-independent AUC -- consider switching.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('penalty', params['penalty'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    # The AUC was computed but never logged for this run; log it so the run is
    # comparable with the SMOTE runs, which do record 'roc_auc'.
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(lr, 'logistic_regression')


# Keep a local copy of the fitted pipeline next to the MLflow artifact.
joblib.dump(lr, 'modelos/logistic_regression.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.5406087774213909
La exactitud es: 0.865
La precisión es: 0.6666666666666666
El recall es: 0.08849557522123894
El puntaje F1 es: 0.15625 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.01
1,0.91,0.09


### Random Forest

In [8]:
params = {
    'n_estimators': 100,
    'max_depth': 5,
    'oob_score': True,
    'random_state': SEED
}

with mlflow.start_run(run_name='random forest'):
    # fit: preprocessing and estimator are trained together on the raw training split
    start_fit = time.time()
    rf = Pipeline([('preprocessing', preprocessing),
                   ('estimator', RandomForestClassifier(**params))]).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = rf.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = rf.score(X_valid, y_valid)
    # NOTE(review): AUC is computed from hard predictions (predict), not from
    # predict_proba scores -- consider switching for a threshold-independent AUC.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('max_depth', params['max_depth'])
    mlflow.log_param('oob_score', params['oob_score'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    # roc_auc was computed above but never recorded; log it so this run can be
    # compared with the SMOTE runs, which do record 'roc_auc'.
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(rf, 'random_forest')

# Keep a local copy of the fitted pipeline next to the MLflow artifact.
joblib.dump(rf, 'modelos/random_forest.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.5663716814159292
La exactitud es: 0.8775
La precisión es: 1.0
El recall es: 0.13274336283185842
El puntaje F1 es: 0.23437500000000003 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.0
1,0.87,0.13


### Gradient Boosting

In [9]:
params = {
    'n_estimators': 100,
    'learning_rate': 0.09,
    'max_depth': 5,
    'subsample': 0.7,
    'random_state': SEED
}

with mlflow.start_run(run_name='gradient boosting'):
    # fit: preprocessing and estimator are trained together on the raw training split
    start_fit = time.time()
    gb = Pipeline([('preprocessing', preprocessing),
                   ('estimator', GradientBoostingClassifier(**params))]).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = gb.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = gb.score(X_valid, y_valid)
    # NOTE(review): AUC is computed from hard predictions (predict), not from
    # predict_proba scores -- consider switching for a threshold-independent AUC.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('learning_rate', params['learning_rate'])
    mlflow.log_param('max_depth', params['max_depth'])
    mlflow.log_param('subsample', params['subsample'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    # roc_auc was computed above but never recorded; log it so this run can be
    # compared with the SMOTE runs, which do record 'roc_auc'.
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(gb, 'gradient_boosting')

# Keep a local copy of the fitted pipeline next to the MLflow artifact.
joblib.dump(gb, 'modelos/gradient_boosting.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7692416689209207
La exactitud es: 0.9275
La precisión es: 0.8985507246376812
El recall es: 0.5486725663716814
El puntaje F1 es: 0.6813186813186812 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.01
1,0.45,0.55


### XGBoost

In [10]:
params = {
    'n_estimators': 100,
    'max_depth': 5,
    'subsample': 0.7,
    'learning_rate': 0.09,
    'random_state': SEED
}

with mlflow.start_run(run_name='xgboosting'):
    # fit: preprocessing and estimator are trained together on the raw training split
    start_fit = time.time()
    xgb = Pipeline([('preprocessing', preprocessing),
                    ('estimator', XGBClassifier(**params))]).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = xgb.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = xgb.score(X_valid, y_valid)
    # NOTE(review): AUC is computed from hard predictions (predict), not from
    # predict_proba scores -- consider switching for a threshold-independent AUC.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('learning_rate', params['learning_rate'])
    mlflow.log_param('max_depth', params['max_depth'])
    mlflow.log_param('subsample', params['subsample'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    # roc_auc was computed above but never recorded; log it so this run can be
    # compared with the SMOTE runs, which do record 'roc_auc'.
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(xgb, 'xgboost')

# Keep a local copy of the fitted pipeline next to the MLflow artifact.
joblib.dump(xgb, 'modelos/xgboost.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7706972729966122
La exactitud es: 0.93
La precisión es: 0.9253731343283582
El recall es: 0.5486725663716814
El puntaje F1 es: 0.6888888888888888 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.01
1,0.45,0.55


## Estimación del modelo usando SMOTE

In [11]:
# Fit the preprocessing pipeline on the training split and persist it to disk.
preprocessing = preprocessing.fit(X_train)
joblib.dump(preprocessing, 'modelos/preprocessing.pkl')

# NOTE: X_train / X_valid are rebound to their transformed versions here, so this
# cell must not be run twice without first re-running the train/validation split.
X_train = preprocessing.transform(X_train)
X_valid = preprocessing.transform(X_valid)

# Resample the training split only (X_train / y_train are rebound again);
# the validation split is not passed through SMOTE.
smote = SMOTE(random_state=SEED)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [12]:
### Regresión Logística

In [13]:
params = {
    'random_state': SEED,
    'penalty': 'l2'
}

with mlflow.start_run(run_name='logistic regression smote'):
    # fit on the preprocessed, SMOTE-resampled training data
    start_fit = time.time()
    lr = LogisticRegression(**params).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = lr.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = lr.score(X_valid, y_valid)
    # NOTE(review): AUC is computed from hard predictions (predict), not from
    # predict_proba scores -- consider switching for a threshold-independent AUC.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('penalty', params['penalty'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(lr, 'logistic_regression_smote')

# Persist under a '_smote' file name, like the other SMOTE models. Writing to
# 'modelos/logistic_regression.pkl' would overwrite the baseline logistic
# regression pipeline saved earlier in the notebook.
joblib.dump(lr, 'modelos/logistic_regression_smote.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.700184204763561
La exactitud es: 0.72
La precisión es: 0.2889733840304182
El recall es: 0.672566371681416
El puntaje F1 es: 0.40425531914893614 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.73,0.27
1,0.33,0.67


### Random Forest

In [14]:
params = dict(n_estimators=100, max_depth=5, oob_score=True, random_state=SEED)

with mlflow.start_run(run_name='random forest smote'):
    # Train on the current (preprocessed and resampled) training data and time it.
    start_fit = time.time()
    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Predict the validation split and time the call.
    start_predict = time.time()
    y_pred = rf.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics; everything except accuracy is derived from y_pred.
    accuracy = rf.score(X_valid, y_valid)
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyper-parameters (random_state is not logged).
    for param_name in ('n_estimators', 'max_depth', 'oob_score'):
        mlflow.log_param(param_name, params[param_name])

    # Record the metrics, in a fixed order.
    run_metrics = [
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
        ('f1_score', f1),
        ('roc_auc', roc_auc),
        ('duration_training', end_fit),
        ('duration_prediction', end_predict),
    ]
    for metric_name, metric_value in run_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(rf, 'random_forest_smote')

# Local copy of the fitted estimator.
joblib.dump(rf, 'modelos/random_forest_smote.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7313379964189562
La exactitud es: 0.86875
La precisión es: 0.5350877192982456
El recall es: 0.5398230088495575
El puntaje F1 es: 0.5374449339207048 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.92,0.08
1,0.46,0.54


### Gradient Boosting

In [15]:
params = {
    'n_estimators': 100,
    'learning_rate': 0.09,
    'max_depth': 5,
    'subsample': 0.7,
    'random_state': SEED
}

with mlflow.start_run(run_name='gradient boosting smote'):
    # fit on the preprocessed, SMOTE-resampled training data
    start_fit = time.time()
    gb = GradientBoostingClassifier(**params).fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # predict
    start_predict = time.time()
    y_pred = gb.predict(X_valid)
    end_predict = time.time() - start_predict

    # metrics
    accuracy = gb.score(X_valid, y_valid)
    # NOTE(review): AUC is computed from hard predictions (predict), not from
    # predict_proba scores -- consider switching for a threshold-independent AUC.
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # save params
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('learning_rate', params['learning_rate'])
    mlflow.log_param('max_depth', params['max_depth'])
    mlflow.log_param('subsample', params['subsample'])

    # save metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('roc_auc', roc_auc)
    mlflow.log_metric('duration_training', end_fit)
    mlflow.log_metric('duration_prediction', end_predict)

    # model
    mlflow.sklearn.log_model(gb, 'gradient_boosting_smote')

# Persist a local copy as every other model cell does; this was the only model
# that was logged to MLflow but never written to 'modelos/'.
joblib.dump(gb, 'modelos/gradient_boosting_smote.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.772268810140279
La exactitud es: 0.92
La precisión es: 0.810126582278481
El recall es: 0.5663716814159292
El puntaje F1 es: 0.6666666666666667 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.98,0.02
1,0.43,0.57


### XGBoost

In [16]:
params = dict(n_estimators=100, max_depth=5, subsample=0.8,
              learning_rate=0.09, random_state=SEED)

with mlflow.start_run(run_name='xgboosting smote'):
    # Train on the current (preprocessed and resampled) training data and time it.
    start_fit = time.time()
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Predict the validation split and time the call.
    start_predict = time.time()
    y_pred = xgb.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics; everything except accuracy is derived from y_pred.
    accuracy = xgb.score(X_valid, y_valid)
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyper-parameters (random_state is not logged).
    for param_name in ('n_estimators', 'learning_rate', 'max_depth', 'subsample'):
        mlflow.log_param(param_name, params[param_name])

    # Record the metrics, in a fixed order.
    run_metrics = [
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
        ('f1_score', f1),
        ('roc_auc', roc_auc),
        ('duration_training', end_fit),
        ('duration_prediction', end_predict),
    ]
    for metric_name, metric_value in run_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(xgb, 'xgboost_smote')

# Local copy of the fitted estimator.
joblib.dump(xgb, 'modelos/xgboost_smote.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7803905656245572
La exactitud es: 0.92125
La precisión es: 0.8048780487804879
El recall es: 0.584070796460177
El puntaje F1 es: 0.676923076923077 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.98,0.02
1,0.42,0.58


### Ajuste de hiperparámetros

In [20]:
# Hyperopt search space for the XGBoost classifier. The quniform entries are
# quantised to the given step; learning_rate is sampled continuously.
space = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 50),
    max_depth=hp.quniform('max_depth', 3, 11, 2),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.9),
    subsample=hp.quniform('subsample', 0.25, 1.0, 0.25),
    colsample_bytree=hp.quniform('colsample_bytree', 0.25, 1.0, 0.25),
)

def objective(params):
    """Return the hyperopt loss for one sampled XGBoost configuration.

    Args:
        params: mapping with the keys 'n_estimators', 'max_depth',
            'learning_rate', 'subsample' and 'colsample_bytree'. The first two
            are cast to int before being passed to the classifier.

    Returns:
        1 minus the mean 5-fold cross-validated ROC AUC on the module-level
        X_train / y_train, so that lower is better.

    NOTE(review): X_train / y_train are the SMOTE-resampled arrays at this point
    in the notebook, so the folds are drawn after oversampling; the CV score is
    presumably optimistic compared with the validation split -- confirm.
    """
    integer_keys = ('n_estimators', 'max_depth')
    passthrough_keys = ('learning_rate', 'subsample', 'colsample_bytree')

    candidate = {key: int(params[key]) for key in integer_keys}
    candidate.update({key: params[key] for key in passthrough_keys})

    fold_scores = cross_val_score(XGBClassifier(**candidate), X_train, y_train,
                                  scoring='roc_auc', cv=5, n_jobs=-1)
    return 1 - fold_scores.mean()

# Run 20 TPE evaluations of `objective` over `space` with a seeded random state.
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# rather than a RandomState -- confirm against the installed version.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    rstate=np.random.RandomState(SEED),
)
best


100%|██████████| 20/20 [02:43<00:00,  8.18s/it, best loss: 0.01054958193720168] 


{'colsample_bytree': 0.5,
 'learning_rate': 0.28455259632724933,
 'max_depth': 10.0,
 'n_estimators': 100.0,
 'subsample': 1.0}

In [23]:
# Hyper-parameters copied by hand from the search result shown above, with
# max_depth and n_estimators written as integers.
params = dict(
    colsample_bytree=0.5,
    learning_rate=0.28455259632724933,
    max_depth=10,
    n_estimators=100,
    subsample=1.0,
)

with mlflow.start_run(run_name='xgboosting smote cv'):
    # Train on the current (preprocessed and resampled) training data and time it.
    start_fit = time.time()
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Predict the validation split and time the call.
    start_predict = time.time()
    y_pred = xgb.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics; everything except accuracy is derived from y_pred.
    accuracy = xgb.score(X_valid, y_valid)
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyper-parameters.
    for param_name in ('n_estimators', 'learning_rate', 'max_depth',
                       'subsample', 'colsample_bytree'):
        mlflow.log_param(param_name, params[param_name])

    # Record the metrics, in a fixed order.
    run_metrics = [
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
        ('f1_score', f1),
        ('roc_auc', roc_auc),
        ('duration_training', end_fit),
        ('duration_prediction', end_predict),
    ]
    for metric_name, metric_value in run_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(xgb, 'xgboost_smote cv')

# Local copy of the fitted estimator.
joblib.dump(xgb, 'modelos/xgboost_smote_cv.pkl')

metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)


El área bajo la curva ROC es: 0.7847573778516315
La exactitud es: 0.92875
La precisión es: 0.868421052631579
El recall es: 0.584070796460177
El puntaje F1 es: 0.6984126984126985 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.99,0.01
1,0.42,0.58
