In [1]:
import time
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from classification_model.preprocessing import ColumnSelector, ConvertDtypes, \
                                               GetDummies, GetDataFrame, Mapper
from classification_model.config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, \
                                        FEATURES, TARGET, MAP
from classification_model.evaluation import confusion_matrix, metrics_summary
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score, recall_score, f1_score
from xgboost import XGBClassifier, plot_importance
import warnings
warnings.filterwarnings('ignore')

# Every MLflow run started in this notebook is recorded under this experiment.
experiment_name = 'churn'
mlflow.set_experiment(experiment_name=experiment_name)

# Show up to 500 columns/rows when displaying DataFrames.
# Use the fully-qualified option keys: the short forms ('max_columns',
# 'max_rows') are resolved by pattern matching and become ambiguous in
# newer pandas releases that also define 'styler.render.max_*' options.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
# Explicit dtypes for the columns that should not be inferred as plain numbers:
# three categoricals and the integer area code.
dtypes = {'state': 'category', 'area code': int, 'international plan': 'category',
          'voice mail plan': 'category'}
data = pd.read_csv('Data/train.csv', sep=';', dtype=dtypes)

# Preview the first and last three rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so the two slices are stacked with pd.concat.
pd.concat([data.head(3), data.tail(3)])

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,number customer service calls,target
0,5,54.0,510,1724,0,1,39.0,117.6,82.0,19.99,159.2,60.0,13.53,236.4,113.0,10.64,11.3,10.0,3.05,2.0,0
1,41,91.0,415,1590,0,0,0.0,153.0,123.0,26.01,141.1,127.0,11.99,171.5,76.0,7.72,10.3,15.0,2.78,1.0,1
2,18,137.0,408,3793,0,0,0.0,151.8,90.0,25.81,229.6,68.0,19.52,171.8,122.0,7.73,10.8,5.0,2.92,2.0,0
3497,24,138.0,415,4239,0,1,29.0,190.1,87.0,32.32,223.2,123.0,18.97,256.2,130.0,11.53,14.2,6.0,3.83,0.0,0
3498,15,117.0,415,1492,1,1,22.0,196.0,82.0,33.32,322.7,82.0,27.43,225.6,120.0,10.15,3.7,5.0,1.0,1.0,0
3499,13,123.0,415,4959,0,0,0.0,132.2,122.0,22.47,169.9,101.0,14.44,150.1,123.0,6.75,12.9,11.0,3.48,3.0,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 21 columns):
state                            3500 non-null category
account length                   3500 non-null float64
area code                        3500 non-null int64
phone number                     3500 non-null int64
international plan               3500 non-null category
voice mail plan                  3500 non-null category
number vmail messages            3500 non-null float64
total day minutes                3500 non-null float64
total day calls                  3500 non-null float64
total day charge                 3500 non-null float64
total eve minutes                3500 non-null float64
total eve calls                  3500 non-null float64
total eve charge                 3500 non-null float64
total night minutes              3500 non-null float64
total night calls                3500 non-null float64
total night charge               3500 non-null float64
total intl min

In [4]:
# Stage applied to the whole frame before the per-type branches: keep the
# numerical + categorical feature columns, run the two plan columns through
# Mapper with MAP, then run ConvertDtypes over both feature groups.
# NOTE(review): the exact behaviour of these project transformers is defined in
# classification_model.preprocessing — confirm there.
global_transformation = Pipeline(steps=[
    ('selector', ColumnSelector(columns=NUMERICAL_FEATURES + CATEGORICAL_FEATURES)),
    ('map_1', Mapper(column='international plan', map=MAP)),
    ('map_2', Mapper(column='voice mail plan', map=MAP)),
    ('convert_dtypes', ConvertDtypes(numerical=NUMERICAL_FEATURES,
                                     categorical=CATEGORICAL_FEATURES)),
])

# Numerical branch: select the numerical columns, standardise them, and pass
# the result through GetDataFrame with the numerical column names.
numerical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=NUMERICAL_FEATURES)),
    ('standar', StandardScaler()),
    ('dataframe', GetDataFrame(columns=NUMERICAL_FEATURES)),
])

# Categorical branch: select the categorical columns and apply GetDummies
# (presumably one-hot encoding, given the 'ohe' step name — verify).
categorical_transformations = Pipeline(steps=[
    ('selector', ColumnSelector(columns=CATEGORICAL_FEATURES)),
    ('ohe', GetDummies(columns=CATEGORICAL_FEATURES)),
])

In [5]:
# Full preprocessing pipeline: the global stage feeds a FeatureUnion that runs
# the numerical and categorical branches side by side; the combined output is
# then passed through GetDataFrame with the FEATURES column names.
preprocessing = Pipeline(steps=[
    ('global', global_transformation),
    ('features', FeatureUnion(transformer_list=[
        ('numeric', numerical_transformations),
        ('category', categorical_transformations),
    ])),
    ('dataframe', GetDataFrame(columns=FEATURES)),
])

## Construcción de los modelos

In [6]:
# Separate the target from the features WITHOUT mutating `data`: the previous
# `data.pop(TARGET)` removed the column in place, so re-running this cell
# raised a KeyError. `label` is kept as an alias of `y`.
X = data.drop(columns=TARGET)
y = label = data[TARGET]

# 80/20 split, stratified on the target so both parts keep the class balance;
# the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Report (rows, columns) of each split.
print('El conjunto de entranamiento tiene {} observaciones, y {} variables'.format(*X_train.shape))
print('El conjunto de validación tiene {} observaciones, y {} variables'.format(*X_valid.shape))

El conjunto de entranamiento tiene 2800 observaciones, y 20 variables
El conjunto de validación tiene 700 observaciones, y 20 variables


### Regresión Logística

In [7]:
# Hyperparameters of the logistic-regression model; both are logged to MLflow.
params = {'penalty': 'l2', 'random_state': 42}

with mlflow.start_run():
    # Build and train preprocessing + estimator as one pipeline, timing the step.
    # NOTE: `preprocessing` is the shared pipeline object defined earlier, so
    # fitting here refits that same object.
    start_fit = time.time()
    lr = Pipeline(steps=[('preprocessing', preprocessing),
                         ('estimator', LogisticRegression(**params))])
    lr.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Time inference on the validation split.
    start_predict = time.time()
    y_pred = lr.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics.
    accuracy = lr.score(X_valid, y_valid)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyperparameters of this run.
    for param_name in ('random_state', 'penalty'):
        mlflow.log_param(param_name, params[param_name])

    # Record quality metrics and wall-clock durations (seconds).
    for metric_name, metric_value in (('accuracy', accuracy),
                                      ('precision', precision),
                                      ('recall', recall),
                                      ('f1_score', f1),
                                      ('duration_training', end_fit),
                                      ('duration_prediction', end_predict)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(lr, 'logistic_regression')

# Print the metric summary and display the confusion matrix as the cell output.
metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.5413536361955663
La exactitud es: 0.8571428571428571
La precisión es: 0.47619047619047616
El recall es: 0.10101010101010101
El puntaje F1 es: 0.16666666666666666 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,590,11
1,89,10


### Random Forest

In [8]:
# Hyperparameters of the random-forest model; all four are logged to MLflow.
params = {'n_estimators': 100, 'max_depth': 5, 'oob_score': True, 'random_state': 42}

with mlflow.start_run():
    # Build and train preprocessing + estimator as one pipeline, timing the step.
    # NOTE: `preprocessing` is the shared pipeline object defined earlier, so
    # fitting here refits that same object.
    start_fit = time.time()
    rf = Pipeline(steps=[('preprocessing', preprocessing),
                         ('estimator', RandomForestClassifier(**params))])
    rf.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Time inference on the validation split.
    start_predict = time.time()
    y_pred = rf.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics.
    accuracy = rf.score(X_valid, y_valid)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyperparameters of this run.
    for param_name in ('n_estimators', 'max_depth', 'random_state', 'oob_score'):
        mlflow.log_param(param_name, params[param_name])

    # Record quality metrics and wall-clock durations (seconds).
    for metric_name, metric_value in (('accuracy', accuracy),
                                      ('precision', precision),
                                      ('recall', recall),
                                      ('f1_score', f1),
                                      ('duration_training', end_fit),
                                      ('duration_prediction', end_predict)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(rf, 'random_forest')

# Print the metric summary and display the confusion matrix as the cell output.
metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.5791441872972655
La exactitud es: 0.8785714285714286
La precisión es: 0.8888888888888888
El recall es: 0.16161616161616163
El puntaje F1 es: 0.27350427350427353 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,599,2
1,83,16


### Gradient Boosting

In [9]:
# Hyperparameters of the gradient-boosting model; all five are logged to MLflow.
params = {'n_estimators': 100, 'learning_rate': 0.09, 'max_depth': 5,
          'subsample': 0.7, 'random_state': 42}

with mlflow.start_run():
    # Build and train preprocessing + estimator as one pipeline, timing the step.
    # NOTE: `preprocessing` is the shared pipeline object defined earlier, so
    # fitting here refits that same object.
    start_fit = time.time()
    gb = Pipeline(steps=[('preprocessing', preprocessing),
                         ('estimator', GradientBoostingClassifier(**params))])
    gb.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Time inference on the validation split.
    start_predict = time.time()
    y_pred = gb.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics.
    accuracy = gb.score(X_valid, y_valid)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyperparameters of this run.
    for param_name in ('n_estimators', 'learning_rate', 'max_depth',
                       'subsample', 'random_state'):
        mlflow.log_param(param_name, params[param_name])

    # Record quality metrics and wall-clock durations (seconds).
    for metric_name, metric_value in (('accuracy', accuracy),
                                      ('precision', precision),
                                      ('recall', recall),
                                      ('f1_score', f1),
                                      ('duration_training', end_fit),
                                      ('duration_prediction', end_predict)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(gb, 'gradient_boosting')

# Print the metric summary and display the confusion matrix as the cell output.
metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7543067950721861
La exactitud es: 0.9185714285714286
La precisión es: 0.8387096774193549
El recall es: 0.5252525252525253
El puntaje F1 es: 0.6459627329192548 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,591,10
1,47,52


### XGBoost

In [10]:
# Hyperparameters of the XGBoost model; all five are logged to MLflow.
params = {'n_estimators': 100, 'max_depth': 5, 'subsample': 0.7,
          'learning_rate': 0.09, 'random_state': 42}

with mlflow.start_run():
    # Build and train preprocessing + estimator as one pipeline, timing the step.
    # NOTE: `preprocessing` is the shared pipeline object defined earlier, so
    # fitting here refits that same object.
    start_fit = time.time()
    xgb = Pipeline(steps=[('preprocessing', preprocessing),
                          ('estimator', XGBClassifier(**params))])
    xgb.fit(X_train, y_train)
    end_fit = time.time() - start_fit

    # Time inference on the validation split.
    start_predict = time.time()
    y_pred = xgb.predict(X_valid)
    end_predict = time.time() - start_predict

    # Validation metrics.
    accuracy = xgb.score(X_valid, y_valid)
    precision = precision_score(y_true=y_valid, y_pred=y_pred)
    recall = recall_score(y_true=y_valid, y_pred=y_pred)
    f1 = f1_score(y_true=y_valid, y_pred=y_pred)

    # Record the hyperparameters of this run.
    for param_name in ('n_estimators', 'learning_rate', 'max_depth',
                       'subsample', 'random_state'):
        mlflow.log_param(param_name, params[param_name])

    # Record quality metrics and wall-clock durations (seconds).
    for metric_name, metric_value in (('accuracy', accuracy),
                                      ('precision', precision),
                                      ('recall', recall),
                                      ('f1_score', f1),
                                      ('duration_training', end_fit),
                                      ('duration_prediction', end_predict)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(xgb, 'xgboost')

# Print the metric summary and display the confusion matrix as the cell output.
metrics_summary(y_valid, y_pred)
confusion_matrix(y_valid, y_pred)

El área bajo la curva ROC es: 0.7660716986840115
La exactitud es: 0.9242857142857143
La precisión es: 0.8709677419354839
El recall es: 0.5454545454545454
El puntaje F1 es: 0.670807453416149 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,593,8
1,45,54
