In [9]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from utilities.evaluation import ModelEvaluation
from sklearn.tree import DecisionTreeClassifier
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Load the bank churn dataset; the first CSV column becomes the row index.
churn = pd.read_csv('https://raw.githubusercontent.com/stivenlopezg/Diplomado-Python/master/data/Churn_Modelling.csv',
                    index_col=0)

# Recode the two 0/1 flag columns as 'Yes'/'No' strings, which turns them into
# object-dtype columns instead of integers.
for column in ['HasCrCard', 'IsActiveMember']:
    churn[column] = churn[column].map({1: 'Yes', 0: 'No'})

# Seeded preview: without random_state the displayed rows change on every run,
# so the saved output could never be reproduced with Restart & Run All.
churn.sample(n=4, random_state=42)

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8660,15770631,Sutherland,730,Spain,Male,25,5,167385.81,1,Yes,Yes,56307.51,0
4403,15720637,Bell,710,Germany,Female,46,10,120530.34,1,Yes,No,166586.99,1
7005,15571121,Kodilinyechukwu,670,France,Female,50,8,138340.06,1,No,Yes,3159.15,0
5945,15715709,Shih,696,Germany,Male,43,4,114091.38,1,No,Yes,159888.1,0


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of `churn`
# to check for missing values and to see which columns are numeric vs. object.
churn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  object 
 10  IsActiveMember   10000 non-null  object 
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 1.1+ MB


In [5]:
# Relative frequency of each value of the target column `Exited`
# (normalize=True returns proportions instead of raw counts).
churn['Exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [6]:
# Separate the target from the features. `pop` removes the column from `churn`
# in place, so an unguarded call fails with KeyError when this cell is executed
# a second time. With the guard, a re-run simply keeps the `exited` series that
# was extracted the first time.
if 'Exited' in churn.columns:
    exited = churn.pop('Exited')

# Hold out 30% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
train_data, test_data, train_label, test_label = train_test_split(churn, exited,
                                                                  test_size=0.3,
                                                                  stratify=exited, random_state=42)

print(f'El set de entrenamiento tiene {train_data.shape[0]} observaciones, y {train_data.shape[1]} variables.')
print(f'El set de prueba tiene {test_data.shape[0]} observaciones, y {test_data.shape[1]} variables.')

El set de entrenamiento tiene 7000 observaciones, y 12 variables.
El set de prueba tiene 3000 observaciones, y 12 variables.


In [7]:
# Names of the numeric predictor columns.
numerical_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

# Names of the categorical (string-valued) predictor columns.
categorical_features = [
    'Gender',
    'Geography',
    'HasCrCard',
    'IsActiveMember',
]

In [10]:
# Preprocessing that standardises the numeric columns and drops every other
# column of the input frame.
numeric_preprocessing = ColumnTransformer(
    transformers=[('numeric', StandardScaler(), numerical_features)],
    remainder='drop',
)

# The transformer is fitted on the training partition only; the test partition
# is transformed with the already-fitted transformer.
X_train = numeric_preprocessing.fit_transform(train_data)
X_test = numeric_preprocessing.transform(test_data)

# Show the first transformed row of each partition, one per line.
print(X_train[0], X_test[0], sep='\n')

[-0.57558225 -1.79667873 -0.34947397  0.31290573 -0.91248301  1.36251232]
[ 1.44446154 -0.18899111  0.34216239 -1.22357411  0.79949262  0.34845313]


In [11]:
# Logistic regression evaluated on the current X_train / X_test matrices.
lr = LogisticRegression(random_state=42)

# mlxtend's bias_variance_decomp returns the average expected loss, average
# bias and average variance of the estimator on the test set, here for the
# 0-1 loss. random_seed makes the result reproducible.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(estimator=lr,
                                                            X_train=X_train,
                                                            y_train=train_label.values,
                                                            X_test=X_test,
                                                            y_test=test_label.values,
                                                            loss='0-1_loss',
                                                            random_seed=123)

print(f'El promedio de la perdida esperada es: {avg_expected_loss}')
print(f'El promedio del sesgo: {avg_bias}')
# Message typo fixed: it used to read 'E promedio de la varianza'.
print(f'El promedio de la varianza: {avg_var}')

El promedio de la perdida esperada es: 0.21257666666666666
El promedio del sesgo: 0.213
E promedio de la varianza: 0.00556


In [12]:
# Fit the logistic regression on the training matrix and predict class labels
# for the test matrix.
lr.fit(X_train, train_label)
predicted_labels = lr.predict(X_test)

# NOTE(review): the evaluator receives hard class labels (lr.predict), not
# probabilities. If ModelEvaluation derives the AUC from `predicted`, that AUC
# is based on labels rather than scores -- confirm against the helper.
evaluation_lr = ModelEvaluation(observed=test_label, predicted=predicted_labels)
evaluation_lr.calculate_metrics()
evaluation_lr.print_metrics()
evaluation_lr.confusion_matrix(normalize=False)

El AUC es: 0.51
El accuracy es: 0.79
La precision es: 0.71
El recall es: 0.79
El F1 Score es: 0.72 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2330,59
1,581,30


In [13]:
# Full preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones; every other column is dropped.
#
# OneHotEncoder's `sparse=False` argument was deprecated in scikit-learn 1.2
# (renamed to `sparse_output`) and removed in 1.4, where it raises TypeError.
# To stay compatible with both old and new releases the encoder keeps its
# default output and the ColumnTransformer is told to always return a dense
# array via sparse_threshold=0.
preprocessing = ColumnTransformer(transformers=[
    ('numeric', StandardScaler(), numerical_features),
    ('categoric', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='drop',
    sparse_threshold=0)

# Fit on the training partition only, then apply the fitted transformer to the
# test partition.
X_train = preprocessing.fit_transform(train_data)
X_test = preprocessing.transform(test_data)

print(X_train[0])
print(X_test[0])

[-0.57558225 -1.79667873 -0.34947397  0.31290573 -0.91248301  1.36251232
  0.          1.          1.          0.          0.          0.
  1.          1.          0.        ]
[ 1.44446154 -0.18899111  0.34216239 -1.22357411  0.79949262  0.34845313
  0.          1.          0.          0.          1.          0.
  1.          0.          1.        ]


In [14]:
# Fresh logistic regression evaluated on the current X_train / X_test matrices.
lr = LogisticRegression(random_state=42)

# mlxtend's bias_variance_decomp returns the average expected loss, average
# bias and average variance of the estimator on the test set, here for the
# 0-1 loss. random_seed makes the result reproducible.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(estimator=lr,
                                                            X_train=X_train,
                                                            y_train=train_label.values,
                                                            X_test=X_test,
                                                            y_test=test_label.values,
                                                            loss='0-1_loss',
                                                            random_seed=123)

print(f'El promedio de la perdida esperada es: {avg_expected_loss}')
print(f'El promedio del sesgo: {avg_bias}')
# Message typo fixed: it used to read 'E promedio de la varianza'.
print(f'El promedio de la varianza: {avg_var}')

El promedio de la perdida esperada es: 0.18679833333333334
El promedio del sesgo: 0.187
E promedio de la varianza: 0.010598333333333335


In [15]:
# Fit the logistic regression on the training matrix and predict class labels
# for the test matrix.
lr.fit(X_train, train_label)
predicted_labels = lr.predict(X_test)

# NOTE(review): the evaluator receives hard class labels (lr.predict), not
# probabilities. If ModelEvaluation derives the AUC from `predicted`, that AUC
# is based on labels rather than scores -- confirm against the helper.
evaluation_lr = ModelEvaluation(observed=test_label, predicted=predicted_labels)
evaluation_lr.calculate_metrics()
evaluation_lr.print_metrics()
evaluation_lr.confusion_matrix(normalize=False)

El AUC es: 0.58
El accuracy es: 0.81
La precision es: 0.79
El recall es: 0.81
El F1 Score es: 0.77 



Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2318,71
1,491,120
