In [130]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, f1_score
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from sklearn.linear_model import LogisticRegression

In [131]:
pd.set_option("display.max_columns", 25)

In [132]:
data = pd.read_csv("../data/input_data/telco_customer_churn_1.csv")

In [133]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [134]:
data.isnull().count()

customerID          6000
gender              6000
SeniorCitizen       6000
Partner             6000
Dependents          6000
tenure              6000
PhoneService        6000
MultipleLines       6000
InternetService     6000
OnlineSecurity      6000
OnlineBackup        6000
DeviceProtection    6000
TechSupport         6000
StreamingTV         6000
StreamingMovies     6000
Contract            6000
PaperlessBilling    6000
PaymentMethod       6000
MonthlyCharges      6000
TotalCharges        6000
Churn               6000
dtype: int64

In [135]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        6000 non-null   object 
 1   gender            6000 non-null   object 
 2   SeniorCitizen     6000 non-null   int64  
 3   Partner           6000 non-null   object 
 4   Dependents        6000 non-null   object 
 5   tenure            6000 non-null   int64  
 6   PhoneService      6000 non-null   object 
 7   MultipleLines     6000 non-null   object 
 8   InternetService   6000 non-null   object 
 9   OnlineSecurity    6000 non-null   object 
 10  OnlineBackup      6000 non-null   object 
 11  DeviceProtection  6000 non-null   object 
 12  TechSupport       6000 non-null   object 
 13  StreamingTV       6000 non-null   object 
 14  StreamingMovies   6000 non-null   object 
 15  Contract          6000 non-null   object 
 16  PaperlessBilling  6000 non-null   object 


In [136]:
data['TotalCharges'].isnull().values.any()

False

In [137]:
nan_rows = data[data['TotalCharges'].isnull()]

In [138]:
data.isnull().T.any().T.sum()

0

In [139]:
data.isnull().any().any()

False

In [140]:
data.isna().values.any()

False

## Train - Test Split.

In [141]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['customerID', 'Churn'], axis=1),
                                data['Churn'], test_size=0.2, random_state=0)

## Categorical and Numerical Features

In [142]:
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'object']
num_vars = [var for var in X_train.columns if var not in cat_vars]

In [143]:
cat_vars

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges']

In [144]:
num_vars

['SeniorCitizen', 'tenure', 'MonthlyCharges']

## Categorical features

In [145]:
cat_vars_onehot = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
cat_vars_ordinal_arbitrary = ['MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod']

In [146]:
X_train[cat_vars_onehot]

Unnamed: 0,gender,Partner,Dependents,PhoneService,PaperlessBilling
3381,Female,No,No,Yes,Yes
31,Male,Yes,No,Yes,Yes
1596,Female,Yes,No,Yes,Yes
1386,Male,No,No,Yes,No
4237,Male,No,No,Yes,Yes
...,...,...,...,...,...
4931,Male,Yes,No,Yes,Yes
3264,Female,No,Yes,Yes,Yes
1653,Male,Yes,No,Yes,No
2607,Male,No,No,Yes,Yes


In [147]:
ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)
ordinal_encoder_arbitrary.fit(X_train, y_train)

onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)
onehot_encoder.fit(X_train)

In [148]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
3381,Female,0,No,No,41,Yes,No,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Bank transfer (automatic),79.85,3320.75
31,Male,1,Yes,No,2,Yes,No,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),95.50,181.65
1596,Female,0,Yes,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.35,120.25
1386,Male,0,No,No,64,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),86.55,5632.55
4237,Male,0,No,No,5,Yes,No,DSL,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,70.05,346.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,Male,0,Yes,No,15,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,103.45,1539.8
3264,Female,0,No,Yes,10,Yes,No,Fiber optic,No,No,Yes,Yes,Yes,No,Month-to-month,Yes,Electronic check,91.10,964.35
1653,Male,0,Yes,No,58,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.75,1185.95
2607,Male,1,No,No,1,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.75,69.75


In [149]:
X_train = ordinal_encoder_arbitrary.transform(X_train)
X_test =  ordinal_encoder_arbitrary.transform(X_test)

In [150]:
X_train = onehot_encoder.transform(X_train)
X_test  = onehot_encoder.transform(X_test)

In [151]:
X_train

Unnamed: 0,SeniorCitizen,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaymentMethod,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_Yes,PhoneService_No,PaperlessBilling_Yes,PaperlessBilling_No
3381,0,41,0,0,0,0,0,0,0,0,0,0,79.85,3320.75,1,0,1,0,1,0,1,0,1,0
31,1,2,0,1,1,0,0,1,0,0,1,1,95.50,181.65,0,1,0,1,1,0,1,0,1,0
1596,0,2,0,1,1,0,1,1,1,1,1,2,70.35,120.25,1,0,0,1,1,0,1,0,1,0
1386,0,64,1,0,1,1,0,0,0,0,2,1,86.55,5632.55,0,1,1,0,1,0,1,0,0,1
4237,0,5,0,0,1,0,0,1,0,0,1,2,70.05,346.4,0,1,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,0,15,1,1,1,0,0,0,0,0,1,2,103.45,1539.8,0,1,0,1,1,0,1,0,1,0
3264,0,10,0,1,1,0,0,0,0,1,1,2,91.10,964.35,1,0,1,0,0,1,1,0,1,0
1653,0,58,0,2,2,2,2,2,2,2,2,3,20.75,1185.95,0,1,0,1,1,0,1,0,0,1
2607,1,1,0,1,1,0,1,1,1,1,1,2,69.75,69.75,0,1,1,0,1,0,1,0,1,0


## Numerical Features

In [152]:
X_train["TotalCharges"].dtypes

dtype('O')

In [153]:
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'], errors='coerce')

In [154]:
X_train.dtypes

SeniorCitizen             int64
tenure                    int64
MultipleLines             int64
InternetService           int64
OnlineSecurity            int64
OnlineBackup              int64
DeviceProtection          int64
TechSupport               int64
StreamingTV               int64
StreamingMovies           int64
Contract                  int64
PaymentMethod             int64
MonthlyCharges          float64
TotalCharges            float64
gender_Female             int64
gender_Male               int64
Partner_No                int64
Partner_Yes               int64
Dependents_No             int64
Dependents_Yes            int64
PhoneService_Yes          int64
PhoneService_No           int64
PaperlessBilling_Yes      int64
PaperlessBilling_No       int64
dtype: object

In [155]:
X_train['TotalCharges'] = X_train['TotalCharges'].replace(np.nan, 0)

In [156]:
yeo_transformer = YeoJohnsonTransformer(variables=['TotalCharges'])

X_train = yeo_transformer.fit_transform(X_train)

In [157]:
X_test['TotalCharges'] = pd.to_numeric(X_test['TotalCharges'], errors='coerce')

In [158]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200 entries, 4320 to 4754
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SeniorCitizen         1200 non-null   int64  
 1   tenure                1200 non-null   int64  
 2   MultipleLines         1200 non-null   int64  
 3   InternetService       1200 non-null   int64  
 4   OnlineSecurity        1200 non-null   int64  
 5   OnlineBackup          1200 non-null   int64  
 6   DeviceProtection      1200 non-null   int64  
 7   TechSupport           1200 non-null   int64  
 8   StreamingTV           1200 non-null   int64  
 9   StreamingMovies       1200 non-null   int64  
 10  Contract              1200 non-null   int64  
 11  PaymentMethod         1200 non-null   int64  
 12  MonthlyCharges        1200 non-null   float64
 13  TotalCharges          1199 non-null   float64
 14  gender_Female         1200 non-null   int64  
 15  gender_Male           1

In [159]:
X_test['TotalCharges'] = X_test['TotalCharges'].replace(np.nan, 0)

In [160]:
X_test = yeo_transformer.transform(X_test)

In [161]:
y_train

3381     No
31       No
1596     No
1386     No
4237    Yes
       ... 
4931     No
3264     No
1653     No
2607    Yes
2732     No
Name: Churn, Length: 4800, dtype: object

In [162]:
le = LabelEncoder()
le.fit(y_train)

In [163]:
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [164]:
y_train

array([0, 0, 0, ..., 0, 1, 0])

In [165]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)

In [166]:
X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_test.columns)

In [167]:
param_C = 0.8
param_max_iter = 100
clf = LogisticRegression(C=param_C, max_iter=param_max_iter, random_state=0)

In [168]:
# train on all set and evaluate on test
clf.fit(X_train, y_train)

In [169]:
y_test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

In [170]:
test_accuracy

0.7908333333333334

In [171]:
test_f1

0.5557522123893806

In [172]:
cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
cv_f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro').mean()

In [173]:
print(f"CV accuracy: {cv_accuracy:.2f}, Test accuracy: {test_accuracy:.2f}\n"
      f"CV f1: {cv_f1:.2f}, Test f1: {test_f1:.2f}")

CV accuracy: 0.80, Test accuracy: 0.79
CV f1: 0.72, Test f1: 0.56


In [174]:
y_test_pred

array([0, 0, 1, ..., 0, 0, 0])

In [175]:
y_test

array([0, 1, 0, ..., 0, 1, 0])

In [176]:
clf.get_params()

{'C': 0.8,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [179]:
import pickle
filename = "../models/LogisticRegression1.pkl"
pickle.dump(clf, open(filename,'wb'))

In [180]:
import mlflow

ModuleNotFoundError: No module named 'mlflow'