In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# Import data

In [19]:
df = pd.read_csv('files_for_lab/customer_churn.csv')

In [20]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


# Data cleaning

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [22]:
columns = df.columns

In [23]:
for column in columns:
    print((column).upper())
    print(f'{df[column].value_counts()} \n')

CUSTOMERID
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: customerID, Length: 7043, dtype: int64 

GENDER
Male      3555
Female    3488
Name: gender, dtype: int64 

SENIORCITIZEN
0    5901
1    1142
Name: SeniorCitizen, dtype: int64 

PARTNER
No     3641
Yes    3402
Name: Partner, dtype: int64 

DEPENDENTS
No     4933
Yes    2110
Name: Dependents, dtype: int64 

TENURE
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64 

PHONESERVICE
Yes    6361
No      682
Name: PhoneService, dtype: int64 

MULTIPLELINES
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64 

INTERNETSERVICE
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64 

ONLINESECURITY
No              

The data is mainly made out of categorical values, but easy to encode.  
We can drop the costumerid column, as it duplicates the indexing of each variable.

In [25]:
df = df.drop('customerID', axis=1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


# X-y split

In [73]:
# We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.

X = df[['SeniorCitizen', 'MonthlyCharges', 'tenure']]
y = df['Churn']

In [46]:
X.head(5)

Unnamed: 0,SeniorCitizen,MonthlyCharges,tenure
0,0,29.85,1
1,0,56.95,34
2,0,53.85,2
3,0,42.3,45
4,0,70.7,2


In [74]:
y.head(5)

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [76]:
y.shape

(7043,)

In [77]:
y.value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

The target variable is clearly imbalanced. The model will learn better ways to classify the Nos than the Yess, unless the data is over or downsampled.

# Train-test split

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [79]:
y_train.value_counts(normalize=True)

No     0.733582
Yes    0.266418
Name: Churn, dtype: float64

We will have to find ways to balance the train set.

In [80]:
y_test.value_counts(normalize=True)

No     0.738822
Yes    0.261178
Name: Churn, dtype: float64

The imbalance of the data is preserved in the test set, which is desirable.

# Scaling of data

In [81]:
scaler = MinMaxScaler()

In [82]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a function for the LogReg

In [83]:
def logistic_regression_report(variable_data, target_data):
    '''Allows to run the Logistic Regression model and Classification Report for a given variable data and target data'''
    model = LogisticRegression()
    model.fit(variable_data, target_data)
    pred_train = model.predict(variable_data)
    pred_test = model.predict(X_test)
    print('TRAIN SET\nclassification report\n', classification_report(target_data, pred_train))
    print('TEST SET\nclassification report\n', classification_report(y_test, pred_test))


# Attempt 1: without balance

In [84]:
logistic_regression_report(X_train, y_train)

TRAIN SET
classification report
               precision    recall  f1-score   support

          No       0.82      0.91      0.87      4133
         Yes       0.66      0.46      0.54      1501

    accuracy                           0.79      5634
   macro avg       0.74      0.69      0.70      5634
weighted avg       0.78      0.79      0.78      5634

TEST SET
classification report
               precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



# Attempt 2: with SMOTE oversampling

In [94]:
sm = SMOTE(k_neighbors=3)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [87]:
logistic_regression_report(X_train_SMOTE, y_train_SMOTE)

TRAIN SET
classification report
               precision    recall  f1-score   support

          No       0.74      0.73      0.74      4133
         Yes       0.74      0.75      0.74      4133

    accuracy                           0.74      8266
   macro avg       0.74      0.74      0.74      8266
weighted avg       0.74      0.74      0.74      8266

TEST SET
classification report
               precision    recall  f1-score   support

          No       0.88      0.72      0.79      1041
         Yes       0.47      0.71      0.57       368

    accuracy                           0.72      1409
   macro avg       0.67      0.72      0.68      1409
weighted avg       0.77      0.72      0.73      1409



# Attempt 3: with TomeKLinks downsampling

In [106]:
tl = TomekLinks(sampling_strategy='all')
X_train_TL, y_train_TL = tl.fit_resample(X_train, y_train)

In [107]:
print('TRAIN')
print('X_train:', X_train.shape)
print('X_train_TL:',X_train_TL.shape)

TRAIN
X_train: (5634, 3)
X_train_TL: (4856, 3)


In [108]:
y_train_TL.value_counts(normalize=True)

No     0.771005
Yes    0.228995
Name: Churn, dtype: float64

In [92]:
logistic_regression_report(X_train_TL, y_train_TL)

TRAIN SET
classification report
               precision    recall  f1-score   support

          No       0.83      0.90      0.86      3744
         Yes       0.68      0.53      0.59      1501

    accuracy                           0.79      5245
   macro avg       0.75      0.71      0.73      5245
weighted avg       0.78      0.79      0.79      5245

TEST SET
classification report
               precision    recall  f1-score   support

          No       0.83      0.85      0.84      1041
         Yes       0.55      0.51      0.53       368

    accuracy                           0.76      1409
   macro avg       0.69      0.68      0.69      1409
weighted avg       0.76      0.76      0.76      1409



# Conclusions

The predictions done in the **baseline** model were predictable.  
- The accuracy is 79-78% in the train-test set, but its accuracy is clearly higher for the Yes than the No, because the data is imabalanced, with a 86-53% for the predictes Yes-No.  
Trying to balance the data has given some different results:  
- Using the **SMOTE** the model has lost accuracy and overfitted a bit more 74-72% in the train-test, but it has reduced the imbalance of the predicted values up to 79-57%.  
- Using the **TomekLinks**, the results are not as good as with the SMOTE. The overall accuracy is better with a 79-76% for the train-test, but was not successful to cope with the imbalance of the data for the predicted values.