In [52]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

In [53]:
df = pd.read_csv("./WA_Fn-UseC_-Telco-Customer-Churn.csv")
df_cleaned = df[~df['TotalCharges'].isin([' ', '', '  '])].drop(columns='customerID', axis=1)
df_cleaned['TotalCharges'] = df_cleaned['TotalCharges'].astype(float)
df_cleaned.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [54]:
label_ = LabelEncoder()

df1 = df_cleaned.copy(deep = True)
# to select the columns which are not nummerical
text_data_features = [i for i in list(df_cleaned.columns) if i not in list(df_cleaned.describe().columns)]
for i in text_data_features :
    df1[i] = label_.fit_transform(df1[i])
    print(i,' : ', df1[i].unique(),' = ', label_.inverse_transform(df1[i].unique()))

gender  :  [0 1]  =  ['Female' 'Male']
Partner  :  [1 0]  =  ['Yes' 'No']
Dependents  :  [0 1]  =  ['No' 'Yes']
PhoneService  :  [0 1]  =  ['No' 'Yes']
MultipleLines  :  [1 0 2]  =  ['No phone service' 'No' 'Yes']
InternetService  :  [0 1 2]  =  ['DSL' 'Fiber optic' 'No']
OnlineSecurity  :  [0 2 1]  =  ['No' 'Yes' 'No internet service']
OnlineBackup  :  [2 0 1]  =  ['Yes' 'No' 'No internet service']
DeviceProtection  :  [0 2 1]  =  ['No' 'Yes' 'No internet service']
TechSupport  :  [0 2 1]  =  ['No' 'Yes' 'No internet service']
StreamingTV  :  [0 2 1]  =  ['No' 'Yes' 'No internet service']
StreamingMovies  :  [0 2 1]  =  ['No' 'Yes' 'No internet service']
Contract  :  [0 1 2]  =  ['Month-to-month' 'One year' 'Two year']
PaperlessBilling  :  [1 0]  =  ['Yes' 'No']
PaymentMethod  :  [2 3 0 1]  =  ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn  :  [0 1]  =  ['No' 'Yes']


In [55]:
mms = MinMaxScaler() # Normalization
# ss = StandardScaler() # Standardization
df1['tenure'] = mms.fit_transform(df1[['tenure']])
df1['MonthlyCharges'] = mms.fit_transform(df1[['MonthlyCharges']])
df1['TotalCharges'] = mms.fit_transform(df1[['TotalCharges']])
df1.drop(columns = ['PhoneService', 'gender','StreamingTV','StreamingMovies','MultipleLines','InternetService'],inplace = True)
df1.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0.0,0,2,0,0,0,1,2,0.115423,0.001275,0
1,0,0,0,0.464789,2,0,2,0,1,0,3,0.385075,0.215867,0
2,0,0,0,0.014085,2,2,0,0,0,1,3,0.354229,0.01031,1
3,0,0,0,0.619718,2,0,2,2,1,0,0,0.239303,0.210241,0
4,0,0,0,0.014085,0,0,0,0,0,1,2,0.521891,0.01533,1


In [56]:
SMOTE_ = SMOTE(sampling_strategy = 1, random_state=42)

X = df1.iloc[:, :-1].values
Y = df1['Churn'].values
X, Y = SMOTE_.fit_resample(X, Y)
Counter(Y)

Counter({0: 5163, 1: 5163})

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [60]:
# Random Forest 
rfc = RandomForestClassifier(n_estimators=700,
        min_samples_split=2,
        min_samples_leaf=2,
        max_features='log2',
        max_depth=11,
        bootstrap=True,
        class_weight='balanced')

rfc.fit(X_train, Y_train)

# Confusion Matrix
Y_pred = rfc.predict(X_test)
print(f"Confusion matrix: \n{confusion_matrix(Y_test, Y_pred)}")
print(f"Classification Report: \n{classification_report(Y_test, Y_pred)}")
print(f"Accuracy: \n{accuracy_score(Y_test, Y_pred)}")

Confusion matrix: 
[[814 223]
 [147 882]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.85      0.78      0.81      1037
           1       0.80      0.86      0.83      1029

    accuracy                           0.82      2066
   macro avg       0.82      0.82      0.82      2066
weighted avg       0.82      0.82      0.82      2066

Accuracy: 
0.8209099709583737


In [62]:
gb = GradientBoostingClassifier(subsample=0.8,
        n_estimators=300, 
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='log2',
        max_depth=7,
        learning_rate=0.5, 
        random_state=42)
gb.fit(X_train, Y_train)
gb_pred = gb.predict(X_test)
print(f"Confusion matrix: \n{confusion_matrix(Y_test, gb_pred)}")
print(f"Classification Report: \n{classification_report(Y_test, gb_pred)}")
print(f"Accuracy: \n{accuracy_score(Y_test, gb_pred)}")

Confusion matrix: 
[[864 173]
 [185 844]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1037
           1       0.83      0.82      0.83      1029

    accuracy                           0.83      2066
   macro avg       0.83      0.83      0.83      2066
weighted avg       0.83      0.83      0.83      2066

Accuracy: 
0.8267182962245886


In [59]:
#  stacking

base_models = [
    ('rf',  RandomForestClassifier(
        n_estimators=700,
        min_samples_split=2,
        min_samples_leaf=2,
        max_features='log2',
        max_depth=11,
        bootstrap=True,
        class_weight='balanced')),

    ('gb', GradientBoostingClassifier(
        subsample=0.8,
        n_estimators=300, 
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='log2',
        max_depth=7,
        learning_rate=0.5, 
        random_state=42))
]

meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)


stacking_clf.fit(X_train, Y_train)
Y_pred_3 = stacking_clf.predict(X_test)
print(f"Confusion matrix: \n{confusion_matrix(Y_test, Y_pred_3)}")
print(f"Classification Report: \n{classification_report(Y_test, Y_pred_3)}")
print(f"Accuracy: \n{accuracy_score(Y_test, Y_pred_3)}")

Confusion matrix: 
[[838 199]
 [151 878]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      1037
           1       0.82      0.85      0.83      1029

    accuracy                           0.83      2066
   macro avg       0.83      0.83      0.83      2066
weighted avg       0.83      0.83      0.83      2066

Accuracy: 
0.8305905130687319


In [66]:
pd.Series(Y_test).value_counts()

0    1037
1    1029
Name: count, dtype: int64