In [None]:
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.svm import LinearSVC, SVC

In [None]:
class Config:
    """Notebook-wide settings read by the cells below."""

    # Seed passed to set_seed() and to the estimators' random_state.
    seed: int = 42
    # Fraction of rows that train_test_split assigns to the training set.
    train_size: float = 0.8


cfg = Config()

In [None]:
def set_seed(seed=42):
    """Seed the global random number generators used by this notebook.

    Parameters
    ----------
    seed : int, default 42
        Value handed to both ``random.seed`` and ``np.random.seed``.

    Returns
    -------
    None
    """
    # Only the stdlib and NumPy global RNGs are seeded here; no other
    # library (e.g. torch) is touched. Order: stdlib first, then NumPy.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the stdlib and NumPy RNGs once, up front, with the shared notebook seed.
set_seed(cfg.seed)

In [None]:
# Read the raw churn CSV from the notebook's working directory.
csv_path = "./WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# Print column names, non-null counts and dtypes for a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# customerID is dropped so it is not used as a feature.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned,
# re-running the cell would otherwise raise KeyError on the already-removed
# column.
df = df.drop(columns=["customerID"], errors="ignore")

In [None]:
# Split the rows into train/test frames; the training fraction and the
# random_state both come from cfg so the split is reproducible.
train_df, test_df = train_test_split(
    df,
    train_size=cfg.train_size,
    random_state=cfg.seed,
)

In [None]:
# Columns that are integer-encoded with LabelEncoder (including the target).
label_apply_on = ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]

# One fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its classes_ on every iteration, leaving only the last column's
# mapping available for inverse_transform or for encoding new data later.
label_encoders = {}

for col in label_apply_on:
    encoder = LabelEncoder()
    # Fit on the training split only, then apply the same mapping to test.
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])
    label_encoders[col] = encoder

# Backward compatibility: `label_encoder` ends up fitted on the last column
# ("Churn"), exactly as it did when one encoder was reused in the loop.
label_encoder = label_encoders[label_apply_on[-1]]

In [None]:
# Multi-category columns that are expanded into 0/1 indicator columns.
onehot_apply_on = ["MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaymentMethod", ]

train_df = pd.get_dummies(train_df, columns=onehot_apply_on, dtype=int)
test_df = pd.get_dummies(test_df, columns=onehot_apply_on, dtype=int)

# get_dummies derives the indicator columns from the categories present in
# each frame separately, so the two frames can end up with different column
# sets or orders. Align test to the training layout: indicators missing from
# test are added as all-zero columns, and indicators unseen in training are
# dropped because the models are never fitted on them.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [None]:
# TotalCharges holds numeric values but is read as object because some rows
# contain an empty string. Coerce it to numbers (unparseable entries become
# NaN), then replace the NaNs with the mean of the training split.

train_total = pd.to_numeric(train_df['TotalCharges'], errors='coerce')
test_total = pd.to_numeric(test_df['TotalCharges'], errors='coerce')

# The fill value is computed from the training rows only and reused for test.
total_charges_mean = train_total.mean()

train_df['TotalCharges'] = train_total.fillna(total_charges_mean)
test_df['TotalCharges'] = test_total.fillna(total_charges_mean)

In [None]:
# Rescale tenure with a MinMaxScaler fitted on the training split only;
# the test split reuses the fitted scaler.
tenure_cols = ["tenure"]
minmax = MinMaxScaler()

train_df[tenure_cols] = minmax.fit_transform(train_df[tenure_cols])
test_df[tenure_cols] = minmax.transform(test_df[tenure_cols])

In [None]:
# Apply a PowerTransformer (default settings) to the two charge columns.
# It is fitted on the training split and reused unchanged on the test split.
charge_cols = ["TotalCharges", "MonthlyCharges"]
yeo = PowerTransformer()

train_df[charge_cols] = yeo.fit_transform(train_df[charge_cols])
test_df[charge_cols] = yeo.transform(test_df[charge_cols])

In [None]:
# Separate the Churn target from the feature columns in both splits.
target_col = "Churn"

x_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
x_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [None]:
# Oversample the minority class of the training split only; the test split
# is left untouched. The seed comes from cfg (instead of a hard-coded 42)
# so it stays in sync with every other random_state in the notebook.
# NOTE: this cell overwrites x_train / y_train, so re-run the feature/target
# split cell before re-running this one.
smote = SMOTE(sampling_strategy='minority', random_state=cfg.seed)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [None]:
# Fit a random forest on the training data and score it on the held-out split.
random_forest = RandomForestClassifier(random_state=cfg.seed)
random_forest.fit(x_train, y_train)

random_forest_pred = random_forest.predict(x_test)

# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports predicted counts as "support".
print(classification_report(y_test, random_forest_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1019
           1       0.60      0.58      0.59       390

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.77      0.78      0.78      1409



In [None]:
# Fit gradient boosting on the training data and score it on the held-out split.
gradient_boosting = GradientBoostingClassifier(random_state=cfg.seed)
gradient_boosting.fit(x_train, y_train)

gradient_boosting_pred = gradient_boosting.predict(x_test)

# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports predicted counts as "support".
print(classification_report(y_test, gradient_boosting_pred))

              precision    recall  f1-score   support

           0       0.77      0.91      0.83       881
           1       0.78      0.55      0.65       528

    accuracy                           0.78      1409
   macro avg       0.78      0.73      0.74      1409
weighted avg       0.78      0.78      0.76      1409



In [None]:
# Fit an SVC on the training data. probability=True enables predict_proba,
# which the stacking cells call on this model.
svc = SVC(random_state=cfg.seed, probability=True)
svc.fit(x_train, y_train)

svc_pred = svc.predict(x_test)

# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports predicted counts as "support".
print(classification_report(y_test, svc_pred))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       920
           1       0.75      0.57      0.65       489

    accuracy                           0.79      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.78      0.79      0.78      1409



In [None]:
# Build the meta-model's training features: 5-fold cross-validated class
# probabilities from each base model, computed on the training data.
base_models = (random_forest, gradient_boosting, svc)

meta_input1, meta_input2, meta_input3 = (
    cross_val_predict(base_model, x_train, y_train, cv=5, method='predict_proba')
    for base_model in base_models
)

# Concatenate the per-model probability columns side by side.
stacked_features = np.hstack((meta_input1, meta_input2, meta_input3))

In [None]:
# Train the logistic-regression meta-model on the stacked cross-validated
# probabilities of the base models.
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_train)

# From here on, meta_input1..3 and stacked_features are rebuilt from the test
# split, using the base models already fitted on the full training data.
meta_input1 = random_forest.predict_proba(x_test)
meta_input2 = gradient_boosting.predict_proba(x_test)
meta_input3 = svc.predict_proba(x_test)

stacked_features = np.hstack((meta_input1, meta_input2, meta_input3))

meta_model_pred = meta_model.predict(stacked_features)

# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports predicted counts as "support".
print(classification_report(y_test, meta_model_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1043
           1       0.57      0.58      0.58       366

    accuracy                           0.78      1409
   macro avg       0.71      0.72      0.71      1409
weighted avg       0.78      0.78      0.78      1409



In [None]:
# Persist the three fitted base models, one file each.
base_model_files = {
    'random_forest.pkl': random_forest,
    'gradient_boosting.pkl': gradient_boosting,
    'svc.pkl': svc,
}
for model_filename, fitted_model in base_model_files.items():
    joblib.dump(fitted_model, model_filename)

# Persist the meta-model last; as the final expression, its return value is
# what the cell displays.
joblib.dump(meta_model, 'meta_model_logreg.pkl')

['meta_model_logreg.pkl']

In [None]:
# Bundle both fitted transformers into one file, keyed by name.
fitted_scalers = {"minmax": minmax, "yeo": yeo}
joblib.dump(fitted_scalers, "scaler.pkl")

['scaler.pkl']