## Requirements
Pandas

Numpy

ML Models (ensemble models used: CatBoost, XGBoost, RandomForestClassifier, VotingClassifier)

scikit-learn


In [None]:
!pip install catboost



## Loading packages/libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

## Loading Data

Public Dataset - Telco Customer Churn

In [None]:
# Read the Telco Customer Churn CSV (expected in the working directory) and display it
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Understanding Data

In [None]:
# Count the missing (NaN/None) entries in every column
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
# Inspect the dtype pandas inferred for each column, to spot features read with the wrong type
df.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [None]:
# TotalCharges must be numeric, but blank entries in the column stop it from being parsed as such.
# Treat every empty / whitespace-only entry (not just a single space) as missing,
# then convert the column to a numeric dtype.
# Safe to re-run: an already-numeric column passes through unchanged.
df['TotalCharges'] = pd.to_numeric(
    df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
)

In [None]:
# Re-count missing values per column now that blank TotalCharges entries have become NaN
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


## Address Missing Values

In [None]:
# Fill missing TotalCharges with 0 for customers whose tenure is 0,
# i.e. new customers who have not been billed yet.
not_yet_billed = df['tenure'].eq(0) & df['TotalCharges'].isna()
df.loc[not_yet_billed, 'TotalCharges'] = 0

In [None]:
# Count the missing values that remain in each column
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
# Drop customerID because it doesn't affect the result of the prediction.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run; the in-place version raises KeyError on a second execution
# because the column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

## Data Manipulation

In [None]:
# Encode the target and all categorical features as integers.

# Map the target only while it still holds the raw 'Yes'/'No' labels, so that
# re-running this cell does not turn an already-encoded column into NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Fit a separate LabelEncoder per column and keep each one in `encoders`, so the
# category <-> integer mapping of every feature can be inspected or inverted later.
# (A single shared encoder only remembers the last column it was fitted on.)
cat_features = df.select_dtypes(include='object').columns
encoders = {}
for col in cat_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Separate the feature matrix from the target vector
X = df.drop('Churn', axis=1)
y = df['Churn']

In [None]:
# Feature scaling is disabled: the code below is wrapped in a string literal, so it never executes.
# NOTE(review): StandardScaler is not imported in the imports cell; import it before re-enabling.
"""scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)"""

'scaler = StandardScaler()\nX_scaled = scaler.fit_transform(X)'

## Creation of Evaluation Sets

In [None]:
# Hold out 20% of the rows as the test set. The split is stratified on the target
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Loading ML Models and Applying GridSearch for Hyperparameter Tuning

In [None]:
# CatBoost: set up a grid search over the candidate hyperparameters below,
# scored by accuracy with 3-fold cross-validation
catboost = CatBoostClassifier(verbose=0, random_state=42)

cat_param_grid = dict(
    iterations=[100, 200, 500, 750],
    learning_rate=[0.01, 0.02, 0.03, 0.05],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
    border_count=[32, 64],
)

cat_grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=cat_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)




In [None]:
# Run the CatBoost grid search on the training split, keep the best estimator
# and report the winning hyperparameter combination
cat_grid_search.fit(X_train, y_train)
best_cat = cat_grid_search.best_estimator_
print("Best Hyperparameters:", cat_grid_search.best_params_)


Best Hyperparameters: {'border_count': 64, 'depth': 4, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.03}


In [None]:
# XGBoost: set up a grid search over the candidate hyperparameters below,
# scored by accuracy with 3-fold cross-validation.
# `use_label_encoder` is intentionally not passed: XGBoost reports it as an unused
# parameter and emits a warning on every fit. The target is already encoded as 0/1.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_params = {
    'n_estimators': [100, 200, 500, 750],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.02, 0.03, 0.05],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='accuracy', n_jobs=-1)

In [None]:
# Run the XGBoost grid search on the training split, keep the best estimator
# and report the winning hyperparameter combination
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_
print("Best Hyperparameters:", xgb_grid.best_params_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
# Random Forest: set up a grid search over the candidate hyperparameters below,
# scored by accuracy with 3-fold cross-validation
rf = RandomForestClassifier(random_state=42)

rf_params = dict(
    n_estimators=[100, 200, 500, 750],
    max_depth=[5, 8, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 4],
)

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
# Run the Random Forest grid search on the training split, keep the best estimator
# and report the winning hyperparameter combination
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_
print("Best Hyperparameters:", rf_grid.best_params_)

Best Hyperparameters: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


## Running a Voting Classifier on the Models with the Best Results / Hyperparameter Sets

In [None]:
# Ensemble: combine the three tuned models in a single VotingClassifier (voting='soft')
tuned_models = [
    ('catboost', best_cat),
    ('xgboost', best_xgb),
    ('randomforest', best_rf),
]
voting = VotingClassifier(estimators=tuned_models, voting='soft', n_jobs=-1)

In [None]:
# Fit the voting ensemble on the training split
voting.fit(X_train, y_train)

## Evaluating the Final Model for Results

In [None]:
# Score the fitted ensemble on the held-out test set:
# overall accuracy first, then the per-class precision / recall / F1 report
y_pred = voting.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)


Test Accuracy: 0.8055358410220014

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.68      0.51      0.58       374

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.80      0.81      0.80      1409

