In [1]:
#Importing required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.datasets import make_imbalance
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.utils import resample,shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
import random
import os

In [2]:
#Allowing Google Drive access
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
path = '/content/drive/MyDrive/MSiA432/Project/'
df = pd.read_excel(path + 'Telecom Churn Rate Dataset.xlsx')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,0,Yes


In [8]:
#Class imbalance
df['Churn'].value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

# Data Cleaning

In [9]:
df_2 = df.copy()
df_2['TotalCharges'] = df_2['TotalCharges'].replace(' ',np.nan)
df_2 = df_2.dropna(how='any').reset_index(drop=True)
df_2

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.50,0,0,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0,3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,0,0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,0,0,No
7028,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,0,5,No
7029,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0,0,No
7030,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,0,0,Yes


In [10]:
from sklearn.preprocessing import OneHotEncoder
def get_ohe(df,col_name):
    ohe = OneHotEncoder(sparse=False,categories="auto",drop="first")
    ohe.fit(df[col_name])
    temp_df = pd.DataFrame(data=ohe.transform(df[col_name]), columns=ohe.get_feature_names_out())
    df.drop(columns=col_name, axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df,ohe
df_3,ohe_obj = get_ohe(df_2,["gender","Partner", "Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity",
                            "OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling",
                            "PaymentMethod","Churn"])



In [11]:
df_3.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,gender_Male,Partner_Yes,Dependents_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,7590-VHVEG,0,1,29.85,29.85,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,5575-GNVDE,0,34,56.95,1889.5,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3668-QPYBK,0,2,53.85,108.15,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,7795-CFOCW,0,45,42.3,1840.75,0,3,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9237-HQITU,0,2,70.7,151.65,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [12]:
df_4 = df_3.drop(columns = ['customerID'])
df_4.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0,34,56.95,1889.5,0,0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,2,53.85,108.15,0,0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,0,45,42.3,1840.75,0,3,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,70.7,151.65,0,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


# Modeling

In [13]:
#Train-test split
train, test = train_test_split(df_4, test_size = 0.2)
print(train.shape)

(5625, 33)


In [14]:
#Upsampling to handle class imbalance
train_minority = train[train['Churn_Yes'] == 1]
train_other = train[train['Churn_Yes'] == 0]

min_upsampled = resample(train_minority,random_state=42,n_samples=len(train_other)-len(train_minority),replace=True)
train_upsampled = pd.concat([min_upsampled,train], axis = 0, ignore_index=True, sort=False)

#Splitting to X, y
train_X = train_upsampled.drop(columns = ['Churn_Yes'])
train_y = train_upsampled['Churn_Yes']

In [15]:
#Making test_X and test_y
test_X = test.drop(columns = ['Churn_Yes'])
test_y = test['Churn_Yes']

In [31]:
# Decision Tree parameter dictionary
dt_params = {'min_samples_leaf': [1,5,10,20]
              #'max_depth':[None,2,10,20,30],
              #'max_features': [None,5,10,15], 
              #'min_impurity_decrease':[0,0.1,0.2,0.3,0.4],
              }

# Parameters for Random Forest
rf_params = {'n_estimators': [500], # Number of trees in random forest
               'max_features': [int(x) for x in np.linspace(3, 5, num = 1)], # Maximum number of levels in tree
               'min_samples_leaf': [int(x) for x in np.linspace(2, 6, num = 1)], # Minimum number of samples required at each leaf node
               'bootstrap': [True], # Method of selecting samples for training each tree.
             'max_depth': [8,10,12,14,15], # Maximum number of levels in tree
                }

#Decision Tree requires pruning via ccp_alpha. Hence, it is not tuned with gridsearchcv.

# #Parameters for gradient boosting
# gb_params = {'learning_rate': [0.01,0.02,0.03,0.04],
#                   #'subsample'    : [1, 0.9, 0.8],
#                   'n_estimators' : [int(x) for x in np.linspace(start = 50, stop = 150, num = 25)], #Number of trees
#                   'max_depth'    : [int(x) for x in np.linspace(start = 4, stop = 8, num = 1)], #Max depth of a tree
#              'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
#                  }

# #Parameters for XGBoost
# xgb_params = {
#     'max_depth': range(2, 10, 2),
#     'max_leaves': range(4,8),
#     'n_estimators': range(60, 220, 20),
#     #"min_child_weight":[1, 3, 5],
#     'learning_rate': [0.1, 0.01, 0.05]}

scoring_metrics = {'f1_score': make_scorer(r2_score)}

In [17]:
#Making folds for CV
#5 fold CV is used since it was taking too long for 10 fold on this data set
folds = KFold(n_splits=10, shuffle = True, random_state = 421)
folds

KFold(n_splits=10, random_state=421, shuffle=True)

### Decision Tree 

In [37]:
#DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
# define search
search = GridSearchCV(dt_model, dt_params, scoring=scoring_metrics['f1_score'], n_jobs=-1, cv=folds)
# execute search
result = search.fit(train_X, train_y)

In [38]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.6645125622054292
Best Hyperparameters: {'min_samples_leaf': 1}


In [39]:
#Checking test score with best model
best_model = RandomForestClassifier(min_samples_leaf = result.best_params_['min_samples_leaf']).fit(train_X, train_y) #, max_features = result.best_params_['max_features'], min_samples_leaf = result.best_params_['min_samples_leaf'], bootstrap = True, max_depth = result.best_params_['max_depth']).fit(train_X, train_y)
predictions = best_model.predict(test_X)

In [40]:
#Metrics on Test set
print('Test F1 score: ', f1_score(test_y, predictions))
print('Test Accuracy Score: ', accuracy_score(test_y, predictions))

Test F1 score:  0.71334214002642
Test Accuracy Score:  0.845771144278607


### Random Forest 

In [32]:
#Random Forest
rf_model = RandomForestClassifier()
# define search
search = GridSearchCV(rf_model, rf_params, scoring=scoring_metrics['f1_score'], n_jobs=-1, cv=folds)
# execute search
result = search.fit(train_X, train_y)

In [33]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.6117686645459705
Best Hyperparameters: {'bootstrap': True, 'max_depth': 15, 'max_features': 3, 'min_samples_leaf': 2, 'n_estimators': 500}


In [34]:
#Checking test score with best model
best_model = RandomForestClassifier(n_estimators = result.best_params_['n_estimators'], max_features = result.best_params_['max_features'], min_samples_leaf = result.best_params_['min_samples_leaf'], bootstrap = True, max_depth = result.best_params_['max_depth']).fit(train_X, train_y)
predictions = best_model.predict(test_X)

In [36]:
#Metrics on Test set
print('Test F1 score: ', f1_score(test_y, predictions))
print('Test Accuracy Score: ', accuracy_score(test_y, predictions))

Test F1 score:  0.7292161520190025
Test Accuracy Score:  0.837953091684435
