In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [2]:
data=pd.read_csv('Telco-Customer-Churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Removing Columns that are not relevant

In [4]:
cols_to_drop=['customerID']
data1=data.drop(columns=cols_to_drop)
data1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
data1.isnull().sum()
#No Null Values Found

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


# Label Encoding for Object Type Cols

In [7]:
object_columns = data1.select_dtypes(include=['object']).columns
data2=data1.copy()
label_encoder = LabelEncoder()

for col in object_columns:
    data2[col] = label_encoder.fit_transform(data2[col])
    
data2.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,925,1


# Multicollinearity

In [8]:
data3=data2.copy()
correlation_matrix = data3.corr()
correlation_matrix

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
gender,1.0,-0.001874,-0.001808,0.010517,0.005106,-0.006488,-0.006739,-0.000863,-0.015017,-0.012057,0.000549,-0.006825,-0.006421,-0.008743,0.000126,-0.011754,0.017352,-0.014569,-0.005291,-0.008612
SeniorCitizen,-0.001874,1.0,0.016479,-0.211185,0.016567,0.008576,0.146185,-0.03231,-0.128221,-0.013632,-0.021398,-0.151268,0.030776,0.047266,-0.142554,0.15653,-0.038551,0.220173,0.037653,0.150889
Partner,-0.001808,0.016479,1.0,0.452676,0.379697,0.017706,0.14241,0.000891,0.150828,0.15313,0.16633,0.126733,0.137341,0.129574,0.294806,-0.014877,-0.154798,0.096848,0.059568,-0.150448
Dependents,0.010517,-0.211185,0.452676,1.0,0.159712,-0.001762,-0.024991,0.04459,0.152166,0.091015,0.080537,0.133524,0.046885,0.021321,0.243187,-0.111377,-0.040292,-0.11389,-0.009572,-0.164221
tenure,0.005106,0.016567,0.379697,0.159712,1.0,0.008448,0.343032,-0.030359,0.325468,0.370876,0.371105,0.322942,0.289373,0.296866,0.671607,0.006152,-0.370436,0.2479,0.158523,-0.352229
PhoneService,-0.006488,0.008576,0.017706,-0.001762,0.008448,1.0,-0.020538,0.387436,-0.015198,0.024105,0.003727,-0.019158,0.055353,0.04387,0.002247,0.016505,-0.004184,0.247398,0.083195,0.011942
MultipleLines,-0.006739,0.146185,0.14241,-0.024991,0.343032,-0.020538,1.0,-0.109216,0.007141,0.117327,0.122318,0.011466,0.175059,0.180957,0.110842,0.165146,-0.176793,0.433576,0.114955,0.038037
InternetService,-0.000863,-0.03231,0.000891,0.04459,-0.030359,0.387436,-0.109216,1.0,-0.028416,0.036138,0.044944,-0.026047,0.107417,0.09835,0.099721,-0.138625,0.08614,-0.32326,-0.055724,-0.047291
OnlineSecurity,-0.015017,-0.128221,0.150828,0.152166,0.325468,-0.015198,0.007141,-0.028416,1.0,0.185126,0.175985,0.285028,0.044669,0.055954,0.374416,-0.157641,-0.096726,-0.053878,0.042357,-0.289309
OnlineBackup,-0.012057,-0.013632,0.15313,0.091015,0.370876,0.024105,0.117327,0.036138,0.185126,1.0,0.187757,0.195748,0.147186,0.136722,0.28098,-0.01337,-0.124847,0.119777,0.090756,-0.195525


# Finding the VIF

In [9]:
vif = pd.DataFrame()
vif['variable']= data3.columns

vif['vif'] = [variance_inflation_factor(data3.values,i)
 for i in range(data3.shape[1])]

vif

Unnamed: 0,variable,vif
0,gender,1.932694
1,SeniorCitizen,1.371198
2,Partner,2.820957
3,Dependents,1.953706
4,tenure,7.688406
5,PhoneService,14.254757
6,MultipleLines,2.757963
7,InternetService,4.367309
8,OnlineSecurity,2.264099
9,OnlineBackup,2.444145


# Highly Correlated Columns

In [10]:
highly_corr = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.5:
            colname = correlation_matrix.columns[i]
            highly_corr.add(colname)
highly_corr

{'Contract'}

In [11]:
data3= data3.drop(columns=highly_corr)
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int32  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int32  
 3   Dependents        7043 non-null   int32  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int32  
 6   MultipleLines     7043 non-null   int32  
 7   InternetService   7043 non-null   int32  
 8   OnlineSecurity    7043 non-null   int32  
 9   OnlineBackup      7043 non-null   int32  
 10  DeviceProtection  7043 non-null   int32  
 11  TechSupport       7043 non-null   int32  
 12  StreamingTV       7043 non-null   int32  
 13  StreamingMovies   7043 non-null   int32  
 14  PaperlessBilling  7043 non-null   int32  
 15  PaymentMethod     7043 non-null   int32  
 16  MonthlyCharges    7043 non-null   float64


# Train & Test

In [12]:
x = data3.iloc[:,:-1].values
y = data3.iloc[:,-1].values

x=pd.DataFrame(x)
dependant_col='Churn'
cols=[col for col in data3.columns if col != dependant_col]
x.columns=cols

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3, random_state=0)

# Decision Tree

In [14]:
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,max_depth=3,min_samples_leaf=5)
clf_entropy.fit(x_train, y_train)
y_pred = clf_entropy.predict(x_test)

In [15]:
#Accuracy fro Decision tree
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7865593942262187

# SVM

In [16]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_test)

In [17]:
#Accuracy for SVM
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.7988641741599621

# Random Forest

In [22]:
rf = RandomForestClassifier(n_estimators=100, criterion='gini',random_state=0)
rf.fit(x_train,y_train)
rf_acc = rf.score(x_test, y_test)
print("Random Forest accuracy:",rf_acc)

Random Forest accuracy: 0.7941315664931378


In [23]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
gb.fit(x_train, y_train)
gb_acc = gb.score(x_test, y_test)
print("Gradient Boost accuracy:",gb_acc)

Gradient Boost accuracy: 0.7946048272598202


# GridsearchCV and RandomizedsearchCV

In [24]:
params = {'max_depth':[4,5,6,7,8,9,10],
        'criterion':['gini','entropy'],
        'splitter' :['best', 'random'],
        "max_features": [1,2,3,4,5,6,7,8,9],
        "min_samples_leaf": [1,2,3,4,5,6,7,8,9],
        }

## RSCV Decision Trees

In [26]:
tree = DecisionTreeClassifier(random_state=0)
tree_rscv = RandomizedSearchCV(tree, params, n_jobs=-1)
tree_rscv.fit(x_train,y_train)
print(tree_rscv.best_params_)

{'splitter': 'best', 'min_samples_leaf': 4, 'max_features': 3, 'max_depth': 5, 'criterion': 'gini'}


In [27]:
dt = DecisionTreeClassifier(criterion= 'gini', max_depth= 8, max_features= 5, min_samples_leaf= 6, splitter= 'best')
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

0.7827733080927591

## GSCV Decision Trees

In [29]:
tree = DecisionTreeClassifier(random_state=0)
tree_gscv =  GridSearchCV(tree,params,cv=10, n_jobs=-1)
tree_gscv.fit(x_train,y_train)
print(tree_gscv.best_params_)

{'criterion': 'entropy', 'max_depth': 5, 'max_features': 8, 'min_samples_leaf': 9, 'splitter': 'best'}


In [30]:
dt = DecisionTreeClassifier(criterion= 'entropy', max_depth= 8, max_features= 9, min_samples_leaf= 8, splitter= 'best')
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

0.7766209181258874

# Support Vector Machine (SVM)

In [31]:
param = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],}

## RSCV SVM

In [32]:
svm = SVC()
svm_rscv = RandomizedSearchCV(svm, param, n_jobs=-1)
svm_rscv.fit(x_train,y_train)
print(svm_rscv.best_params_)

{'gamma': 0.0001, 'C': 1}


In [33]:
svm = SVC( gamma = 0.01, C = 100)
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

0.7274017983909133

## GSCV SVM

In [34]:
svm = SVC()
svm_gscv = GridSearchCV(svm, param, n_jobs=-1)
svm_gscv.fit(x_train,y_train)
print(svm_gscv.best_params_)

{'C': 1, 'gamma': 0.0001}


In [35]:
svm = SVC( gamma = 0.01, C = 100)
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

0.7274017983909133

# Random Forest

In [36]:
params = {'n_estimators':[50,75,100,125,200],
        'criterion':['gini','entropy'],
        'bootstrap':[True, False]}

## RSCV Random Forest

In [39]:
rf =  RandomForestClassifier(random_state=0)
rf_rscv = RandomizedSearchCV(rf, params, n_jobs=-1)
rf_rscv.fit(x_train,y_train)
print(rf_rscv.best_params_)

{'n_estimators': 125, 'criterion': 'entropy', 'bootstrap': True}


In [40]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy',random_state=0,bootstrap = True)
rf.fit(x_train,y_train)
rfc_accuracy = rf.score(x_test, y_test)
rf.score(x_test, y_test)

0.7941315664931378

## GSCV Random Forest

In [42]:
rf =  RandomForestClassifier(random_state=0)
rf_gscv = GridSearchCV(rf, params, n_jobs=-1)
rf_gscv.fit(x_train,y_train)
print(rf_gscv.best_params_)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 75}


In [43]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy',random_state=0,bootstrap = True)
rf.fit(x_train,y_train)
rfc_accuracy = rf.score(x_test, y_test)
rf.score(x_test, y_test)

0.7941315664931378

# Boosting 

In [44]:
params = {'max_depth'    : [2,3,4,5,6,7],
        'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001],
        'n_estimators':[100,250,500,750]
        }

## RSCV Boosting

In [46]:
gb = GradientBoostingClassifier(random_state=0)
gb_rscv = RandomizedSearchCV(estimator=gb, param_distributions = params, n_jobs=-1)
gb_rscv.fit(x_train, y_train)
print(gb_rscv.best_params_)

{'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05}


In [48]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05,max_depth=3, random_state=0).fit(x_train, y_train)
gb_accuracy = gb.score(x_test, y_test)
gb.score(x_test, y_test)

0.7941315664931378

## GSCV Boosting

In [50]:
gb = GradientBoostingClassifier(random_state=0)
gb_gscv = GridSearchCV(gb, params, n_jobs=-1)
gb_gscv.fit(x_train, y_train)
print(gb_gscv.best_params_)

{'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 100}


In [51]:
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,max_depth=5, random_state=0).fit(x_train, y_train)
gb_accuracy = gb.score(x_test, y_test)
gb.score(x_test, y_test)

0.7893989588263133