## Import all the libraries

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing


## Read the dataset

In [3]:
data= pd.read_csv('Telco_Churn.csv')

## View the dimensions of the dataset

In [4]:
data.shape

(7043, 21)

In [5]:
data.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Separating into Features & Target Variable

In [8]:
data= data.drop('TotalCharges',axis=1)

In [9]:
data= data.drop('customerID',axis=1)

In [10]:
data.shape

(7043, 19)

In [11]:
data.head(1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No


In [12]:
data['Churn']= data['Churn'].map({'Yes':1, 'No':0})

In [13]:
data.head(1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,0


In [14]:
data['SeniorCitizen']= data['SeniorCitizen'].astype('object')

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [16]:
X= data.drop('Churn', axis=1)
y= data['Churn']

In [17]:
X.head(1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85


In [18]:
numerical_features = X.select_dtypes(exclude= ['object'])

In [19]:
numerical_features.head(1)

Unnamed: 0,tenure,MonthlyCharges
0,1,29.85


In [20]:
names_numerical = numerical_features.columns.tolist()

In [21]:
names_numerical

['tenure', 'MonthlyCharges']

In [26]:
min_max_scaler = preprocessing.MinMaxScaler()

In [27]:
numerical_features = min_max_scaler.fit_transform(numerical_features)

In [28]:
numerical_features

array([[0.01388889, 0.11542289],
       [0.47222222, 0.38507463],
       [0.02777778, 0.35422886],
       ...,
       [0.15277778, 0.11293532],
       [0.05555556, 0.55870647],
       [0.91666667, 0.86965174]])

In [29]:
numerical_features = pd.DataFrame(numerical_features)

In [30]:
numerical_features

Unnamed: 0,0,1
0,0.013889,0.115423
1,0.472222,0.385075
2,0.027778,0.354229
3,0.625000,0.239303
4,0.027778,0.521891
...,...,...
7038,0.333333,0.662189
7039,1.000000,0.845274
7040,0.152778,0.112935
7041,0.055556,0.558706


In [31]:
numerical_features.columns = names_numerical

In [32]:
numerical_features

Unnamed: 0,tenure,MonthlyCharges
0,0.013889,0.115423
1,0.472222,0.385075
2,0.027778,0.354229
3,0.625000,0.239303
4,0.027778,0.521891
...,...,...
7038,0.333333,0.662189
7039,1.000000,0.845274
7040,0.152778,0.112935
7041,0.055556,0.558706


In [34]:
categorical_features = X.select_dtypes(include= ['object'])

In [35]:
categorical_features

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic)
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic)
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check


In [36]:
categorical_features = pd.get_dummies(categorical_features, columns=categorical_features.columns.tolist())

  categorical_features = pd.get_dummies(categorical_features, columns=categorical_features.columns.tolist())


In [37]:
categorical_features

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0,1,1,0,1,0,1,0,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,1,1,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,1,1,0,1,0,1,0,1,0,...,0,0,1,0,1,0,1,0,0,0
4,1,0,1,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,0,0,1,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
7039,1,0,1,0,0,1,0,1,0,1,...,1,0,1,0,0,1,0,1,0,0
7040,1,0,1,0,0,1,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
7041,0,1,0,1,0,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1


## Transformed dataset with numerical feature and one-hot enecoded categorical feature

In [38]:
X_scale= pd.concat([numerical_features, categorical_features], axis=1)

In [39]:
X_scale.head(2)

Unnamed: 0,tenure,MonthlyCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.013889,0.115423,1,0,1,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0.472222,0.385075,0,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1


## Splitting into Training & Testing Data

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.20)

## Making the predictions

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
classifier_DT = DecisionTreeClassifier()

In [43]:
classifier_DT.fit(X_train, y_train)

In [44]:
y_pred = classifier_DT.predict(X_test)

In [66]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn import svm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [48]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1030
           1       0.51      0.50      0.50       379

    accuracy                           0.73      1409
   macro avg       0.66      0.66      0.66      1409
weighted avg       0.73      0.73      0.73      1409



In [49]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7338537970191625


In [50]:
classifier_RF = RandomForestClassifier()

In [51]:
classifier_RF.fit(X_train, y_train)

In [52]:
y_pred = classifier_RF.predict(X_test)

In [55]:
accuracy_RF = accuracy_score(y_test, y_pred)

In [56]:
accuracy_RF

0.7750177430801988

In [57]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7750177430801988


In [58]:
from xgboost import XGBClassifier

In [59]:
classifier_xg = XGBClassifier(n_estimators=100, random_state=42)

In [60]:
classifier_xg.fit(X_train, y_train)

In [61]:
y_pred = classifier_xg.predict(X_test)

In [62]:
accuracy_xg = accuracy_score(y_test, y_pred)

In [63]:
accuracy_xg

0.794180269694819