In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#To load dataset bank.csv
df=pd.read_csv('telecom_churn.csv')
#to display first 5 records
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
#To check no. of rows and columns in dataset
df.shape

(7043, 21)

In [4]:
#To check null values
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
#To check datatypes
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
df["TotalCharges"].value_counts()

          11
20.2      11
19.75      9
19.9       8
19.65      8
          ..
1192.3     1
436.6      1
1311.3     1
5957.9     1
973.35     1
Name: TotalCharges, Length: 6531, dtype: int64

In [7]:
df["TotalCharges"].replace(" ",np.nan,inplace=True)

In [8]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
#change datatype of TotalCharges from object to float
df["TotalCharges"]=df["TotalCharges"].astype("float")

In [10]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [11]:
#find the mean of TotalCharges
m=df["TotalCharges"].mean()
#fill null values of TotalCharges from mean of TotalCharges
df["TotalCharges"].fillna(m,inplace=True)
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [12]:
#To remove unwanted column customerID permanently
df.drop("customerID",axis=1,inplace=True)

In [13]:
#separate all object type data and hold in df_cat
df_cat=df.select_dtypes("object")
#separate all numeric type data and hold in df_num
df_num=df.select_dtypes(["float64","int64"])

In [14]:
#To convert object type data to number using LabelEncoder class
from sklearn.preprocessing import LabelEncoder
for col in df_cat:
    #create object of LabelEncoder
    le=LabelEncoder()
    df_cat[col]=le.fit_transform(df_cat[col])

In [15]:
df_cat.head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


In [16]:
df_new=pd.concat([df_num,df_cat],axis=1)
df_new.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,1,29.85,29.85,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,0,2,53.85,108.15,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


In [17]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7043 non-null   int64  
 1   tenure            7043 non-null   int64  
 2   MonthlyCharges    7043 non-null   float64
 3   TotalCharges      7043 non-null   float64
 4   gender            7043 non-null   int32  
 5   Partner           7043 non-null   int32  
 6   Dependents        7043 non-null   int32  
 7   PhoneService      7043 non-null   int32  
 8   MultipleLines     7043 non-null   int32  
 9   InternetService   7043 non-null   int32  
 10  OnlineSecurity    7043 non-null   int32  
 11  OnlineBackup      7043 non-null   int32  
 12  DeviceProtection  7043 non-null   int32  
 13  TechSupport       7043 non-null   int32  
 14  StreamingTV       7043 non-null   int32  
 15  StreamingMovies   7043 non-null   int32  
 16  Contract          7043 non-null   int32  


In [18]:
#Select Input and output dataset df
X=df_new.drop("Churn",axis=1)  #input variable
Y=df_new["Churn"]  #output means target

In [19]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,0,1,29.85,29.85,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2
1,0,34,56.95,1889.5,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3
2,0,2,53.85,108.15,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3
3,0,45,42.3,1840.75,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0
4,0,2,70.7,151.65,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2


In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=1)

In [21]:
#1. Standardization
from sklearn.preprocessing import StandardScaler

#fit_transform on training data X_train and
#transform on testing data means X_test
#create the object of StandardScaler class
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [22]:
#Dataset is a classifiaction dataset, we predict customer will churn or not
#First Baseline model :
#create a function
def create_model(model):  #user defined parameter as a object
    model.fit(X_train,Y_train)  #training the model
    Y_pred=model.predict(X_test)  #testing the model
    print(classification_report(Y_test,Y_pred))
    print(confusion_matrix(Y_test,Y_pred))
    return model

In [23]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [24]:
#Base Line Model means use Logistic Regression (we predict yes/no values then
#use classification algorithm)
from sklearn.linear_model import LogisticRegression

In [25]:
#create the object of LogisticRegression
lr=LogisticRegression()

In [26]:
#call function for train and test the model
lr=create_model(lr)

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1585
           1       0.63      0.59      0.61       528

    accuracy                           0.81      2113
   macro avg       0.75      0.73      0.74      2113
weighted avg       0.81      0.81      0.81      2113

[[1401  184]
 [ 219  309]]


In [27]:
df["Churn"].value_counts()  #Data Unbalance (imbalance)

No     5174
Yes    1869
Name: Churn, dtype: int64

In [28]:
#if data is imbalance then use sampling technique
#we
'''
There are 2 types of sampling technique
1. Random Over Sampling Technique 2. Random Under Sampling Technique
'''

'\nThere are 2 types of sampling technique\n1. Random Over Sampling Technique 2. Random Under Sampling Technique\n'

In [29]:
#If you have basically reducing the majority class means
#remove record randomly from majority class that is known as

In [30]:
#install package imblearn only one time
#!pip install imblearn

In [31]:
#Use randomoversampling
from imblearn.over_sampling import RandomOverSampler

In [32]:
#RandomOverSampler
#create object of RandomOverSampler() class
ros=RandomOverSampler(random_state=1)

In [33]:
X_train_ros,Y_train_ros=ros.fit_resample(X_train,Y_train)
#fit_resample() inbuild method of RandomOverSampler class

In [34]:
pd.Series(Y_train).value_counts()  #check if not balance

0    3589
1    1341
Name: Churn, dtype: int64

In [35]:
pd.Series(Y_train_ros).value_counts()  #check if balance

0    3589
1    3589
Name: Churn, dtype: int64

In [36]:
#Apply randomoversampling on 30% testing data
X_test_ros,Y_test_ros=ros.fit_resample(X_test,Y_test)

In [37]:
pd.Series(Y_test).value_counts()  #check if not balance in testing test

0    1585
1     528
Name: Churn, dtype: int64

In [38]:
pd.Series(Y_test_ros).value_counts()  #check if apply randomover sampling means balance

0    1585
1    1585
Name: Churn, dtype: int64

In [39]:
#Create a user defined function
def create_model1(model):  #user defined parameter as a object
    model.fit(X_train_ros,Y_train_ros)  #training the model
    Y_pred=model.predict(X_test_ros)  #testing the model
    print(classification_report(Y_test_ros,Y_pred))
    print(confusion_matrix(Y_test_ros,Y_pred))
    return model

In [40]:
#base line model : Logistic Regression
#create the object of LogisticRegression class
lr=LogisticRegression()

In [41]:
#call function
lr=create_model1(lr)

              precision    recall  f1-score   support

           0       0.80      0.73      0.77      1585
           1       0.75      0.82      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

[[1162  423]
 [ 284 1301]]


# Decision Tree

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
#create the  object of DecisionTreeClassifier class
dt=DecisionTreeClassifier()  #bydefault gini index

In [44]:
dt=create_model1(dt)

              precision    recall  f1-score   support

           0       0.63      0.81      0.71      1585
           1       0.74      0.53      0.62      1585

    accuracy                           0.67      3170
   macro avg       0.68      0.67      0.66      3170
weighted avg       0.68      0.67      0.66      3170

[[1286  299]
 [ 748  837]]


In [45]:
IG=dt.feature_importances_

In [46]:
X.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'gender',
       'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [47]:
dic={'Input':X.columns,'Information_Gain':IG}
df1=pd.DataFrame(dic)
df1.sort_values('Information_Gain',ascending=False)

Unnamed: 0,Input,Information_Gain
16,Contract,0.229891
2,MonthlyCharges,0.216913
3,TotalCharges,0.161491
1,tenure,0.108608
18,PaymentMethod,0.044232
10,OnlineSecurity,0.043394
4,gender,0.022746
6,Dependents,0.020713
0,SeniorCitizen,0.020144
17,PaperlessBilling,0.019326


In [48]:
#Applying pruning technique to reduce overfitting
#max_depth=8
dt1=DecisionTreeClassifier(max_depth=5,random_state=1)

In [49]:
dt1=create_model1(dt1)

              precision    recall  f1-score   support

           0       0.77      0.72      0.74      1585
           1       0.74      0.79      0.76      1585

    accuracy                           0.75      3170
   macro avg       0.75      0.75      0.75      3170
weighted avg       0.75      0.75      0.75      3170

[[1138  447]
 [ 340 1245]]


In [50]:
dt2=DecisionTreeClassifier(min_samples_leaf=70,random_state=1)

In [51]:
dt2=create_model1(dt2)

              precision    recall  f1-score   support

           0       0.77      0.74      0.75      1585
           1       0.75      0.78      0.76      1585

    accuracy                           0.76      3170
   macro avg       0.76      0.76      0.76      3170
weighted avg       0.76      0.76      0.76      3170

[[1168  417]
 [ 347 1238]]


In [59]:
#Useing entropy
dt3=DecisionTreeClassifier(criterion='entropy',max_depth=7,random_state=1)

In [89]:
dt3=create_model1(dt3)

              precision    recall  f1-score   support

           0       0.75      0.72      0.74      1585
           1       0.73      0.77      0.75      1585

    accuracy                           0.74      3170
   macro avg       0.74      0.74      0.74      3170
weighted avg       0.74      0.74      0.74      3170

[[1143  442]
 [ 372 1213]]


In [61]:
dt4=DecisionTreeClassifier(criterion='entropy',min_samples_leaf=50,random_state=1)

In [90]:
dt4=create_model1(dt4)

              precision    recall  f1-score   support

           0       0.77      0.76      0.77      1585
           1       0.76      0.77      0.77      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

[[1211  374]
 [ 369 1216]]


# Random Forest Classifier

In [66]:
from sklearn.ensemble  import RandomForestClassifier

In [84]:
rfc=RandomForestClassifier(n_estimators=10,max_features=7,min_samples_leaf=30,random_state=1)

In [91]:
rfc=create_model1(rfc)

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1585
           1       0.77      0.78      0.77      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

[[1205  380]
 [ 345 1240]]


# Boosting 

In [86]:
from sklearn.ensemble import AdaBoostClassifier

In [87]:
ada=AdaBoostClassifier(n_estimators=100,random_state=1)

In [92]:
ada=create_model1(ada)

              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1585
           1       0.75      0.82      0.78      1585

    accuracy                           0.77      3170
   macro avg       0.77      0.77      0.77      3170
weighted avg       0.77      0.77      0.77      3170

[[1154  431]
 [ 291 1294]]


In [96]:
from sklearn.ensemble import GradientBoostingClassifier

In [98]:
gbc=GradientBoostingClassifier(n_estimators=100,random_state=1)

In [100]:
gbc=create_model1(gbc)

              precision    recall  f1-score   support

           0       0.80      0.74      0.77      1585
           1       0.75      0.82      0.78      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.77      3170
weighted avg       0.78      0.78      0.77      3170

[[1165  420]
 [ 293 1292]]


In [102]:
from xgboost import XGBClassifier

In [106]:
xgb=XGBClassifier(n_estimators=10,reg_alpha=1,random_state=1)

In [107]:
xgb=create_model1(xgb)

              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1585
           1       0.76      0.80      0.78      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

[[1189  396]
 [ 316 1269]]


# K-Nearest Neighbor

In [109]:
from sklearn.neighbors import KNeighborsClassifier

In [118]:
knn=KNeighborsClassifier(n_neighbors=5,metric="minkowski",p=2)

In [119]:
knn=create_model1(knn)

              precision    recall  f1-score   support

           0       0.73      0.66      0.69      1585
           1       0.69      0.76      0.72      1585

    accuracy                           0.71      3170
   macro avg       0.71      0.71      0.71      3170
weighted avg       0.71      0.71      0.71      3170

[[1045  540]
 [ 384 1201]]


# Support Vector Machine

In [121]:
from sklearn.svm import LinearSVC

In [122]:
svc=LinearSVC(random_state=1)

In [123]:
svc=create_model1(svc)

              precision    recall  f1-score   support

           0       0.82      0.72      0.77      1585
           1       0.75      0.84      0.79      1585

    accuracy                           0.78      3170
   macro avg       0.78      0.78      0.78      3170
weighted avg       0.78      0.78      0.78      3170

[[1143  442]
 [ 259 1326]]


# Navie Baised Therom

In [127]:
from sklearn.naive_bayes import GaussianNB

In [128]:
nb=GaussianNB()

In [130]:
nb=create_model(nb)

              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1585
           1       0.52      0.78      0.62       528

    accuracy                           0.76      2113
   macro avg       0.71      0.77      0.73      2113
weighted avg       0.81      0.76      0.78      2113

[[1204  381]
 [ 118  410]]


# Support Vector Machine Algo is giving good score 