In [3]:
import pandas as pd
import numpy as np 
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,mutual_info_classif

In [1]:
from google.colab import files
data_to_load = files.upload()

Saving WA_Fn-UseC_-Telco-Customer-Churn.xls to WA_Fn-UseC_-Telco-Customer-Churn.xls


In [4]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.xls")

In [5]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

#### As we can see the dtype of totalcharges is an object hence we convert it into floating point numbers

In [8]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].str.strip())
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [9]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [10]:
# profile = ProfileReport(df, title="Pandas Profiling Report")

In [11]:
# profile.to_file("your_report.html")

#### Dropping the following columns

* customerID = its a unique identifier we can use index for the same 
* Partner is highly correlated with Dependents and vice versa
* tenure is highly correlated with Contract
* PhoneService is highly correlated with MultipleLines and 1 other fields	
* StreamingMovies is highly correlated with MultipleLines and 8 other fields	
* InternetService is highly correlated with MultipleLines and 8 other fields	
* MultipleLines is highly correlated with PhoneService and 8 other fields	
* OnlineSecurity is highly correlated with MultipleLines and 8 other fields	
* StreamingTV is highly correlated with MultipleLines and 8 other fields	
* OnlineBackup is highly correlated with MultipleLines and 8 other fields	
* DeviceProtection is highly correlated with MultipleLines and 8 other fields	
* TechSupport is highly correlated with MultipleLines and 8 other fields	
* Partner is highly correlated with Dependents	
* Dependents is highly correlated with Partner	
* tenure is highly correlated with Contract	
* Contract is highly correlated with tenure and 7 other fields	
* MonthlyCharges is highly correlated with PhoneService and 8 other fields

In [12]:
columns_with_high_corr = ['customerID','Dependents','Contract','PhoneService','StreamingMovies','InternetService','OnlineSecurity','OnlineBackup','StreamingTV','DeviceProtection','TechSupport']

df.drop(labels = columns_with_high_corr, axis = 1, inplace = True)

In [None]:
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("report_After_cleaning.html")

In [13]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
tenure              0
MultipleLines       0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [14]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,tenure,MultipleLines,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,1,No phone service,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,34,No,No,Mailed check,56.95,1889.50,No
2,Male,0,No,2,No,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,45,No phone service,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,2,No,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,24,Yes,Yes,Mailed check,84.80,1990.50,No
7039,Female,0,Yes,72,Yes,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,11,No phone service,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,4,Yes,Yes,Mailed check,74.40,306.60,Yes


In [15]:
label_encoder = LabelEncoder()

In [16]:
for i in df.columns:
    if df[i].dtype == 'object':
        df[i]= label_encoder.fit_transform(df[i])

In [17]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
tenure              0
MultipleLines       0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [18]:
X = df.drop(labels = ['Churn'], axis = 1)
y = df['Churn']

In [19]:
Six_best_fetures = SelectKBest(mutual_info_classif, k = 6)
Six_best_fetures.fit(X,y)


SelectKBest(k=6, score_func=<function mutual_info_classif at 0x7fb7587e13b0>)

In [20]:
required_fetures = X.columns[Six_best_fetures.get_support()]


In [21]:
columns_to_drop = list(x for x in X.columns if x not in required_fetures)

In [22]:
df.drop(labels = columns_to_drop, axis = 1, inplace = True)

In [23]:
df.head(5)

Unnamed: 0,Partner,tenure,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,1,1,2,29.85,29.85,0
1,0,34,0,3,56.95,1889.5,0
2,0,2,1,3,53.85,108.15,1
3,0,45,0,0,42.3,1840.75,0
4,0,2,1,2,70.7,151.65,1


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Using Decision tree classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7281760113555713

In [27]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [28]:
GDC = GridSearchCV(DecisionTreeClassifier(), params, cv = 10, scoring='accuracy')
GDC.fit(X_train, y_train)
print(GDC.best_params_)
print(GDC.best_score_)

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 100}
0.7882522076515122


In [29]:
clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_leaf = 100)

In [None]:
# clf.get_params()

In [30]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, min_samples_leaf=100)

In [31]:
y_pred = clf.predict(X_test)

In [32]:
accuracy_score(y_test, y_pred)

0.7927608232789212

using randomforest

In [33]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
accuracy_score(y_test , y_pred_rfc)

0.7934705464868701

In [None]:
# rfc.get_params()

In [34]:
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': ['sqrt','log2']}]
GSC = GridSearchCV(RandomForestClassifier(), forest_params, cv = 10, scoring='accuracy')
GSC.fit(X_train, y_train)
print(GSC.best_params_)
print(GSC.best_score_)

{'max_depth': 11, 'max_features': 'log2'}
0.7919752969779423


In [35]:
rfc = RandomForestClassifier(max_depth=11,max_features = 'sqrt')
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [None]:
accuracy_score(y_test , y_pred_rfc)

0.78708303761533