In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# List each file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory.
csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

In [None]:
# (rows, columns) of the raw data, as a baseline before any cleaning.
df.shape

In [None]:
# Preview the first rows to see what each column contains.
df.head()

In [None]:
# Check the dtype pandas inferred for every column.
df.dtypes

'TotalCharges' has the 'object' dtype, but judging from its values it should be a 'float' column.

Convert 'TotalCharges' to numeric data type

In [None]:
# Attempt the conversion on purpose to surface the value that blocks it.
# The expected ValueError is caught and printed so that "Restart & Run All"
# does not stop at this cell.
try:
    pd.to_numeric(df['TotalCharges'])
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Unable to parse string " " at position 488. 

In [None]:
# 488 is the position reported in the ValueError above; inspect that row.
df.iloc[488]

The error is caused by whitespace-only entries in the 'TotalCharges' column. A truly empty field would have been read by pandas as NaN, but because these fields contain a whitespace character, the entire column was read as strings (object dtype).

Replace the whitespace with 'NaN'

In [None]:
# Turn every cell that is empty or whitespace-only into NaN, in all columns.
# The pattern is a raw string: '\s' is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning on current Python versions.
df = df.replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Count missing values per column now that blanks have been turned into NaN.
df.isnull().sum()

Now we are able to see 11 observations as NaN

In [None]:
# Shape before dropping the rows with NaN, for comparison with the next cell.
df.shape

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0)
df.shape  # 11 missing observations are removed

In [None]:
# Verify that no column has missing values after the drop.
df.isnull().sum()

There are no null values in the data now.

In [None]:
# With the blank entries gone, the column can be parsed as numbers.
total_charges_numeric = pd.to_numeric(df['TotalCharges'])
df['TotalCharges'] = total_charges_numeric

In [None]:
# Class balance of the target column.
df['Churn'].value_counts()

This shows that there is slight imbalance in the dataset but it is not very big. So moving forward with this.

Most of the features are categorical with only two categories, 'Yes' and 'No'. We encode them as integers in a loop rather than calling df['Colname'].replace() separately for every column.

In [None]:
# Look at the cleaned frame again before encoding the categorical columns.
df.head()

In [None]:
# Categorical columns containing 'Yes'/'No' style answers.
# NOTE(review): this list is not referenced by any later cell shown here, and
# unlike col_3 it does not contain 'OnlineSecurity' - confirm whether it is needed.
col = [
    'Partner',
    'Dependents',
    'PhoneService',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

Using .value_counts() we can find the number of categories inside a categorical variable.

col_2 lists the variables with two categories and col_3 lists the variables with three categories.

In [None]:
# Columns whose only categories are 'Yes' and 'No'.
col_2 = [
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Churn',
]

In [None]:
# Example check of the categories in one of the col_2 columns.
df['Partner'].value_counts()

In [None]:
# Columns with three categories: 'Yes', 'No' and 'No internet service'.
col_3 = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
# Example check of the categories in one of the col_3 columns.
df['StreamingMovies'].value_counts()

All the variables in col_3 share the same three categories: 'Yes', 'No' and 'No internet service'.

In [None]:
# Encode gender as 1 (Male) / 0 (Female).
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['gender']: that is a chained in-place update on a column selection, which
# does not modify df when pandas Copy-on-Write is active.
# The explicit cast keeps the column integer-typed without relying on the
# deprecated implicit object-to-int downcast in replace().
df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0}).astype('int64')

In [None]:
# Sub-frame holding only the two-category columns.
df_2 = df.loc[:, col_2]

In [None]:
# Preview the two-category columns before they are encoded.
df_2.head()

In [None]:
# Encode every two-category column as 1 (Yes) / 0 (No).
# The explicit cast keeps the columns integer-typed without relying on the
# deprecated implicit object-to-int downcast in replace(); it also fails loudly
# if a column contains a value other than 'Yes'/'No'.
yes_no_codes = {'Yes': 1, 'No': 0}
for column in df_2.columns:
    df[column] = df[column].replace(yes_no_codes).astype('int64')

In [None]:
# Confirm that the two-category columns now hold 1/0.
df.head()

In [None]:
# Sub-frame holding only the three-category columns.
df_3 = df.loc[:, col_3]

In [None]:
# Encode every three-category column as
# 0 (No internet service) / 1 (No) / 2 (Yes).
# The explicit cast keeps the columns integer-typed without relying on the
# deprecated implicit object-to-int downcast in replace(); it also fails loudly
# if a column contains an unexpected category.
service_codes = {'No internet service': 0, 'No': 1, 'Yes': 2}
for column in df_3.columns:
    df[column] = df[column].replace(service_codes).astype('int64')

In [None]:
# List the categories of 'MultipleLines' to decide how to encode it.
df['MultipleLines'].value_counts()

In [None]:
# Encode MultipleLines as 0 (No phone service) / 1 (No) / 2 (Yes).
# Assigned back rather than replaced in place on the column selection, which
# does not modify df when pandas Copy-on-Write is active; the cast avoids the
# deprecated implicit object-to-int downcast in replace().
df['MultipleLines'] = (
    df['MultipleLines']
    .replace({'No phone service': 0, 'No': 1, 'Yes': 2})
    .astype('int64')
)

In [None]:
# List the categories of 'InternetService' to decide how to encode it.
df['InternetService'].value_counts()

Fiber optic networks can deliver speeds up to 1 Gbps (1000 Mbps), whereas DSL speeds typically top out around 6 Mbps. This domain knowledge helps with the encoding: the categories can be ordered as no internet < DSL < fiber optic.

In [None]:
# Ordinal encoding of InternetService: 0 (No) < 1 (DSL) < 2 (Fiber optic).
# Assigned back rather than replaced in place on the column selection, which
# does not modify df when pandas Copy-on-Write is active; the cast avoids the
# deprecated implicit object-to-int downcast in replace().
df['InternetService'] = (
    df['InternetService']
    .replace({'No': 0, 'DSL': 1, 'Fiber optic': 2})
    .astype('int64')
)

In [None]:
# List the categories of 'Contract' to decide how to encode it.
df['Contract'].value_counts()

In [None]:
# Ordinal encoding of Contract: 1 (Month-to-month) < 2 (One year) < 3 (Two year).
# Assigned back rather than replaced in place on the column selection, which
# does not modify df when pandas Copy-on-Write is active; the cast avoids the
# deprecated implicit object-to-int downcast in replace().
df['Contract'] = (
    df['Contract']
    .replace({'Month-to-month': 1, 'One year': 2, 'Two year': 3})
    .astype('int64')
)

In [None]:
# List the categories of 'PaymentMethod'; it is one-hot encoded further below.
df['PaymentMethod'].value_counts()

'PaymentMethod' needs one-hot encoding, for which we will use pd.get_dummies. 'customerID' is not useful for predicting the target, so we remove it from the independent variables.

In [None]:
# Remove the identifier column; it is not used as a feature.
df = df.drop(columns=['customerID'])

In [None]:
# Re-check for missing values after the encoding steps.
df.isnull().sum()

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# Check which columns are still non-numeric before one-hot encoding.
df.dtypes

In [None]:
# One-hot encode 'PaymentMethod' only.
# Naming the column explicitly keeps get_dummies from silently expanding any
# other column that is still non-numeric (for example if an earlier encoding
# step did not take effect); such a column then fails loudly at model fitting.
df_final = pd.get_dummies(df, columns=['PaymentMethod'])

In [None]:
# Shape after one-hot encoding (the dummy columns widen the frame).
df_final.shape

In [None]:
# Names of all columns in the model-ready frame, including the dummy columns.
df_final.columns

In [None]:
# Separate the features (X) from the target (y).
target_name = 'Churn'
X = df_final.drop(columns=target_name)
y = df_final[target_name]

import libraries for machine learning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score,f1_score,confusion_matrix

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split (and every metric below) reproducible across
# runs; stratify=y keeps the churn ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Show the training feature and target shapes, one per line.
print(X_train.shape, y_train.shape, sep='\n')

logistic regression

In [None]:
# Fit a logistic-regression baseline and predict on the held-out rows.
# max_iter is raised above the default of 100: the features are not scaled, so
# the solver may need more iterations to converge, and this notebook silences
# the warning that would otherwise report an early stop.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Precision, recall and F1 of the logistic-regression predictions.
lr_precision, lr_recall, lr_f1 = (
    metric(y_test, lr_pred)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each logistic-regression metric on its own line.
for label, value in (
    ('LR Precision', lr_precision),
    ('LR recall', lr_recall),
    ('LR F1 score', lr_f1),
):
    print(label, value)

RandomForestClassifier

In [None]:
# Fit a random forest with default hyperparameters (seeded) and predict.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [None]:
# Precision, recall and F1 of the default random-forest predictions.
rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_pred)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each random-forest metric on its own line.
for label, value in (
    ('RF Precision', rf_precision),
    ('RF recall', rf_recall),
    ('RF F1 score', rf_f1),
):
    print(label, value)

GridSearchCV to find the best hyperparameters of the model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fresh, unfitted forest (seeded) to serve as the base estimator for the grid search.
rf = RandomForestClassifier(random_state =42)

In [None]:
# Hyperparameter grid for the random forest: 3 x 2 x 3 = 18 combinations.
rf_params = dict(
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2],
    n_estimators=[100, 150, 200],
)

In [None]:
# Search the grid with cross-validation on the TRAINING split only.
# Fitting the search on the full X, y would let the held-out test rows
# influence the chosen hyperparameters and make the test metrics optimistic.
grid_search = GridSearchCV(rf, param_grid=rf_params, verbose=True)
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Refit the forest with tuned hyperparameters.
# NOTE(review): these values are presumably copied from the grid-search output
# above - re-check them whenever the search is re-run.
# random_state matches the baseline forest so that a change in the metrics
# reflects the hyperparameters and not a different random seed.
rf = RandomForestClassifier(
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=150,
    random_state=42,
).fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the refitted forest (overwrites the earlier rf_pred).
rf_pred = rf.predict(X_test)

In [None]:
# Precision, recall and F1 of the refitted random forest.
rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_pred)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each metric of the refitted random forest on its own line.
for label, value in (
    ('RF Precision', rf_precision),
    ('RF recall', rf_recall),
    ('RF F1 score', rf_f1),
):
    print(label, value)

Using the best hyperparameters found by the grid search results in a better F1 score for the model.

Feature selection using RandomForest

In [None]:
# Importance score of each feature according to the fitted forest,
# in the same order as the columns the forest was trained on.
val = rf.feature_importances_

In [None]:
# Display the raw importance scores.
val

In [None]:
# Collect the features whose importance exceeds the threshold.
# imp_var holds the feature NAMES and imp_var_val the matching importance
# VALUES (previously the two were appended to each other's list).
IMPORTANCE_THRESHOLD = 0.02
imp_var = []
imp_var_val = []
for feature_name, importance in zip(X.columns, rf.feature_importances_):
    if importance > IMPORTANCE_THRESHOLD:
        imp_var.append(feature_name)
        imp_var_val.append(importance)

In [None]:
# Horizontal bar chart of the forest's importance score for every feature.
col_name = X.columns
fig, ax = plt.subplots()
ax.barh(col_name, val)
ax.set_xlabel('RF feature importance')
plt.show()

This graph clearly indicates that the most important variables are

1. tenure

2. TotalCharges

3. MonthlyCharges

4. Contract


Using only the most important variables

In [None]:
# Keep only the four features ranked most important by the random forest.
selected_features = ['tenure', 'TotalCharges', 'MonthlyCharges', 'Contract']
y = df['Churn']
X_new = df_final[selected_features]

In [None]:
# Hold out 20% of the rows of the reduced feature set for testing.
# random_state makes the split reproducible across runs; stratify=y keeps the
# churn ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Refit logistic regression on the reduced feature set and predict.
# max_iter is raised above the default of 100: the features are not scaled, so
# the solver may need more iterations to converge, and this notebook silences
# the warning that would otherwise report an early stop.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Precision, recall and F1 of logistic regression on the reduced feature set.
lr_precision, lr_recall, lr_f1 = (
    metric(y_test, lr_pred)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print each logistic-regression metric on its own line.
for label, value in (
    ('LR Precision', lr_precision),
    ('LR recall', lr_recall),
    ('LR F1 score', lr_f1),
):
    print(label, value)