In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix

from sklearn.preprocessing import StandardScaler,MinMaxScaler

import warnings

In [None]:
# Read the Telco churn CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

    The dataset contains 7043 rows and 21 columns (including the customer ID and the 'Churn' target).

In [None]:
# Print each column's dtype and non-null count to spot columns that need conversion.
df.info()

    Most of the columns are non-numeric and need to be encoded as numbers. The 'Churn' column is the target.

    TotalCharges is stored as text, so convert it to a numeric dtype and then check the raw data for missing values.

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce'), so they show up in the per-column null counts below.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric
df.isna().sum()

In [None]:
# Remove the rows whose TotalCharges could not be parsed.
# dropna replaces the earlier "fill with 0, then filter out 0" approach:
#   * `df[col].fillna(..., inplace=True)` is chained in-place assignment, which
#     warns in pandas 2.x and silently does nothing under Copy-on-Write, leaving
#     the NaN rows in place (NaN != 0 is True, so the filter keeps them);
#   * using 0 as a sentinel would also discard any genuine zero charge.
# .copy() makes the result an independent frame so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df = df.dropna(subset=['TotalCharges']).copy()

    11 rows have a null TotalCharges. Since this is a very small share of the data, those rows are removed.

    A look at the non-numeric columns and their unique values

In [None]:
# Object-dtype columns, excluding the customerID identifier; print the
# distinct values of each one.
obj_cols = df.select_dtypes(include='O').columns.drop(['customerID'])
for name in obj_cols:
    distinct_values = df[name].unique()
    print(name, ':', distinct_values)

# Data Visualization

In [None]:
# One count plot per categorical feature, with bars split by the Churn value.
cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'InternetService', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]
for feature in cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.set_title(feature)
    sns.countplot(data=df, x=feature, hue='Churn', ax=ax)
    plt.show()

    Observations from the plots above:
    
    Customers paying by electronic check are more likely to churn.
    Customers with paperless billing are more likely to churn.
    Customers on short-term contracts are more likely to churn.
    Customers with fiber optic internet service have a high churn rate.

In [None]:
# Correlation heatmap of the categorical features: each column is
# integer-coded with pd.factorize before computing pairwise correlations.
excluded = ['customerID', 'Churn', 'TotalCharges', 'tenure', 'MonthlyCharges']
df1 = df.drop(columns=excluded)
corr = df1.apply(lambda series: pd.factorize(series)[0]).corr()

fig, ax = plt.subplots(figsize=(24, 12))
ax = sns.heatmap(
    corr,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=.2,
    cmap="YlGnBu",
    ax=ax,
)


    PhoneService and MultipleLines are correlated. InternetService is related to OnlineSecurity, OnlineBackup,
    DeviceProtection, TechSupport, StreamingTV and StreamingMovies.

    Encode the non-numeric values into numeric labels

In [None]:
# Encode gender as 0/1. Note: re-running this cell on already-encoded data
# maps every value to NaN, because the keys are the original strings.
gender_codes = {'Female': 0, 'Male': 1}
df['gender'] = df['gender'].map(gender_codes)
df['gender'].unique()

In [None]:
# Encode the Yes/No columns (including the Churn target) as 1/0 and print
# the resulting distinct values of each as a sanity check.
yes_no_codes = {'Yes': 1, 'No': 0}
cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for c in cols:
    encoded = df[c].map(yes_no_codes)
    df[c] = encoded
    print(c, ':', encoded.unique())

    Looking into the column PhoneService and MultipleLines

In [None]:
# Rows where PhoneService is 0 and MultipleLines says 'No phone service'.
without_phone = df['PhoneService'] == 0
lines_flagged_no_phone = df['MultipleLines'] == 'No phone service'
df[without_phone & lines_flagged_no_phone]

It seems redundant to have 'No phone service' in MultipleLines, as PhoneService (0 or 1) already conveys the same information. We can use dummy (one-hot) encoding to get a separate column per value and then delete the unwanted one.

In [None]:
# One-hot encode MultipleLines, then discard the 'No phone service'
# indicator column.
multiple_lines_dummies = pd.get_dummies(df['MultipleLines'], prefix='MultipleLines')
df = pd.concat([df.drop(columns='MultipleLines'), multiple_lines_dummies], axis=1)
df = df.drop(columns='MultipleLines_No phone service')
df.head()

In [None]:
# Encode InternetService with a distinct code per service type.
# Previously 'DSL' and 'Fiber optic' both mapped to 1, which erased the
# DSL-vs-fiber difference that the count plots above show is related to churn.
# 0 still means "no internet service", so the 'No internet service' dummy
# columns dropped later remain redundant with this column.
internet_service_codes = {'No': 0, 'DSL': 1, 'Fiber optic': 2}
df['InternetService'] = df['InternetService'].map(internet_service_codes)

    Several columns depend on InternetService. One-hot encode them and drop the resulting 'No internet service'
    columns, which are redundant because InternetService already records whether the customer has internet.

In [None]:
# One-hot encode every internet add-on column in a single call; the dummy
# columns are appended in the order listed in `cols`.
cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df = pd.get_dummies(df, columns=cols)

# Discard the '<column>_No internet service' indicator of each add-on.
no_internet_dummies = [addon + '_No internet service' for addon in cols]
df = df.drop(columns=no_internet_dummies)

    Encoding the Contract and PaymentMethod

In [None]:
# Integer-code the contract length and the payment method.
contract_codes = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
payment_codes = {
    'Electronic check': 0,
    'Mailed check': 1,
    'Bank transfer (automatic)': 2,
    'Credit card (automatic)': 3,
}
df['Contract'] = df['Contract'].map(contract_codes)
df['PaymentMethod'] = df['PaymentMethod'].map(payment_codes)

In [None]:
# Bucket tenure into the ranges named by the labels.
# `bins=4` asked pandas for four equal-width intervals over the observed
# tenure range, so the real cut points did not match the 20/40/60 labels.
# Explicit edges make the labels accurate. right=False gives left-closed
# intervals [0, 20), [20, 40), [40, 60), [60, inf), so a tenure of exactly 60
# falls into the '>60' bucket.
tenure_edges = [0, 20, 40, 60, np.inf]
tenure_labels = ['<20', '20-40', '40-60', '>60']
df['Tenure'] = pd.cut(df['tenure'], bins=tenure_edges, labels=tenure_labels, right=False)
df = df.drop(columns=['tenure'])
df['Tenure'].value_counts()


In [None]:
# Replace the binned Tenure column with one indicator column per bucket.
tenure_dummies = pd.get_dummies(df['Tenure'], prefix='Tenure')
df = pd.concat([df.drop(columns='Tenure'), tenure_dummies], axis=1)

In [None]:
# Standardise the two continuous charge columns. The scaler is re-fitted for
# each column, so each one is scaled independently.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test-set statistics leak into the features; consider fitting on
# the training rows only. MinMaxScaler is imported as an alternative scaler.
sc = StandardScaler()
for charge_col in ('MonthlyCharges', 'TotalCharges'):
    df[charge_col] = sc.fit_transform(df[[charge_col]].values)


In [None]:
# Target is the Churn column; features are everything except the identifier
# and the target. The class counts of the target are displayed below.
y = df['Churn']
X = df.drop(columns=['customerID', 'Churn'])
y.value_counts()

    The data looks imbalanced, as there are fewer churned customers than retained ones.

In [None]:
# Hold out 30% of the rows for testing, stratified on the target so both
# splits keep the same churn ratio. A fixed random_state makes the split, and
# therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)


    Checking with Decision Tree, RandomForest and LogisticRegression (without tuning the model parameters)

In [None]:
# Baseline decision tree with default hyper-parameters (seeded for
# reproducibility).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
dt_proba = dt_model.predict_proba(X_test)[:, 1]

dt_roc_auc_score_default = roc_auc_score(y_test, dt_proba)
dt_accuracy_default = accuracy_score(y_test, dt_pred)

print(dt_accuracy_default)
print(confusion_matrix(y_test, dt_pred))

In [None]:
# Baseline random forest with default hyper-parameters (seeded for
# reproducibility).
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
rfc_proba = rfc_model.predict_proba(X_test)[:, 1]

rfc_roc_auc_score_default = roc_auc_score(y_test, rfc_proba)
rfc_accuracy_default = accuracy_score(y_test, rfc_pred)

print(rfc_accuracy_default)
print(confusion_matrix(y_test, rfc_pred))

In [None]:
# Baseline logistic regression with default hyper-parameters.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
log_proba = log_model.predict_proba(X_test)[:, 1]

log_roc_auc_score_default = roc_auc_score(y_test, log_proba)
log_accuracy_default = accuracy_score(y_test, log_pred)

print(log_accuracy_default)
print(confusion_matrix(y_test, log_pred))

    Exploring the important features

    Parameter Tuning

In [None]:
# Cross-validated grid search over the split criterion and tree depth.
# The estimator is seeded so the selected parameters are reproducible;
# an unseeded tree can pick different parameters on each run.
param_grid = {
    "criterion": ['gini', 'entropy'],
    "max_depth": [5, 10, 15, 20],
}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, verbose=True)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

In [None]:
# Refit a decision tree with the parameters chosen by the grid search in the
# previous cell. `best_param` must still hold the decision-tree result, so run
# this cell immediately after that search.
dt_model = DecisionTreeClassifier(
    criterion=best_param['criterion'],
    max_depth=best_param['max_depth'],
    random_state=42,
)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
dt_proba = dt_model.predict_proba(X_test)[:, 1]
dt_roc_auc_score = roc_auc_score(y_test, dt_proba)
dt_accuracy = accuracy_score(y_test, dt_pred)

print(dt_accuracy)

print(confusion_matrix(y_test, dt_pred))

In [None]:
# Cross-validated grid search over forest size, depth and minimum split/leaf
# sizes. The estimator is seeded so the selected parameters are reproducible;
# an unseeded forest can pick different parameters on each run.
param_grid = {
    "n_estimators": [5, 20, 50],
    'max_depth': range(5, 16, 5),              # 5, 10, 15
    'min_samples_split': range(200, 1001, 500),  # 200, 700
    'min_samples_leaf': range(30, 71, 20),     # 30, 50, 70
}
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, verbose=True)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

In [None]:
# Refit a random forest with the parameters chosen by the grid search in the
# previous cell. `best_param` must still hold the random-forest result, so run
# this cell immediately after that search.
rfc_model = RandomForestClassifier(
    max_depth=best_param['max_depth'],
    min_samples_leaf=best_param['min_samples_leaf'],
    min_samples_split=best_param['min_samples_split'],
    n_estimators=best_param['n_estimators'],
    random_state=42,
)
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
rfc_proba = rfc_model.predict_proba(X_test)[:, 1]
rfc_roc_auc_score = roc_auc_score(y_test, rfc_proba)
rfc_accuracy = accuracy_score(y_test, rfc_pred)
print(rfc_accuracy)

print(confusion_matrix(y_test, rfc_pred))

In [None]:
# Cross-validated search over the regularisation strength C, using 20
# log-spaced candidates from 1e-3 to 1.
param_grid = dict(C=np.logspace(-3, 0, 20))
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid)
grid.fit(X_train, y_train)
best_param = grid.best_params_
best_param

In [None]:
# Refit logistic regression with the C chosen by the grid search in the
# previous cell. `best_param` must still hold the logistic-regression result,
# so run this cell immediately after that search.
log_model = LogisticRegression(C=best_param['C'])
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class (column 1), not from hard 0/1 labels.
log_proba = log_model.predict_proba(X_test)[:, 1]
log_roc_auc_score = roc_auc_score(y_test, log_proba)
log_accuracy = accuracy_score(y_test, log_pred)
print(log_roc_auc_score)
print(log_accuracy)
print(confusion_matrix(y_test, log_pred))

In [None]:
# Summary table: one row per model, default vs. tuned ROC AUC and accuracy.
score_columns = [
    'roc_auc_score_default',
    'roc_auc_score_tuned',
    'accuracy_default',
    'accuracy_tuned',
]
scores_by_model = {
    'logit': [log_roc_auc_score_default, log_roc_auc_score, log_accuracy_default, log_accuracy],
    'tree': [dt_roc_auc_score_default, dt_roc_auc_score, dt_accuracy_default, dt_accuracy],
    'forest': [rfc_roc_auc_score_default, rfc_roc_auc_score, rfc_accuracy_default, rfc_accuracy],
}
models_scores = pd.DataFrame.from_dict(scores_by_model, orient='index', columns=score_columns)

models_scores

    Logistic Regression seems to have better metrics for this dataset