In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as py
%matplotlib inline
pd.options.display.max_columns = None

## Data Preprocessing

In [20]:
telco = pd.read_csv("telco_customer_churn.csv")
#first few rows
telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [21]:
# Replace ' ' to null valuess and drop data point with null values
telco["TotalCharges"] = telco["TotalCharges"].replace(' ', np.nan)
telco.dropna(inplace=True)

# Change TotalCharges column type to float
telco["TotalCharges"] = telco["TotalCharges"].astype(float)
telco = telco.reset_index()[telco.columns]

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
cat_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
target = 'Churn'

def transform(df):
    # Nominal Data preprocessing with dummy variables
    dfNom = pd.get_dummies(df[cat_features])
    
    # Numerical Data preprocessing with StandardScaler
    # mean = 0 std = 1
    scaler = StandardScaler()
    dfNum = pd.DataFrame(scaler.fit_transform(df[num_features]),columns=num_features)
    
    # Transform target data with LabelEncoder
    le = LabelEncoder()
    dfY = pd.DataFrame(le.fit_transform(df[target]),columns=[target])
    
    # Concat all the dataframe
    output = pd.concat([dfNom,dfNum,dfY],axis='columns')
    return output

In [23]:
telco = transform(telco)
telco.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,-1.280248,-1.161694,-0.994194,0
1,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0.064303,-0.260878,-0.17374,0
2,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,-1.239504,-0.363923,-0.959649,1
3,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0.512486,-0.74785,-0.195248,0
4,0,1,0,1,0,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,-1.239504,0.196178,-0.940457,1


In [28]:
# Split train and test set
from sklearn.model_selection import train_test_split
X = telco.drop("Churn", axis=1)
y = telco["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 33)

## Model Prediction

In [35]:
# k-Nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [36]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = KNeighborsClassifier()
lr.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [39]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dct = KNeighborsClassifier()
dct.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [43]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
# Accuracy score
print("knn Test set score: {}".format(knn.score(X_train, y_train)))
print("lr Test set score: {}".format(lr.score(X_train, y_train)))
print("dct Test set score: {}".format(dct.score(X_train, y_train)))
print("rf Test set score: {}".format(rf.score(X_train, y_train)))

knn Test set score: 0.8420553659461509
lr Test set score: 0.8420553659461509
dct Test set score: 0.8420553659461509
rf Test set score: 0.9979142965491088


In [42]:
# Accuracy score
print("knn Test set score: {}".format(knn.score(X_test, y_test)))
print("lr Test set score: {}".format(lr.score(X_test, y_test)))
print("dct Test set score: {}".format(dct.score(X_test, y_test)))
print("rf Test set score: {}".format(rf.score(X_test, y_test)))

knn Test set score: 0.7639362912400455
lr Test set score: 0.7639362912400455
dct Test set score: 0.7639362912400455
rf Test set score: 0.7861205915813424
