# Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Prepare data for model building

In [2]:
df = pd.read_csv('../../data_sets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# TotalCharges is an object, however should be a number
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
# shows only 11 records null for TotalCharges, so just get rid of them

# drop any null values
df.dropna(how='any', inplace=True)

In [8]:
df.Churn.value_counts()/len(df)*100
# see that 73% stay, 27% churn

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

In [9]:
# Divide data into x and y variables. y = churn

X = df.drop(['customerID', 'Churn'], axis=1)
y = df.Churn.values

In [10]:
# Feature Encoding: Convert categorical features into numericals
X.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [11]:
X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents','PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

In [12]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False


In [13]:
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

In [14]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


In [15]:
# Divide data into Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [16]:
# Feature Scaling: Standardation and Normalation

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [17]:
X_train_sc

array([[ 2.27341616,  1.53020458,  0.84910037, ...,  1.92695011,
        -0.7128463 , -0.54916251],
       [ 2.27341616,  0.83787706,  1.30801175, ..., -0.5189548 ,
         1.40282696, -0.54916251],
       [-0.43986667,  0.96005251,  0.99541994, ..., -0.5189548 ,
         1.40282696, -0.54916251],
       ...,
       [-0.43986667,  1.32657884,  0.67617724, ..., -0.5189548 ,
        -0.7128463 , -0.54916251],
       [-0.43986667, -1.15765519, -0.25827274, ..., -0.5189548 ,
         1.40282696, -0.54916251],
       [-0.43986667,  1.24512854, -1.48203643, ..., -0.5189548 ,
        -0.7128463 , -0.54916251]], shape=(5274, 30))

## kNN Classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(X_train_sc, y_train)

KNeighborsClassifier()

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [19]:
y_pred = model.predict(X_test_sc)

In [20]:
y_pred

array(['No', 'No', 'No', ..., 'Yes', 'No', 'Yes'],
      shape=(1758,), dtype=object)

In [21]:
y_test

array(['No', 'No', 'Yes', ..., 'Yes', 'No', 'No'],
      shape=(1758,), dtype=object)

In [22]:
# calculate accuracy. basic model, no parameter optimization

from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred)*100)

75.76791808873719


### Making predictions from new data

In [23]:
X_test

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
5451,0,71,95.65,6856.95,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
3151,0,15,75.10,1151.55,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1428,0,1,50.45,50.45,1,0,0,1,0,1,...,0,0,0,0,0,0,1,0,1,0
2346,0,71,95.75,6849.40,0,1,1,1,0,1,...,0,0,0,1,0,1,1,1,0,0
4485,0,66,91.70,6075.90,1,1,0,1,0,1,...,0,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5929,1,62,89.10,5618.30,1,1,0,1,0,1,...,0,0,0,0,0,0,1,0,1,0
152,0,70,108.15,7930.55,0,1,1,1,0,1,...,0,1,0,1,1,0,1,1,0,0
2900,1,1,69.25,69.25,0,1,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
132,0,67,50.55,3260.10,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [24]:
data = [[0,2,87,178,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1]]
data_sc = sc.transform(data)
single = model.predict(data_sc)
print(single)

['Yes']




In [25]:
# probability prediction
probability = model.predict_proba(data_sc)
print(probability)

# 80% chance of customer churning

[[0.2 0.8]]


## Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()

model_dt.fit(X_train_sc, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [27]:
y_pred_dt = model_dt.predict(X_test_sc)

In [28]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred_dt)*100)

# scores worse than kNN without any hyperparameter optimization

72.75312855517633


## SVM

In [29]:
from sklearn.svm import SVC

# create and train model
model_svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)  
model_svm.fit(X_train_sc, y_train)

# predict
y_pred_svm = model_svm.predict(X_test_sc)

# evaluate
accuracy = accuracy_score(y_test, y_pred_svm) * 100
print(f"SVM Accuracy: {accuracy:.2f}%")


SVM Accuracy: 79.92%


## Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=200)

model_rf.fit(X_train_sc, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [44]:
y_pred_rf = model_rf.predict(X_test_sc)

In [45]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred_rf)*100)

79.57906712172924
