# Regression & Classification Model Comparisons

## Setup

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
df = pd.read_csv('../data_sets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [46]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [47]:
# All are X or dependent variables, y = churn = independent

## Exploratory Data Analysis


### Check data for baseline abnormalities

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


#### See that "TotalCharges" is an object, should be numerical. This type of stuff needs to be identified and addressed.


In [49]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

df.info()

#### See that TotalCharges has some null values. These need to be addressed.

In [50]:
# In this case, just drop these columns because there are so few. Not going to impute them.
df.dropna(how='any', inplace=True)

In [51]:
df.info()

# 11 records dropped.

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [52]:
df.Churn.value_counts()/len(df)*100

# percent churn yes/no
# Churn rate = 26.6%

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

# ??? WHAT ELSE CAN BE EXPLORED IN EDA ???

The goal of ML is to get the churn rate down. Create a model in such a way to predict the churn rate. Then work with Sales to decrease the churn rate.

# Model Building

## Setup

In [53]:
X=df.drop(['customerID', 'Churn'], axis=1)
y=df.Churn.values

# create X and y variables, remove unneeded columns for a faster model.

## Feature Encoding

In [54]:
# dummy encoding
X.columns
# get the columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [55]:
# remove SeniorCitizen, tenure, etc. that do not need to be encoded

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

In [56]:
X.head(1)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


## Train-Test Split

In [57]:
# Splitting the data into train-test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [58]:
len(X_train)

5274

In [59]:
len(X_test)

1758

## Feature Scaling

In [60]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

# Fit transform on training data
# only transform on test data

In [61]:
X_train_sc

array([[-0.44017364,  0.1083141 , -1.45377843, ..., -0.52982902,
        -0.71769205,  1.84953722],
       [ 2.27183075,  0.2306808 ,  0.62148395, ..., -0.52982902,
         1.39335528, -0.54067579],
       [-0.44017364,  1.00566993, -0.16734874, ..., -0.52982902,
         1.39335528, -0.54067579],
       ...,
       [ 2.27183075,  0.59778091,  0.5183033 , ..., -0.52982902,
         1.39335528, -0.54067579],
       [-0.44017364,  0.43462531, -0.28717143, ..., -0.52982902,
         1.39335528, -0.54067579],
       [-0.44017364,  0.80172542,  1.54678137, ...,  1.88740132,
        -0.71769205, -0.54067579]], shape=(5274, 30))

## kNN Classifier

In [62]:
# Call the kNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Initiating the classifier
model = KNeighborsClassifier()

# Passing the data to the classifier
model.fit(X_train_sc, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [63]:
y_pred = model.predict(X_test_sc)

In [64]:
y_pred

array(['No', 'No', 'No', ..., 'No', 'No', 'No'],
      shape=(1758,), dtype=object)

In [65]:
y_test

array(['No', 'No', 'No', ..., 'No', 'No', 'Yes'],
      shape=(1758,), dtype=object)

In [66]:
# Check accuracy
# Classification metrics = to check how the model is behaving

from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred)*100)
## the kNN classifier model is 77% accurate

76.16609783845279


## Test new data prediction

In [67]:
X_test_sc

array([[-0.44017364,  0.149103  ,  0.99925825, ...,  1.88740132,
        -0.71769205, -0.54067579],
       [-0.44017364,  1.20961443,  0.70968676, ..., -0.52982902,
         1.39335528, -0.54067579],
       [-0.44017364,  0.80172542,  1.18232069, ..., -0.52982902,
         1.39335528, -0.54067579],
       ...,
       [-0.44017364,  1.04645883,  0.0157137 , ...,  1.88740132,
        -0.71769205, -0.54067579],
       [ 2.27183075,  1.25040333,  1.48520582, ..., -0.52982902,
        -0.71769205, -0.54067579],
       [ 2.27183075, -0.99298623,  0.90939124, ..., -0.52982902,
         1.39335528, -0.54067579]], shape=(1758, 30))

In [68]:
X_test.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [69]:
data = [[0,2,87,178,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1]]
data_sc = sc.transform(data)
single = model.predict(data_sc)
print(single)
# YES churn prediction

['Yes']




In [70]:
data_sc = sc.transform(data)
single = model.predict_proba(data_sc)
print(single)

[[0.2 0.8]]




## Decision Tree Classifier

In [75]:
# Call the classifier
from sklearn.tree import DecisionTreeClassifier

# Initiate teh classifier
model_dt = DecisionTreeClassifier()

# Pass the data to the classifier
model_dt.fit(X_train_sc, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [76]:
y_pred_dt = model_dt.predict(X_test_sc)

In [77]:
y_pred_dt

array(['No', 'No', 'No', ..., 'No', 'No', 'Yes'],
      shape=(1758,), dtype=object)

In [78]:
# Classification metrocs = to check how the model is behaving

# from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred_dt)*100)

72.46871444823664


In [None]:
# for this test, kNN > DT

## Random Forest Classifier

In [None]:
# call the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Initiate the classifier
model_rf = RandomForestClassifier(n_estimators=200) # 200 trees

# Pass the data to teh classifier
model_rf.fit(X_train_sc, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [81]:
y_pred_rf = model_rf.predict(X_test_sc)

In [82]:
# Classification metrics = check how the model is behaving
# from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred_rf)*100)

79.18088737201366


In [83]:
# for this test, RF > kNN > DT

### Recall and Precision Score, Classification Report

In [94]:
from sklearn.metrics import recall_score, precision_score, classification_report

knn_recall = recall_score(y_test, y_pred, pos_label='Yes')
dt_recall = recall_score(y_test, y_pred_dt, pos_label='Yes')
rf_recall = recall_score(y_test, y_pred_rf,pos_label='Yes')

# Will error because these values need to be converted into 1's and 0's instead of Yes/No this is what the pos_label is for

In [88]:
knn_report = classification_report(y_test, y_pred)
dt_report = classification_report(y_test, y_pred_dt)
rf_report = classification_report(y_test, y_pred_rf)

In [95]:
print(knn_report)

              precision    recall  f1-score   support

          No       0.82      0.86      0.84      1277
         Yes       0.57      0.51      0.54       481

    accuracy                           0.76      1758
   macro avg       0.70      0.68      0.69      1758
weighted avg       0.75      0.76      0.76      1758



In [None]:
print(knn_recall) # same answer as the "Yes" for recall table above

0.5093555093555093


In [97]:
print('dt report', dt_report)

dt report               precision    recall  f1-score   support

          No       0.81      0.82      0.81      1277
         Yes       0.50      0.48      0.49       481

    accuracy                           0.72      1758
   macro avg       0.65      0.65      0.65      1758
weighted avg       0.72      0.72      0.72      1758



In [98]:
# low precision and accuracy because the data is imbalanced. there are 73% active users
# vs. 27% churn users, so it overlearned on active users.

In [None]:
# In this case, accuracy is basically useless, because 73% of users are active, 
# so a prediction that 73% of users will not churn it is just stating the obvious.

# The model learned, “Most people don’t churn → predict No” This is not overfitting, but 'class prior bias'.

|Classifiers|Accuracy|Recall|Precision|f1|ROC AUC|
|---|---|---|---|---|---|
|kNN|77|52|56|54||
|decision tree|74|55|50|52||
|random forest|80|52|66|58||

### ROC AUC