# Regression & Classification Model Comparisons

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Telco customer-churn CSV; the path is relative to the working directory.
df = pd.read_csv('../data_sets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Every column other than Churn is a candidate feature (X, independent variable); y = Churn is the target (dependent variable).

## Exploratory Data Analysis


### Check data for baseline abnormalities

In [5]:
# Check each column's dtype and non-null count for anything unexpected.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


#### Note that "TotalCharges" has dtype object but should be numeric. Type mismatches like this need to be identified and fixed before modeling.


In [6]:
# Parse TotalCharges as numbers. With errors='coerce', any value that cannot
# be parsed becomes NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Re-check dtypes and non-null counts after the conversion.
df.info()

#### TotalCharges now has some null values (entries that could not be parsed as numbers were coerced to NaN). These need to be addressed.

In [7]:
# Drop the rows (not columns) that contain a missing value. There are so few
# of them that imputing is not worth it.
# Reassigning instead of using inplace=True keeps the step explicit and safe
# to re-run.
df = df.dropna(how='any')

In [8]:
# Confirm the cleanup by re-checking row and non-null counts.
df.info()

# 11 records dropped (7043 -> 7032 rows).

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [9]:
# Share of customers in each Churn class, as a percentage of all rows.
# Churn rate ("Yes") = 26.6%
df['Churn'].value_counts().div(len(df)).mul(100)

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

**TODO:** Decide what else should be explored in the EDA.

The business goal is to bring the churn rate down. To support that, build a model that predicts which customers are likely to churn, then work with Sales to retain those customers and decrease the churn rate.

# Model Building

## Setup

In [None]:
# Build the model inputs:
#   X - every column except the customerID identifier and the Churn target
#   y - the Churn labels as an array
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn'].values

## Feature Encoding

In [None]:
# Dummy encoding, step 1: list the feature columns so the ones that need
# encoding can be picked out.
X.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [18]:
# Dummy-encode only the columns listed below. SeniorCitizen, tenure,
# MonthlyCharges and TotalCharges are left out of the list because they do not
# need to be encoded.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]

# drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

In [19]:
# Show one encoded row to check the new dummy columns.
X.head(1)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


## Train-Test Split

In [21]:
# Split the data into training (75%) and test (25%) sets.

from sklearn.model_selection import train_test_split

# random_state pins the shuffle so a fresh Restart & Run All reproduces the
# same split. stratify=y keeps the No/Yes churn proportions the same in the
# training and test sets, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [22]:
# Number of rows in the training split.
len(X_train)

5274

In [23]:
# Number of rows in the test split.
len(X_test)

1758

## Feature Scaling

In [None]:
# Feature scaling: standardise the features using statistics learned from the
# training data only.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Fit on the training set, then apply that same fitted scaler to both splits.
sc.fit(X_train)
X_train_sc = sc.transform(X_train)

# The test set is only transformed, never fitted.
X_test_sc = sc.transform(X_test)

In [25]:
# Inspect the scaled training matrix (now a NumPy array rather than a DataFrame).
X_train_sc

array([[-0.44201385,  0.51956435, -0.74521395, ..., -0.52101451,
        -0.70861565, -0.55033227],
       [-0.44201385,  1.21015497,  1.3151628 , ..., -0.52101451,
         1.41120224, -0.55033227],
       [-0.44201385, -0.37414117,  0.34146863, ..., -0.52101451,
        -0.70861565, -0.55033227],
       ...,
       [ 2.26237253,  1.16953199,  0.57907659, ..., -0.52101451,
         1.41120224, -0.55033227],
       [-0.44201385,  0.56018732, -1.32178713, ..., -0.52101451,
        -0.70861565,  1.81708405],
       [-0.44201385, -0.49601011,  1.02272223, ..., -0.52101451,
        -0.70861565, -0.55033227]], shape=(5274, 30))