### Cross Validation

#### This lab builds on the previous lab, "Handling Data Imbalance". The new material starts at the "Lab_Cross Validation" section (line 39).

In [None]:
# Import libraries and Load Data

In [8]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, RocCurveDisplay,classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
# Silence DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings


In [9]:
# Location of the churn dataset. Set the CUSTOMER_CHURN_CSV environment variable to point
# at your own copy of the file; the fallback is the path this notebook was originally run with.
DATA_PATH = os.environ.get("CUSTOMER_CHURN_CSV", "C:/Users/cprieto/Customer-Churn.csv")
df = pd.read_csv(DATA_PATH)

In [10]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [11]:
# Number of rows and columns in the loaded data.
df.shape

(7043, 16)

In [12]:
# Check the datatypes of all the columns in the data.
# TotalCharges is reported as object dtype even though it holds amounts, so it is
# converted to a numeric type with pd.to_numeric in the cells below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [13]:
# Blank strings (' ') are used in the data instead of proper missing values.
# Turn them into NaN so that TotalCharges can be converted to a numeric dtype afterwards
# (the conversion would otherwise fail on those entries).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(' ', np.nan)

In [14]:
# Convert TotalCharges to a numeric dtype, downcast to the smallest float type.
# errors="coerce" turns any value that cannot be parsed into NaN. The previous
# errors="ignore" silently handed back the unconverted column when parsing failed
# and is deprecated in recent pandas releases.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce", downcast="float")

In [15]:
# Confirm the conversion: the column should now have a float dtype.
df["TotalCharges"].dtype

dtype('float32')

In [16]:
# Count the missing values in each column.
df.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [17]:
# TotalCharges still holds a few NaNs, which LogisticRegression would reject with an error.
# They make up less than 1% of the rows, so they are filled with the column mean.
# NOTE(review): the mean is computed on the full dataset, before the train/test split.
total_charges_mean = df["TotalCharges"].mean()
df["TotalCharges"] = df["TotalCharges"].fillna(total_charges_mean)

In [18]:
# Verify that no missing values remain in any column.
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [12]:
#Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

#    Scale the features either by using normalizer or a standard scaler.
#    Split the data into a training set and a test set.
#    Fit a logistic regression model on the training data.
#    Check the accuracy on the test data.


In [19]:
# Feature matrix X: the four numeric columns requested by the exercise.
# Target vector y: the Churn label.
feature_columns = ["tenure", "SeniorCitizen", "MonthlyCharges", "TotalCharges"]
X = df[feature_columns]
y = df["Churn"]

In [20]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)



In [21]:
# Standardise the features. The scaler is fitted on the training split only and then
# applied to both splits, so the test data does not influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Class proportions of the target in the full dataset, to check the imbalance.
y.value_counts()/len(y)


No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [23]:
# Class proportions of the target in the training split.
y_train.value_counts()/len(y_train)

No     0.733582
Yes    0.266418
Name: Churn, dtype: float64

In [24]:
# Class proportions of the target in the test split.
y_test.value_counts()/len(y_test)

No     0.738822
Yes    0.261178
Name: Churn, dtype: float64

In [None]:
# the imbalance in the Churn is preserved by the split


In [25]:
# Baseline: logistic regression with default settings, fitted on the scaled,
# still imbalanced training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [26]:
# Evaluate the baseline model: classification report for the test split first,
# then for the training split.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
for y_true, y_pred in ((y_test, pred_test), (y_train, pred_train)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

              precision    recall  f1-score   support

          No       0.82      0.91      0.86      4133
         Yes       0.65      0.46      0.54      1501

    accuracy                           0.79      5634
   macro avg       0.74      0.68      0.70      5634
weighted avg       0.78      0.79      0.78      5634



In [22]:
# We can see the model learns the "No" class much better than the "Yes" class because of the imbalance


In [23]:
# Managing imbalance in the dataset
#    Check for the imbalance.
#    Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
#    Each time fit the model and see how the accuracy of the model is.


In [27]:
# Class proportions of the target in the full dataset, shown again before resampling.
y.value_counts()/len(y)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [28]:
# Use the resampling strategies from class to balance the two classes.
# We start with upsampling via SMOTE. Only the training split is resampled;
# the test split keeps its original class distribution.
# random_state is fixed because SMOTE generates its synthetic samples randomly:
# without a seed, every run yields a different training set and different scores.
sm = SMOTE(k_neighbors=3, random_state=0)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

In [29]:
# Class proportions after SMOTE: both classes should now be equally represented.
y_train_SMOTE.value_counts()/len(y_train_SMOTE)

No     0.5
Yes    0.5
Name: Churn, dtype: float64

In [30]:
# Refit a default logistic regression, this time on the SMOTE-balanced training data.
model = LogisticRegression()
model.fit(X=X_train_SMOTE, y=y_train_SMOTE)

LogisticRegression()

In [31]:
# Evaluate the SMOTE-trained model: report for the untouched test split first,
# then for the resampled training split.
pred_train_SMOTE = model.predict(X_train_SMOTE)
pred_test_SMOTE = model.predict(X_test)
for y_true, y_pred in ((y_test, pred_test_SMOTE), (y_train_SMOTE, pred_train_SMOTE)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.88      0.71      0.79      1041
         Yes       0.47      0.72      0.57       368

    accuracy                           0.71      1409
   macro avg       0.67      0.71      0.68      1409
weighted avg       0.77      0.71      0.73      1409

              precision    recall  f1-score   support

          No       0.74      0.73      0.73      4133
         Yes       0.73      0.74      0.73      4133

    accuracy                           0.73      8266
   macro avg       0.73      0.73      0.73      8266
weighted avg       0.73      0.73      0.73      8266



In [32]:
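# Overall test accuracy has gone down after balancing (0.78 -> 0.71), while recall on the "Yes" class has gone up (0.46 -> 0.72).
# This is a more realistic picture of what the model can actually predict, because the model trained on the imbalanced data is biased towards "No".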


In [32]:
# Now the downsampling technique: Tomek links.
# With sampling_strategy='all' the resampler is allowed to remove samples from both classes.
ts = TomekLinks(sampling_strategy='all')
X_train_TS, y_train_TS = ts.fit_resample(
    X_train,
    y_train,
)


In [33]:
# Compare the training-set size before and after the Tomek-link removal.
for features in (X_train, X_train_TS):
    print(features.shape)

(5634, 4)
(4826, 4)


In [34]:
# Refit a default logistic regression on the Tomek-links-reduced training data.
model = LogisticRegression()
model.fit(X=X_train_TS, y=y_train_TS)

LogisticRegression()

In [35]:
# Evaluate the Tomek-links model: report for the test split first,
# then for the reduced training split.
pred_test_TS = model.predict(X_test)
pred_train_TS = model.predict(X_train_TS)
for y_true, y_pred in ((y_test, pred_test_TS), (y_train_TS, pred_train_TS)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

              precision    recall  f1-score   support

          No       0.87      0.93      0.90      3729
         Yes       0.69      0.52      0.60      1097

    accuracy                           0.84      4826
   macro avg       0.78      0.73      0.75      4826
weighted avg       0.83      0.84      0.83      4826



In [1]:
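# Training accuracy is better using TomekLinks (0.84 vs 0.79), but that figure is measured on the resampled training set.
# The test report is identical to the baseline (accuracy 0.78, "Yes" recall 0.46), so this imbalance correction has not improved the model on unseen data.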

## Lab_Cross Validation

In [3]:
# 1.Apply SMOTE for upsampling the data

#    Use logistic regression to fit the model and compute the accuracy of the model.



In [None]:
# We have already used SMOTE on this data set above, so the resampled training data is reused here

In [None]:
# Logistic regression without SMOTE


In [37]:
# Baseline for comparison: default logistic regression on the original (not resampled)
# training split, with reports for the test split and then the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
for y_true, y_pred in ((y_test, pred_test), (y_train, pred_train)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

              precision    recall  f1-score   support

          No       0.82      0.91      0.86      4133
         Yes       0.65      0.46      0.54      1501

    accuracy                           0.79      5634
   macro avg       0.74      0.68      0.70      5634
weighted avg       0.78      0.79      0.78      5634



In [38]:
# Logistic Regression with SMOTE

In [42]:
# Default logistic regression fitted on the SMOTE-balanced training data.
model_log = LogisticRegression()
model_log.fit(X=X_train_SMOTE, y=y_train_SMOTE)

LogisticRegression()

In [43]:
# Reports for the SMOTE logistic regression: the test split first,
# then the resampled training split.
pred_train_SMOTE = model_log.predict(X_train_SMOTE)
pred_test_SMOTE = model_log.predict(X_test)
for y_true, y_pred in ((y_test, pred_test_SMOTE), (y_train_SMOTE, pred_train_SMOTE)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.88      0.71      0.79      1041
         Yes       0.47      0.72      0.57       368

    accuracy                           0.71      1409
   macro avg       0.67      0.71      0.68      1409
weighted avg       0.77      0.71      0.73      1409

              precision    recall  f1-score   support

          No       0.74      0.73      0.73      4133
         Yes       0.73      0.74      0.73      4133

    accuracy                           0.73      8266
   macro avg       0.73      0.73      0.73      8266
weighted avg       0.73      0.73      0.73      8266



In [47]:
# Use a decision tree classifier to fit the model and compute the accuracy of the model.
# NOTE(review): this import would normally sit with the other imports at the top of the notebook.

from sklearn.tree import DecisionTreeClassifier

In [48]:
# Decision tree on the SMOTE-balanced training data. min_samples_leaf=20 requires at
# least 20 training samples in every leaf, which limits how far the tree can grow.
# random_state is fixed because the tree permutes the features at each split, so
# equally good splits can otherwise be resolved differently from run to run.
model_dctree = DecisionTreeClassifier(min_samples_leaf=20, random_state=0)
model_dctree.fit(X_train_SMOTE,y_train_SMOTE)

DecisionTreeClassifier(min_samples_leaf=20)

In [50]:
# Reports for the SMOTE decision tree: the test split first,
# then the resampled training split.
pred_train_SMOTE_dctree = model_dctree.predict(X_train_SMOTE)
pred_test_SMOTE_dctree = model_dctree.predict(X_test)
for y_true, y_pred in (
    (y_test, pred_test_SMOTE_dctree),
    (y_train_SMOTE, pred_train_SMOTE_dctree),
):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.86      0.75      0.80      1041
         Yes       0.47      0.64      0.55       368

    accuracy                           0.72      1409
   macro avg       0.67      0.70      0.67      1409
weighted avg       0.76      0.72      0.73      1409

              precision    recall  f1-score   support

          No       0.81      0.80      0.80      4133
         Yes       0.80      0.81      0.81      4133

    accuracy                           0.80      8266
   macro avg       0.80      0.80      0.80      8266
weighted avg       0.80      0.80      0.80      8266



In [51]:
#    Compare the accuracies of the two models.
# Logistic regression provides lower accuracy but less overfitting.
# Both methods still learn "No" values much better than "Yes". 
# SMOTE doesn't seem to fix the imbalance issue

In [None]:
# 2. Apply TomekLinks for downsampling

In [52]:
# Logistic regression with TomekLinks: default model fitted on the
# Tomek-links-reduced training data.
model_log_T = LogisticRegression()
model_log_T.fit(X=X_train_TS, y=y_train_TS)

LogisticRegression()

In [53]:
# Reports for the TomekLinks logistic regression: the test split first,
# then the reduced training split.
pred_test_TS = model_log_T.predict(X_test)
pred_train_TS = model_log_T.predict(X_train_TS)
for y_true, y_pred in ((y_test, pred_test_TS), (y_train_TS, pred_train_TS)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.62      0.46      0.53       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

              precision    recall  f1-score   support

          No       0.87      0.93      0.90      3729
         Yes       0.69      0.52      0.60      1097

    accuracy                           0.84      4826
   macro avg       0.78      0.73      0.75      4826
weighted avg       0.83      0.84      0.83      4826



In [None]:
# DecisionTrees With Tomeklinks

In [54]:
# Decision tree on the Tomek-links-reduced training data. min_samples_leaf=20 requires
# at least 20 training samples in every leaf, which limits how far the tree can grow.
# random_state is fixed because the tree permutes the features at each split, so
# equally good splits can otherwise be resolved differently from run to run.
model_dctree_T = DecisionTreeClassifier(min_samples_leaf=20, random_state=0)
model_dctree_T.fit(X_train_TS,y_train_TS)

DecisionTreeClassifier(min_samples_leaf=20)

In [55]:
# Reports for the TomekLinks decision tree: the test split first,
# then the reduced training split.
pred_train_TS_dctree = model_dctree_T.predict(X_train_TS)
pred_test_TS_dctree = model_dctree_T.predict(X_test)
for y_true, y_pred in (
    (y_test, pred_test_TS_dctree),
    (y_train_TS, pred_train_TS_dctree),
):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          No       0.81      0.91      0.86      1041
         Yes       0.61      0.40      0.49       368

    accuracy                           0.78      1409
   macro avg       0.71      0.66      0.67      1409
weighted avg       0.76      0.78      0.76      1409

              precision    recall  f1-score   support

          No       0.88      0.95      0.92      3729
         Yes       0.77      0.58      0.66      1097

    accuracy                           0.86      4826
   macro avg       0.83      0.76      0.79      4826
weighted avg       0.86      0.86      0.86      4826



In [None]:
#    Compare the accuracies of the two models.
#  Both models still learn the "No" category much better
#  Overfitting is low in both ML methods; this is good news, as decision trees tend to overfit
#  Accuracy is quite similar with both ML methods
#  TomekLinks provides better results than SMOTE overall