In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
import warnings 
warnings.filterwarnings('ignore')

In this exercise we will go back to the customer churn data from the last lab (the dataset can also be found in the files_for_activities folder)

Implement cross validation along with logistic regression and decision tree classifier on the data

Create a pipeline as shown in the class example
Note: you can directly use the upsampled data from SMOTE technique. You can use the following code to set up for this activity

In [9]:
churnData = pd.read_csv('files_for_lab/Customer-Churn.csv')
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce') # turn to numeric
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges'])) # fill with mean

In [39]:
smote = SMOTE() 
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']] 
transformer = StandardScaler().fit(X) 
X = transformer.transform(X) 
y = churnData['Churn'] 
X_sm, y_sm = smote.fit_resample(X, y) # You can actually pass 2 variables
print(y_sm.value_counts())
print('\n')

## Create the models
lgr = LogisticRegression()
dtc = DecisionTreeClassifier()
models = [lgr, dtc]
names = ['Logistic Regression', 'Decision Tree']

## Cross validate score
for i, model in enumerate(models):
    scores = cross_val_score(model, X_sm, y_sm, cv=20)
    print(f'Mean for {names[i]}: {scores.mean()}')
    print(f'Standard deviation for {names[i]}: {scores.std()}')

Yes    5174
No     5174
Name: Churn, dtype: int64


Mean for Logistic Regression: 0.7342553938298619
Standard deviation for Logistic Regression: 0.017466354453190613
Mean for Decision Tree: 0.7779569912548636
Standard deviation for Decision Tree: 0.03627206347818827


In [38]:
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']] 
transformer = StandardScaler().fit(X) 
X = transformer.transform(X) 
y = churnData['Churn'] 
rus = RandomUnderSampler(random_state=30)
X_res, y_res = rus.fit_resample(X, y) # You can actually pass 2 variables
print(y_res.value_counts())
print('\n')

## Create the models
lgr = LogisticRegression()
dtc = DecisionTreeClassifier()
models = [lgr, dtc]
names = ['Logistic Regression', 'Decision Tree']

## Cross validate score
for i, model in enumerate(models):
    scores = cross_val_score(model, X_res, y_res, cv=20)
    print(f'Mean for {names[i]}: {scores.mean()}')
    print(f'Standard deviation for {names[i]}: {scores.std()}')

No     1869
Yes    1869
Name: Churn, dtype: int64


Mean for Logistic Regression: 0.736764705882353
Standard deviation for Logistic Regression: 0.03201107702883474
Mean for Decision Tree: 0.6661218446322812
Standard deviation for Decision Tree: 0.03329976741189365
