# Lab | Random Forests

- Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
- Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

## Import Libraries

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Transformation and modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import confusion_matrix, cohen_kappa_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

## Load Dataset

> The dataset loaded has previously been cleaned so we will move on to encoding and performing SMOTE afterwards

In [144]:
churndata = pd.read_csv("cleaned_churndata.csv")
churndata

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.50,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.60,Yes


In [145]:
y = churndata.churn.map({'No': 0, 'Yes':1})
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

### Encode Categoricals

In [162]:
# Separate numericals and categoricals before encoding
numericals = churndata.select_dtypes(np.number)
numericals

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7038,0,24,84.80,1990.50
7039,0,72,103.20,7362.90
7040,0,11,29.60,346.45
7041,1,4,74.40,306.60


In [163]:
categoricals = churndata.select_dtypes(object)
categoricals

Unnamed: 0,gender,partner,dependents,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,churn
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,No
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,No
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,Yes
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,No
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,No
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,No
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,No
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,Yes


In [165]:
# drop target variable which has already been assigned to y during data gathering
categoricals.drop('churn', axis=1, inplace=True)

In [148]:
cat_dumm = pd.get_dummies(categoricals).astype(int) # instead of having True or False, change it to 0s and 1s
cat_dumm

Unnamed: 0,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,phoneservice_No,phoneservice_Yes,onlinesecurity_No,onlinesecurity_No internet service,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,1,0,0,1,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
3,0,1,1,0,1,0,1,0,0,0,...,1,1,0,0,1,0,0,0,1,0
4,1,0,1,0,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
7039,1,0,0,1,0,1,0,1,1,0,...,0,0,0,1,0,0,1,0,1,0
7040,1,0,0,1,0,1,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
7041,0,1,0,1,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0


In [150]:
X = numericals.join(cat_dumm)
X

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,1,29.85,29.85,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.50,0,1,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
3,0,45,42.30,1840.75,0,1,1,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,0,2,70.70,151.65,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,1,0,1,0,1,...,1,0,0,1,0,0,1,0,1,0
7039,0,72,103.20,7362.90,1,0,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0
7040,0,11,29.60,346.45,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0


### SMOTE

In [166]:
# Apply SMOTE on the dataset to balance the data
smote = SMOTE()

X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

churn
0    5174
1    5174
Name: count, dtype: int64

### Train-Test Split

In [167]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, random_state=1, test_size=0.3)

## Model

In [168]:
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

In [169]:
# predictions
sm_predict = rf.predict(X_test)

In [170]:
sm_metrics = classification_report(y_test, sm_predict)
print("Classification report for randomforest:\n", sm_metrics)

Classification report for randomforest:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85      1582
           1       0.85      0.84      0.84      1523

    accuracy                           0.85      3105
   macro avg       0.85      0.85      0.85      3105
weighted avg       0.85      0.85      0.85      3105



> **Note:**
> - Without hyperparameter tuning, we get a relatively good overall model performance as seen above.
> - To practice some of the concepts we've learned, cross validation as well as gridsearch will also be performed below:

### Cross Validation

In [175]:
# Since data imbalance has been taken into account, there would be no problem choosing accuracy as the means of scoring for this cross validation step
scores = cross_val_score(rf, X_train, y_train, cv=20, scoring='accuracy')
scores

array([0.84848485, 0.85399449, 0.83746556, 0.81491713, 0.85359116,
       0.88121547, 0.85635359, 0.82872928, 0.88950276, 0.84530387,
       0.83425414, 0.86464088, 0.81491713, 0.8839779 , 0.87292818,
       0.85359116, 0.81491713, 0.87016575, 0.85635359, 0.86187845])

In [176]:
print("Standard deviation of Accuracy Scores from 20 CVs: {:.3f}".format(np.std(scores)))
print("Minimum Accuracy: {:.3f}".format(min(scores)))
print("Maximum Accuracy: {:.3f}".format(max(scores)))

Standard deviation of Accuracy Scores from 20 CVs: 0.022
Minimum Accuracy: 0.815
Maximum Accuracy: 0.890


### Grid Search

In [177]:
# parameters chosen
param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [5, 20],
    'bootstrap': [True, False],
    'max_depth': [5, 10, 20, 50, None]
    }

# model
model = RandomForestClassifier(random_state = 1)

# grid search
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True, n_jobs=-1)

grid_search.fit(X_train,y_train)
grid_search.best_params_

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 20, 'n_estimators': 20}

In [178]:
rf1 = RandomForestClassifier(random_state=1, bootstrap = True, criterion = 'gini', max_depth = 20, n_estimators = 20)
rf.fit(X_train, y_train)

In [179]:
# predictions
sm_predict1 = rf.predict(X_test)

In [180]:
sm_metrics1 = classification_report(y_test, sm_predict1)
print("Classification report for randomforest:\n", sm_metrics1)

Classification report for randomforest:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85      1582
           1       0.85      0.84      0.84      1523

    accuracy                           0.85      3105
   macro avg       0.85      0.85      0.85      3105
weighted avg       0.85      0.85      0.85      3105



> **Observation:**
> - After setting the model to take into account the best parameters from the grid search, we get the exact same metric scores as applying the model without tuning.