In [None]:
## Lab | Random Forests

    ## For this lab, you will be using the CSV files provided: categorical.csv, numerical.csv, target.csv

    ## Instructions
        ## Apply the Random Forest algorithm, this time handling the class imbalance only by oversampling the data with SMOTE.
        ## Note that since SMOTE works on numerical data only, the categorical variables must be encoded first.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Transformation and modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import confusion_matrix, cohen_kappa_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

In [6]:
# Read the cleaned churn dataset; the relative path is resolved against the current working directory.
# NOTE(review): the saved run of this cell ended in FileNotFoundError — confirm where the CSV lives.
churndata = pd.read_csv(filepath_or_buffer="cleaned_churndata.csv")
churndata

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_churndata.csv'

In [None]:
# Binary target: encode the churn label as 0 ('No') / 1 ('Yes')
churn_codes = {'No': 0, 'Yes': 1}
y = churndata['churn'].map(churn_codes)
y

In [None]:
# Separate numericals and categoricals before encoding
numericals = churndata.select_dtypes(np.number)
numericals

In [None]:
categoricals = churndata.select_dtypes(object)
categoricals

In [None]:
# drop target variable which has already been assigned to y during data gathering
categoricals.drop('churn', axis=1, inplace=True)

In [None]:
cat_dumm = pd.get_dummies(categoricals).astype(int) # instead of having True or False, change it to 0s and 1s
cat_dumm

In [None]:
X = numericals.join(cat_dumm)
X

In [None]:
# Apply SMOTE on the dataset to balance the data
smote = SMOTE()

X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

In [None]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, random_state=1, test_size=0.3)

In [None]:
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

In [None]:
# predictions
sm_predict = rf.predict(X_test)

In [None]:
sm_metrics = classification_report(y_test, sm_predict)
print("Classification report for randomforest:\n", sm_metrics)

In [None]:
# Since data imbalance has been taken into account, there would be no problem choosing accuracy as the means of scoring for this cross validation step
scores = cross_val_score(rf, X_train, y_train, cv=20, scoring='accuracy')
scores

In [None]:
print("Standard deviation of Accuracy Scores from 20 CVs: {:.3f}".format(np.std(scores)))
print("Minimum Accuracy: {:.3f}".format(min(scores)))
print("Maximum Accuracy: {:.3f}".format(max(scores)))

In [None]:
# Candidate hyper-parameters: 2 criteria x 2 forest sizes x 2 bootstrap modes x 5 depths
param_grid = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[5, 20],
    bootstrap=[True, False],
    max_depth=[5, 10, 20, 50, None],
)

# Estimator whose hyper-parameters are being searched
model = RandomForestClassifier(random_state=1)

# Exhaustive search over the grid: 5-fold CV, scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
rf1 = RandomForestClassifier(random_state=1, bootstrap = True, criterion = 'gini', max_depth = 20, n_estimators = 20)
rf.fit(X_train, y_train)

In [None]:
# predictions
sm_predict1 = rf.predict(X_test)

In [None]:
sm_metrics1 = classification_report(y_test, sm_predict1)
print("Classification report for randomforest:\n", sm_metrics1)