## **RADI605: Modern Machine Learning**

### Assignment: Random Forests
**Romen Samuel Rodis Wabina** <br>
Student, PhD Data Science in Healthcare and Clinical Informatics <br>
Clinical Epidemiology and Biostatistics, Faculty of Medicine (Ramathibodi Hospital) <br>
Mahidol University

Note: In case of Python Markdown errors, you may access the assignment through this GitHub [Link](https://github.com/rrwabina/RADI605/blob/main/05%20Adaptive%20Boosting/scripts/assignment.ipynb)

In [67]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from numpy import mean
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from mlxtend.plotting import plot_learning_curves
from statsmodels.stats.outliers_influence import variance_inflation_factor

import random
import warnings
warnings.filterwarnings('ignore')

In [26]:
def convert_binary(df, columns = ['Risk1Yr']):
    for column in columns:
        df[column] = df[column].apply(lambda x: 0 if x == 'F' else 1)
    return df

def load_thoracic(path = '../data/ThoraricSurgery.csv'):
    data = pd.read_csv(path)
    data = data[data.columns[2:]]
    data = data.drop(['PRE6', 'PRE14'], axis = 1)
    label_columns = data.columns[2:12]
    data = convert_binary(data, columns = ['Risk1Yr'])
    data = convert_binary(data, columns = label_columns)
    include_columns = data.columns[0:-1]
    X, y = data[include_columns], data['Risk1Yr']
    X, y = X.to_numpy(), y.to_numpy()
    y[y == 0] = -1
    return X, y, data

X, y, data = load_thoracic()

When features are highly correlated, they contain redundant information, and the Random Forest may use the same feature in many of the trees, leading to overfitting. Additionally, highly correlated features can cause instability in the feature importance scores and make it difficult to interpret the results.

It's always a good idea to check for highly correlated features before training a Random Forest and remove or combine them as needed. This can be done using techniques such as principal component analysis (PCA) or feature selection. By reducing the number of highly correlated features, you can simplify the model and make it more interpretable, and you may also see an improvement in performance.


In [66]:
data.corr()

Unnamed: 0,PRE4,PRE5,PRE7,PRE8,PRE9,PRE10,PRE11,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
PRE4,1.0,0.032975,0.019786,-0.095827,0.055829,-0.05277,-0.100242,-0.115145,-0.009135,-0.035584,-0.012009,-0.060578,-0.290178,-0.046374
PRE5,0.032975,1.0,0.161615,0.102979,0.260073,-0.099914,-0.086103,-0.022251,-0.013617,-0.025088,-0.100853,-0.016509,-0.1159,-0.042841
PRE7,0.019786,0.161615,1.0,0.256225,0.067529,-0.024115,-0.072455,0.022578,-0.017372,-0.034968,-0.077406,-0.017372,0.044789,0.057375
PRE8,-0.095827,0.102979,0.256225,1.0,0.134386,0.081772,0.060393,-0.001471,-0.026886,0.086156,-0.044942,-0.026886,0.086705,0.065785
PRE9,0.055829,0.260073,0.067529,0.134386,1.0,0.049843,-0.072455,-0.042725,-0.017372,0.097572,-0.077406,-0.017372,-0.015331,0.10553
PRE10,-0.05277,-0.099914,-0.024115,0.081772,0.049843,1.0,0.202245,0.016551,0.044101,0.017815,0.200373,-0.026401,0.149589,0.08886
PRE11,-0.100242,-0.086103,-0.072455,0.060393,-0.072455,0.202245,1.0,0.069522,0.058695,0.029726,0.118527,-0.029161,0.208003,0.086467
PRE17,-0.115145,-0.022251,0.022578,-0.001471,-0.042725,0.016551,0.069522,1.0,-0.018543,0.025328,-0.036906,-0.018543,0.085081,0.108974
PRE19,-0.009135,-0.013617,-0.017372,-0.026886,-0.017372,0.044101,0.058695,-0.018543,1.0,-0.008602,0.030496,-0.004274,-0.03032,-0.027347
PRE25,-0.035584,-0.025088,-0.034968,0.086156,0.097572,0.017815,0.029726,0.025328,-0.008602,1.0,0.061386,-0.008602,0.058112,0.037354


In [28]:
np.random.seed(415)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
scaler  = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# x = np.arange(2)
# plt.bar(x, height = [len(y[y == 1]), len(y[y == -1])])
# plt.xticks(x, ['Cervical Cancer', 'No Cervical Cancer'])
# plt.ylabel('Number of Respondents')
# plt.title('Imbalanced dataset')

In [45]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[75  0]
 [19  0]]
              precision    recall  f1-score   support

          -1       0.80      1.00      0.89        75
           1       0.00      0.00      0.00        19

    accuracy                           0.80        94
   macro avg       0.40      0.50      0.44        94
weighted avg       0.64      0.80      0.71        94



In [46]:
smote = SMOTE(sampling_strategy = 'minority', random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [49]:
model = RandomForestClassifier(n_estimators = 1000, random_state = 4, oob_score = True, max_features = 15)
model.fit(X_resampled, y_resampled)
predictions = model.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[66  9]
 [16  3]]
              precision    recall  f1-score   support

          -1       0.80      0.88      0.84        75
           1       0.25      0.16      0.19        19

    accuracy                           0.73        94
   macro avg       0.53      0.52      0.52        94
weighted avg       0.69      0.73      0.71        94



In [51]:
param_grid = { 
                'n_estimators': [10, 50, 100], 
                'criterion': ['gini', 'entropy'],
                'max_depth': np.arange(1, 10),
                'min_samples_split': np.arange(1, 5)
              }
model = RandomForestClassifier()

grid = GridSearchCV(model, param_grid, refit = True)
grid.fit(X_resampled, y_resampled)

print(grid.best_params_)

{'criterion': 'gini', 'max_depth': 9, 'min_samples_split': 4, 'n_estimators': 100}


In [54]:
model = RandomForestClassifier(criterion = 'gini', n_estimators = 100, max_depth = 9, min_samples_split = 4)
model.fit(X_resampled, y_resampled)
yhat = model.predict(X_test)
print(confusion_matrix(y_test, yhat))
print(classification_report(y_test, yhat))

[[55 20]
 [14  5]]
              precision    recall  f1-score   support

          -1       0.80      0.73      0.76        75
           1       0.20      0.26      0.23        19

    accuracy                           0.64        94
   macro avg       0.50      0.50      0.50        94
weighted avg       0.68      0.64      0.66        94



In [64]:
param_grid = { 
                'n_estimators': [10, 50, 100], 
                'criterion': ['gini', 'entropy'],
                'max_depth': np.arange(1, 10),
                'min_samples_split': np.arange(1, 5)
              }
rsearch = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                           param_distributions = param_grid, 
                           cv = 5, n_iter = 10)

rsearch.fit(X_resampled, y_resampled)
print(rsearch.best_params_)

yhat = grid.predict(X_test)
print(confusion_matrix(y_test, yhat))
print(classification_report(y_test, yhat))

{'n_estimators': 100, 'min_samples_split': 4, 'max_depth': 8, 'criterion': 'entropy'}
[[57 18]
 [15  4]]
              precision    recall  f1-score   support

          -1       0.79      0.76      0.78        75
           1       0.18      0.21      0.20        19

    accuracy                           0.65        94
   macro avg       0.49      0.49      0.49        94
weighted avg       0.67      0.65      0.66        94



In [121]:
X, y, data = load_thoracic()

def split_data(X, y, pca_included = False, smote_included = False):
    if pca_included is True:
        pca = PCA(n_components = 10)
        X = pca.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    if smote_included is True:
        smote = SMOTE(sampling_strategy = 'minority', k_neighbors = 5, random_state = 42)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train 
    return X_resampled, y_resampled, X_test, y_test

X_resampled, y_resampled, X_test, y_test = split_data(X, y, pca_included = True, smote_included = True)

In [122]:
param_grid = { 
                'n_estimators': [10, 2000], 
                'criterion': ['gini', 'entropy'],
                'max_depth': np.arange(1, 20),
                'min_samples_split': np.arange(1, 5)
              }
rsearch = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                           param_distributions = param_grid, 
                           cv = 10, n_iter = 10)

rsearch.fit(X_resampled, y_resampled)
print(rsearch.best_params_)

yhat = rsearch.predict(X_test)
print(confusion_matrix(y_test, yhat))
print(classification_report(y_test, yhat))

{'n_estimators': 10, 'min_samples_split': 3, 'max_depth': 12, 'criterion': 'entropy'}
[[65 10]
 [16  3]]
              precision    recall  f1-score   support

          -1       0.80      0.87      0.83        75
           1       0.23      0.16      0.19        19

    accuracy                           0.72        94
   macro avg       0.52      0.51      0.51        94
weighted avg       0.69      0.72      0.70        94

