# Random Forest - Detection of Swallowing Disorders

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import ExcelWriter
import os
import numpy as np
import random
plt.style.use("seaborn")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

### Loading the dataset

In [2]:
healthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/HealthySamples.xlsx")
healthy = healthy[healthy["Sample status"] == "Perfect"]
healthy["Swallow_type"] = "healthy"
unhealthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/UnHealthySamples.xlsx")
unhealthy = unhealthy[unhealthy["Sample status"] == "Perfect"]
unhealthy["Swallow_type"] = "unhealthy"
data = healthy.append(unhealthy)[["Patient", "Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_start", "UES_end", "UES_Duration", "Swallow_type"]]
data["Label"] = np.where(data["Swallow_type"] == "healthy", 0, 1)
data

Unnamed: 0,Patient,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_start,UES_end,UES_Duration,Swallow_type,Label
0,Control 1.txt,192.279698,113.474479,-20.601968,-0.028,1552.752,1553.392,0.640,healthy,0
1,Control 1.txt,167.821012,98.336004,-11.776150,-0.028,1782.184,1782.896,0.712,healthy,0
2,Control 1.txt,171.739528,249.844358,-24.181735,-0.036,611.556,612.284,0.728,healthy,0
3,Control 1.txt,200.377661,107.578393,-16.358434,-0.092,2333.764,2334.412,0.648,healthy,0
4,Control 1.txt,193.675898,105.770199,-25.335317,-0.172,1697.164,1697.784,0.620,healthy,0
...,...,...,...,...,...,...,...,...,...,...
194,HD11A.txt,114.937700,101.913100,-27.384550,0.211,117.157,117.828,0.671,unhealthy,1
195,HD11A.txt,195.568900,97.228870,-14.957330,0.200,163.116,164.111,0.995,unhealthy,1
196,HD11A.txt,200.185000,116.615400,-9.014693,-0.042,124.009,124.538,0.529,unhealthy,1
197,HD11B.txt,209.113400,79.385630,-12.359970,0.192,61.255,61.856,0.601,unhealthy,1


### Correlation matrix of Independent variables

In [3]:
corr = data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_Duration
Sensor1_max,1.0,-0.0346022,-0.038871,0.149575,0.0296155
Sensor2_max,-0.0346022,1.0,-0.124292,0.0283056,0.056447
Sensor3_min,-0.038871,-0.124292,1.0,-0.0584012,0.063469
Latency,0.149575,0.0283056,-0.0584012,1.0,0.316552
UES_Duration,0.0296155,0.056447,0.063469,0.316552,1.0


### Training and Test data split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]], data[["Label"]], test_size=0.3, random_state=0)

### Composition of Training and test data

In [5]:
y_train.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,86
1,84


In [6]:
y_test.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,38
1,36


## Fitting a Random forest Model

In [7]:
model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt')
model.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Comparing Predicted and Actual values

In [8]:
y_pred = model.predict(X_test)

In [9]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Healthy,Healthy
1,Unhealthy,Healthy
2,Healthy,Healthy
3,Healthy,Healthy
4,Unhealthy,Unhealthy
...,...,...
69,Healthy,Healthy
70,Healthy,Healthy
71,Healthy,Healthy
72,Healthy,Healthy


### Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[26 12]
 [11 25]]


<Figure size 504x504 with 0 Axes>

### Model Evaluation metrics

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.68      0.69        38
           1       0.68      0.69      0.68        36

    accuracy                           0.69        74
   macro avg       0.69      0.69      0.69        74
weighted avg       0.69      0.69      0.69        74



# Hyperparameter tuning in Random Forest

## Evaluating best performing parameters

In [12]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
param_grid = {'bootstrap': [True, False],
 'max_depth': [2, 3, 4, 5,10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [13]:
random_search = RandomizedSearchCV(model, param_distributions=param_grid, verbose = 3, cv=6, scoring="roc_auc")

In [14]:
random_search.fit(X_train, y_train)

Fitting 6 folds for each of 10 candidates, totalling 60 fits
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.814, total=   3.0s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.719, total=   2.9s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.643, total=   2.9s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.724, total=   3.1s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.643, total=   3.2s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=80, bootstrap=True, score=0.617, total=   3.1s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.829, total=   2.5s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.705, total=   2.4s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.633, total=   2.5s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.653, total=   2.6s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.755, total=   2.9s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=2, bootstrap=True, score=0.617, total=   2.5s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.776, total=   0.5s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.738, total=   0.5s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.622, total=   0.5s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.740, total=   0.6s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.668, total=   0.6s
[CV] n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=False, score=0.612, total=   0.6s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.810, total=   2.8s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.767, total=   3.0s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.617, total=   3.2s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.689, total=   4.0s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.658, total=   3.8s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=5, bootstrap=True, score=0.617, total=   2.8s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.810, total=   3.0s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.724, total=   2.8s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.648, total=   2.7s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.699, total=   2.6s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.663, total=   2.3s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, score=0.602, total=   2.6s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.800, total=   1.4s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.729, total=   1.4s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.638, total=   1.4s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.724, total=   1.4s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.673, total=   1.5s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True, score=0.587, total=   1.4s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.819, total=   0.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.719, total=   0.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.607, total=   0.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.699, total=   0.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.668, total=   0.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, score=0.612, total=   0.6s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.767, total=   1.2s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.738, total=   1.1s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.628, total=   1.2s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.745, total=   1.2s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.658, total=   1.4s
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1000, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=50, bootstrap=False, score=0.612, total=   1.5s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.800, total=   0.4s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.710, total=   0.4s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.602, total=   0.3s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.628, total=   0.3s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.714, total=   0.4s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=2, bootstrap=True, score=0.633, total=   0.4s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.795, total=   2.5s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.743, total=   2.5s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.638, total=   2.1s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.704, total=   2.1s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.658, total=   2.2s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=70, bootstrap=True, score=0.602, total=   2.4s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=6, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='sqrt',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [15]:
random_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Fitting a Random Forest with best performing parameters

In [16]:
model = random_search.best_estimator_
model.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Comparing Predicted and Actual values

In [17]:
y_pred = model.predict(X_test)

In [18]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Unhealthy,Healthy
1,Unhealthy,Healthy
2,Healthy,Healthy
3,Unhealthy,Healthy
4,Unhealthy,Unhealthy
...,...,...
69,Unhealthy,Healthy
70,Healthy,Healthy
71,Unhealthy,Healthy
72,Unhealthy,Healthy


## Recalculating Model Evaluation metrics for Model fitted with best performing parameters

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.53      0.58        38
           1       0.58      0.69      0.63        36

    accuracy                           0.61        74
   macro avg       0.61      0.61      0.61        74
weighted avg       0.61      0.61      0.61        74



In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[20 18]
 [11 25]]


<Figure size 504x504 with 0 Axes>

## Changing threshold to achieve 100% detection of Unhealthy patients

In [68]:
THRESHOLD = 0.39
y_pred = np.where(model.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

## Reevaluating Model Evaluation metrics with new threshold

In [71]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.21      0.35        38
           1       0.55      1.00      0.71        36

    accuracy                           0.59        74
   macro avg       0.77      0.61      0.53        74
weighted avg       0.78      0.59      0.52        74



In [72]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[ 8 30]
 [ 0 36]]


<Figure size 504x504 with 0 Axes>