# XGBoost Algorithm - Detection of Swallowing Disorders

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import ExcelWriter
import os
import numpy as np
import random
plt.style.use("seaborn")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

## Loading the Dataset

In [2]:
healthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/HealthySamples.xlsx")
healthy = healthy[healthy["Sample status"] == "Perfect"]
healthy["Swallow_type"] = "healthy"
unhealthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/UnHealthySamples.xlsx")
unhealthy = unhealthy[unhealthy["Sample status"] == "Perfect"]
unhealthy["Swallow_type"] = "unhealthy"
data = healthy.append(unhealthy)[["Patient", "Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_start", "UES_end", "UES_Duration", "Swallow_type"]]
data["Label"] = np.where(data["Swallow_type"] == "healthy", 0, 1)
data

Unnamed: 0,Patient,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_start,UES_end,UES_Duration,Swallow_type,Label
0,Control 1.txt,192.279698,113.474479,-20.601968,-0.028,1552.752,1553.392,0.640,healthy,0
1,Control 1.txt,167.821012,98.336004,-11.776150,-0.028,1782.184,1782.896,0.712,healthy,0
2,Control 1.txt,171.739528,249.844358,-24.181735,-0.036,611.556,612.284,0.728,healthy,0
3,Control 1.txt,200.377661,107.578393,-16.358434,-0.092,2333.764,2334.412,0.648,healthy,0
4,Control 1.txt,193.675898,105.770199,-25.335317,-0.172,1697.164,1697.784,0.620,healthy,0
...,...,...,...,...,...,...,...,...,...,...
194,HD11A.txt,114.937700,101.913100,-27.384550,0.211,117.157,117.828,0.671,unhealthy,1
195,HD11A.txt,195.568900,97.228870,-14.957330,0.200,163.116,164.111,0.995,unhealthy,1
196,HD11A.txt,200.185000,116.615400,-9.014693,-0.042,124.009,124.538,0.529,unhealthy,1
197,HD11B.txt,209.113400,79.385630,-12.359970,0.192,61.255,61.856,0.601,unhealthy,1


### Correlation matrix of Independent variables

In [3]:
corr = data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_Duration
Sensor1_max,1.0,-0.0346022,-0.038871,0.149575,0.0296155
Sensor2_max,-0.0346022,1.0,-0.124292,0.0283056,0.056447
Sensor3_min,-0.038871,-0.124292,1.0,-0.0584012,0.063469
Latency,0.149575,0.0283056,-0.0584012,1.0,0.316552
UES_Duration,0.0296155,0.056447,0.063469,0.316552,1.0


### Training and Test data split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]], data[["Label"]], test_size=0.3, random_state=0)

### Composition of Training and test data

In [5]:
y_train.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,86
1,84


In [6]:
y_test.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,38
1,36


## Fitting a XGBoost Model

In [7]:
xgboost = xgboost.XGBClassifier()
xgboost.fit(X_train, y_train)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

### Comparing Predicted and Actual values

In [8]:
y_pred = xgboost.predict(X_test)

In [9]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Healthy,Healthy
1,Unhealthy,Healthy
2,Unhealthy,Healthy
3,Healthy,Healthy
4,Unhealthy,Unhealthy
...,...,...
69,Unhealthy,Healthy
70,Healthy,Healthy
71,Unhealthy,Healthy
72,Unhealthy,Healthy


## Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[21 17]
 [ 8 28]]


<Figure size 504x504 with 0 Axes>

## Model Evaluation metrics

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.55      0.63        38
           1       0.62      0.78      0.69        36

    accuracy                           0.66        74
   macro avg       0.67      0.67      0.66        74
weighted avg       0.67      0.66      0.66        74



# Hyperparameter tuning in XGBoost

## Evaluating best performing parameters

In [12]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
param_grid = {
    'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth': [3,4,5,6,7,8,10,12],
    'min_child_weight': [1,3,5,7],
    'gamma': [0.0,0.1,0.2,0.3,0.4],
    'colsample_bytree': [0.1,0.2,0.3,0.4,0.5,0.7,0.8,0.9,1],
    'n_estimators': [100,200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

In [13]:
random_search = RandomizedSearchCV(xgboost, param_distributions=param_grid, verbose = 3, cv=6,scoring="roc_auc")

In [14]:
random_search.fit(X_train, y_train)

Fitting 6 folds for each of 10 candidates, totalling 60 fits
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.633, total=   0.8s
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.710, total=   0.8s
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.6s remaining:    0.0s
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.459, total=   0.8s
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.668, total=   0.8s
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.561, total=   0.8s
[CV] n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1800, min_child_weight=5, max_depth=12, learning_rate=0.05, gamma=0.0, colsample_bytree=0.3, score=0.628, total=   0.8s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.700, total=   0.6s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.710, total=   0.7s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.571, total=   0.6s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.689, total=   0.6s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.694, total=   0.7s
[CV] n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=7, max_depth=10, learning_rate=0.25, gamma=0.2, colsample_bytree=0.5, score=0.577, total=   0.6s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 
[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.786, total=   0.1s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.700, total=   0.1s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 
[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.551, total=   0.1s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.714, total=   0.1s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 
[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.633, total=   0.1s
[CV] n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=100, min_child_weight=1, max_depth=8, learning_rate=0.25, gamma=0.3, colsample_bytree=0.7, score=0.597, total=   0.1s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.638, total=   0.7s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.681, total=   0.7s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.403, total=   0.7s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.653, total=   0.7s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.561, total=   0.8s
[CV] n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1600, min_child_weight=5, max_depth=7, learning_rate=0.15, gamma=0.3, colsample_bytree=0.2, score=0.648, total=   0.7s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.633, total=   0.6s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.690, total=   0.6s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.546, total=   0.6s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.704, total=   0.7s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.546, total=   0.6s
[CV] n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1400, min_child_weight=7, max_depth=10, learning_rate=0.3, gamma=0.0, colsample_bytree=0.9, score=0.633, total=   0.7s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.614, total=   0.3s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.695, total=   0.2s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.398, total=   0.3s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.668, total=   0.3s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.561, total=   0.3s
[CV] n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=400, min_child_weight=5, max_depth=5, learning_rate=0.2, gamma=0.2, colsample_bytree=0.2, score=0.612, total=   0.3s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.790, total=   0.7s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.633, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.566, total=   0.7s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.684, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.556, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=4, learning_rate=0.2, gamma=0.1, colsample_bytree=0.7, score=0.536, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.795, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.633, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.602, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.689, total=   0.6s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.648, total=   0.5s
[CV] n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1200, min_child_weight=3, max_depth=3, learning_rate=0.15, gamma=0.2, colsample_bytree=0.8, score=0.582, total=   0.6s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.695, total=   0.3s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.724, total=   0.3s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.587, total=   0.3s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.684, total=   0.3s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.684, total=   0.3s
[CV] n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=600, min_child_weight=7, max_depth=6, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=0.622, total=   0.3s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.700, total=   0.8s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.681, total=   0.8s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.296, total=   0.7s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.740, total=   0.8s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.561, total=   0.9s
[CV] n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV]  n_estimators=1000, min_child_weight=1, max_depth=12, learning_rate=0.15, gamma=0.4, colsample_bytree=0.3, score=0.612, total=   1.1s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   33.2s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=6, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',...
                   param_distributions={'colsample_bytree': [0.1, 0.2, 0.3, 0.4,
                                                             0.5, 0.7, 0.8, 0.9,
                                                             1

In [15]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

## Fitting a XGBoost with best performing parameters

In [16]:
xgboost = random_search.best_estimator_
xgboost.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

### Comparing Predicted and Actual values

In [17]:
y_pred = xgboost.predict(X_test)

In [18]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Unhealthy,Healthy
1,Unhealthy,Healthy
2,Unhealthy,Healthy
3,Healthy,Healthy
4,Unhealthy,Unhealthy
...,...,...
69,Unhealthy,Healthy
70,Unhealthy,Healthy
71,Healthy,Healthy
72,Healthy,Healthy


## Recalculating Model Evaluation metrics for Model fitted with best performing parameters

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.55      0.60        38
           1       0.60      0.69      0.64        36

    accuracy                           0.62        74
   macro avg       0.63      0.62      0.62        74
weighted avg       0.63      0.62      0.62        74



In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[21 17]
 [11 25]]


<Figure size 504x504 with 0 Axes>

## Changing Probability threshold to achieve 100% detection of Unhealthy patients

In [21]:
THRESHOLD = 0.22
y_pred = np.where(xgboost.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

## Reevaluating Model Evaluation metrics with new threshold

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.24      0.38        38
           1       0.55      1.00      0.71        36

    accuracy                           0.61        74
   macro avg       0.78      0.62      0.55        74
weighted avg       0.78      0.61      0.54        74



In [23]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[ 9 29]
 [ 0 36]]


<Figure size 504x504 with 0 Axes>