In [None]:
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

from google.colab import drive

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/new_features.csv")
df.drop(["Unnamed: 0"], axis=1, inplace=True)
df["patient_id"] = df["patient_id"].astype("int")
df["lead_id"] = df["lead_id"].astype("int")

In [None]:
df.head(5)

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio
0,0,1,-0.054317,377.083333,-3297.249202,5756.845777,57.0,0.404318,3.122865,8.64264,8564.971669,913.046274,0.159116,1.007608,1.011732
1,0,2,0.173118,377.166667,9.640745,2769.939236,57.692308,0.441143,1.907872,2.284969,1435.06784,444.57433,0.159081,0.876788,0.744761
2,0,3,0.803948,411.090909,3226.989388,861.597898,46.833333,0.373397,5.337444,0.085836,-1520.994131,447.789323,0.145953,0.959859,0.946903
3,0,4,0.763358,383.0,2931.324322,2988.418158,66.461538,0.271741,8.208612,0.346546,48.321557,401.243304,0.156658,1.140411,1.100461
4,0,5,-0.058833,411.272727,-3019.002055,4359.39259,43.916667,0.360466,2.422612,9.218017,8780.695173,786.02513,0.145889,1.045801,0.947137


In [None]:
5# we have 2164 patients
# 20% of 2164 is approximately equals to 433 patients
# randomly generating 433 patients
import random
all_patients = list(range(0, 2164))
selected_patients = random.sample(all_patients, 433)

In [None]:
folder_path = "/content/drive/MyDrive/DIS_Dr_Liu/"
small_X_path = folder_path + "small_X.npy"
small_y_path = folder_path + "small_y.npy"
ecg_X = np.load(small_X_path, allow_pickle=True)
ecg_y = np.load(small_y_path, allow_pickle=True)

print('X: ', ecg_X.shape)
print('y: ', ecg_y.shape)

print(ecg_X[0])
print(ecg_y[0])

X:  (2164, 12, 5000)
y:  (2164,)
[[  10   15   -5 ... -107  -88  -39]
 [ -20   -5  -20 ... -122 -112  -63]
 [ -30  -20  -15 ...  -15  -24  -24]
 ...
 [  34   29   49 ...  137  127   83]
 [ -29  -34  -10 ...   83   73   20]
 [ -39  -59  -34 ...  142  122   39]]
1


In [None]:
y = ecg_y
target_df = pd.DataFrame()
target_df["patient_id"] = all_patients
target_df["target"] = y
target_df["patient_id"] = target_df["patient_id"].astype("int")
target_df["target"] = target_df["target"].astype("int")

In [None]:
merged_df = pd.merge(df, target_df, on='patient_id', how='inner')

In [None]:
merged_df.isna().sum()

patient_id             0
lead_id                0
pr_ratio             520
rr_distance          474
p_energy             520
t_energy             520
pq_distance          520
qt_interval          994
st_slope             520
pr_slope             520
qrs_energy           520
rsq                  520
heart_rate           994
edr_time_ratio      2936
edr_energy_ratio    2936
target                 0
dtype: int64

In [None]:
merged_df.dropna(inplace=True)
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio,target
0,0,1,-0.054317,377.083333,-3297.249202,5756.845777,57.000000,0.404318,3.122865,8.642640,8564.971669,913.046274,0.159116,1.007608,1.011732,1
1,0,2,0.173118,377.166667,9.640745,2769.939236,57.692308,0.441143,1.907872,2.284969,1435.067840,444.574330,0.159081,0.876788,0.744761,1
2,0,3,0.803948,411.090909,3226.989388,861.597898,46.833333,0.373397,5.337444,0.085836,-1520.994131,447.789323,0.145953,0.959859,0.946903,1
3,0,4,0.763358,383.000000,2931.324322,2988.418158,66.461538,0.271741,8.208612,0.346546,48.321557,401.243304,0.156658,1.140411,1.100461,1
4,0,5,-0.058833,411.272727,-3019.002055,4359.392590,43.916667,0.360466,2.422612,9.218017,8780.695173,786.025130,0.145889,1.045801,0.947137,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25963,2163,8,0.488058,455.800000,162.286435,2295.213058,67.909091,0.162152,5.545811,0.418477,-4305.583086,635.646834,0.131637,0.953257,0.937737,0
25964,2163,9,-0.016470,471.625000,-9889.188938,-2982.607416,52.888889,0.537857,-1.348335,3.153742,16201.075260,511.905208,0.127220,1.879499,1.736153,0
25965,2163,10,0.477778,428.800000,-3090.423671,-302.615363,64.272727,0.633480,0.746142,1.198969,12511.021144,368.125365,0.139925,1.033777,1.018536,0
25966,2163,11,-0.089861,428.500000,-2825.421094,10080.342482,52.363636,0.384428,4.580217,7.207576,4125.364610,804.815869,0.140023,1.602314,1.578354,0


In [None]:
merged_df.to_csv("/content/drive/MyDrive/DIS_Dr_Liu/new_features_merged.csv")

# Models on all data using Standard Normalization

In [None]:
# merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/new_features.csv")
# merged_df.drop("Unnamed: 0", axis=1, inplace=True)
# merged_df.dropna(inplace=True)

In [None]:
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio,target
0,0,1,-0.054317,377.083333,-3297.249202,5756.845777,57.000000,0.404318,3.122865,8.642640,8564.971669,913.046274,0.159116,1.007608,1.011732,1
1,0,2,0.173118,377.166667,9.640745,2769.939236,57.692308,0.441143,1.907872,2.284969,1435.067840,444.574330,0.159081,0.876788,0.744761,1
2,0,3,0.803948,411.090909,3226.989388,861.597898,46.833333,0.373397,5.337444,0.085836,-1520.994131,447.789323,0.145953,0.959859,0.946903,1
3,0,4,0.763358,383.000000,2931.324322,2988.418158,66.461538,0.271741,8.208612,0.346546,48.321557,401.243304,0.156658,1.140411,1.100461,1
4,0,5,-0.058833,411.272727,-3019.002055,4359.392590,43.916667,0.360466,2.422612,9.218017,8780.695173,786.025130,0.145889,1.045801,0.947137,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25963,2163,8,0.488058,455.800000,162.286435,2295.213058,67.909091,0.162152,5.545811,0.418477,-4305.583086,635.646834,0.131637,0.953257,0.937737,0
25964,2163,9,-0.016470,471.625000,-9889.188938,-2982.607416,52.888889,0.537857,-1.348335,3.153742,16201.075260,511.905208,0.127220,1.879499,1.736153,0
25965,2163,10,0.477778,428.800000,-3090.423671,-302.615363,64.272727,0.633480,0.746142,1.198969,12511.021144,368.125365,0.139925,1.033777,1.018536,0
25966,2163,11,-0.089861,428.500000,-2825.421094,10080.342482,52.363636,0.384428,4.580217,7.207576,4125.364610,804.815869,0.140023,1.602314,1.578354,0


In [None]:
merged_df["target"].value_counts()

0    11610
1    11422
Name: target, dtype: int64

In [None]:
# merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio
0,-0.046984,2.191183,0.042699,0.267034,0.538068,-0.558680,-0.440292,0.112366,0.583948,-0.254993,-1.790346,-0.399139,-0.385596
1,0.011904,1.400235,0.709282,0.566371,0.827654,-0.771497,0.004684,-0.822561,-0.550805,-0.524554,-1.337745,0.102470,0.184304
2,-0.048688,0.476855,0.217214,0.327164,0.649953,-0.716428,-0.361414,0.159849,0.349534,-0.368613,-0.645897,-0.187691,-0.127356
3,-0.018607,0.174273,-0.357011,-0.403609,0.806739,1.852987,-0.684866,-0.774870,2.223288,-0.927853,-0.366130,-0.060106,-0.041094
4,0.038572,-1.034852,0.000481,-0.119488,0.410512,-1.242956,-0.640973,-0.962052,-0.521235,-1.422943,1.171231,-0.235443,-0.231832
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23027,-0.035228,-0.534430,0.291718,-0.380699,-1.314748,0.491801,-0.203422,-0.071380,-0.480029,-0.348813,0.435420,-0.506362,-0.476939
23028,-0.053041,0.228214,-0.038607,-0.383287,0.510718,-0.150341,-0.572176,-0.472879,-0.191678,-0.826669,-0.418346,-0.599778,-0.595213
23029,-0.080438,-1.273140,-1.137129,1.663297,0.120313,0.182485,1.221782,1.503419,0.284597,1.521300,1.589874,-0.529382,-0.555964
23030,-0.012419,-0.572259,0.753883,0.457833,0.208261,-0.067497,-0.037125,0.137406,0.236185,0.138948,0.485314,0.365105,0.321687


# K-Fold

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(18425, 4607)

In [None]:
y.value_counts()

0    11610
1    11422
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.5449118  0.52157395 0.54111262 0.53975577 0.5495251 ]
Mean CV Accuracy: 0.5394


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.521163446928587
F1 Score: 0.47426120114394654
Precision: 0.5187695516162669
Recall: 0.4367866549604917


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.61112619 0.5641791  0.58968792 0.58751696 0.59077341]
Mean CV Accuracy: 0.5887


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5908400260473193
F1 Score: 0.5834254143646409
Precision: 0.5874499332443258
Recall: 0.5794556628621598


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [0.6339213  0.62089552 0.62306649 0.62523745 0.62415197]
Mean CV Accuracy: 0.6255


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6229650531799436
F1 Score: 0.6049579258585399
Precision: 0.6276545540349221
Recall: 0.5838454784899034


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.60434193 0.5880597  0.60868385 0.60569878 0.592673  ]
Mean CV Accuracy: 0.5999


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5999565878011721
F1 Score: 0.5899888765294773
Precision: 0.5981055480378891
Recall: 0.582089552238806


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.56255088 0.54708277 0.5568521  0.55386703 0.54898236]
Mean CV Accuracy: 0.5539


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5545908400260473
F1 Score: 0.5043478260869566
Precision: 0.560687432867884
Recall: 0.45829675153643545


## Naive Bayes

In [None]:
# Create Naive Bayes model
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

Cross-Validation Scores: [0.50881954 0.4963365  0.50230665 0.4963365  0.49932157]
Mean CV Accuracy: 0.5006


In [None]:
# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Naive Bayes Accuracy: 0.5001085304970697
Naive Bayes F1 Score: 0.6607747827367801
Naive Bayes Precision: 0.4972289957880736
Naive Bayes Recall: 0.9846356453028973


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.56119403 0.53894166 0.56580733 0.55033921 0.54762551]
Mean CV Accuracy: 0.5528
KNN Accuracy: 0.5461254612546126
KNN F1 Score: 0.5308503477675566
KNN Precision: 0.5429095915557596
KNN Recall: 0.5193151887620719


# Models on only Lead 1 using Standard Normalization

In [None]:
# merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
# merged_df.drop("Unnamed: 0", axis=1, inplace=True)
# merged_df.dropna(inplace=True)

In [None]:
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio,target
0,58,5,0.033059,627.714286,-608.866378,5662.479450,64.625000,0.307863,1.535486,6.061934,9044.098382,667.320257,0.095585,0.811429,0.828083,1
1,1162,8,0.247654,559.285714,1462.556417,7272.910631,69.125000,0.279598,3.726862,1.381566,1029.119601,520.640095,0.107280,1.338866,1.384437,0
2,1559,11,0.026852,479.400000,-66.557518,5985.980743,66.363636,0.286912,1.923937,6.299642,7388.384568,605.494318,0.125156,1.033764,1.080185,0
3,440,8,0.136468,453.222222,-1850.974754,2054.429475,68.800000,0.628169,0.331033,1.620315,20623.079710,301.187232,0.132385,1.167919,1.164396,1
4,1782,8,0.344836,348.615385,-740.059638,3582.996739,62.642857,0.216981,0.547191,0.683259,1237.975415,31.787006,0.172109,0.983553,0.978192,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23027,336,1,0.075901,391.909091,164.966983,2177.681362,35.833333,0.447383,2.702003,5.142076,1529.023856,616.268564,0.153097,0.698685,0.738911,1
23028,2126,5,0.010988,457.888889,-861.527087,2163.759375,64.200000,0.362097,0.885998,3.132121,3565.702533,356.245737,0.131036,0.600458,0.623448,0
23029,490,11,-0.088850,328.000000,-4275.209784,13174.365885,58.133333,0.406301,9.720716,13.025726,6929.723222,1633.880119,0.182927,0.674479,0.661765,1
23030,1960,2,0.159019,388.636364,1601.155971,6688.979929,59.500000,0.373099,3.520967,6.187284,6587.781222,881.681231,0.154386,1.615023,1.518554,0


In [None]:
merged_df["target"].value_counts()

0    11610
1    11422
Name: target, dtype: int64

In [None]:
merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio
0,-0.293335,-1.391828,-1.178829,1.772481,-1.202919,0.298666,1.731959,0.021327,-0.830054,0.106362,1.746022,0.227704,0.139820
1,1.510327,-0.984895,0.259399,-0.223974,-0.867705,0.954691,0.013027,-1.477935,-1.470962,-1.099140,1.055124,-0.753965,-0.774438
2,0.164359,-1.280182,0.408493,-0.877309,-0.553375,1.971233,0.206872,0.391772,-0.206159,0.885322,1.542699,0.950998,1.055551
3,-0.333668,0.775731,-0.306067,-0.497854,-0.880106,-0.716782,-0.215945,1.713947,0.832489,0.788641,-0.875634,-0.396294,-0.386283
4,0.504545,0.452386,1.066421,0.955040,0.139874,-1.028479,0.529656,-0.787481,-0.788239,-0.604424,-0.608942,0.025451,-0.051615
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,-0.374031,-0.993403,-0.073782,-1.358833,0.062368,0.847359,-1.159253,-0.073216,0.321227,-0.520984,1.068247,-0.114611,-0.196588
1950,-0.030549,-0.814026,0.371296,-1.204676,-0.911824,1.469731,-1.040431,0.162917,0.047693,-0.430623,0.802009,-0.683337,-0.690406
1951,-0.281185,2.830189,0.071649,1.222030,-0.673940,-1.612096,-0.324858,-0.800418,-0.622433,-1.223302,-2.077317,-0.032933,0.348626
1952,-0.049917,0.501270,0.645313,-1.394133,1.890478,1.264570,-1.201899,-0.610669,0.019261,-0.383996,-0.651154,-0.317868,-0.299178


# K-Fold

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(1563, 391)

In [None]:
y.value_counts()

0    983
1    971
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.65495208 0.61022364 0.64217252 0.66025641 0.56410256]
Mean CV Accuracy: 0.6263


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6342710997442456
F1 Score: 0.6304909560723513
Precision: 0.6192893401015228
Recall: 0.6421052631578947


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.55591054 0.58785942 0.54632588 0.55448718 0.5224359 ]
Mean CV Accuracy: 0.5534


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5421994884910486
F1 Score: 0.5350649350649351
Precision: 0.5282051282051282
Recall: 0.5421052631578948


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [0.62300319 0.57507987 0.5942492  0.63782051 0.57051282]
Mean CV Accuracy: 0.6001


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5959079283887468
F1 Score: 0.5659340659340659
Precision: 0.5919540229885057
Recall: 0.5421052631578948


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.57827476 0.58785942 0.5942492  0.61858974 0.56089744]
Mean CV Accuracy: 0.5880


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5677749360613811
F1 Score: 0.5318559556786703
Precision: 0.5614035087719298
Recall: 0.5052631578947369


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.64536741 0.58785942 0.61341853 0.66025641 0.58333333]
Mean CV Accuracy: 0.6180


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6214833759590793
F1 Score: 0.616580310880829
Precision: 0.6071428571428571
Recall: 0.6263157894736842


## Naive Bayes

In [None]:
# Create Naive Bayes model
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

Cross-Validation Scores: [0.64536741 0.61661342 0.60383387 0.62179487 0.61858974]
Mean CV Accuracy: 0.6212


In [None]:
# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Naive Bayes Accuracy: 0.6342710997442456
Naive Bayes F1 Score: 0.6246719160104987
Naive Bayes Precision: 0.6230366492146597
Naive Bayes Recall: 0.6263157894736842


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.59744409 0.55591054 0.57827476 0.59615385 0.55448718]
Mean CV Accuracy: 0.5765
KNN Accuracy: 0.5856777493606138
KNN F1 Score: 0.5645161290322581
KNN Precision: 0.5769230769230769
KNN Recall: 0.5526315789473685


# Models trained on Lead 1+2+3 = 33 features using Standard Normalization

In [None]:
# merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
# merged_df.drop("Unnamed: 0", axis=1, inplace=True)
# # merged_df.dropna(inplace=True)

In [None]:
merged_df = merged_df[merged_df["lead_id"] < 4]
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio,target
0,0,1,-0.054317,377.083333,-3297.249202,5756.845777,57.000000,0.404318,3.122865,8.642640,8564.971669,913.046274,0.159116,1.007608,1.011732,1
1,0,2,0.173118,377.166667,9.640745,2769.939236,57.692308,0.441143,1.907872,2.284969,1435.067840,444.574330,0.159081,0.876788,0.744761,1
2,0,3,0.803948,411.090909,3226.989388,861.597898,46.833333,0.373397,5.337444,0.085836,-1520.994131,447.789323,0.145953,0.959859,0.946903,1
12,1,1,-0.040611,433.400000,-1001.362997,3334.747737,54.090909,0.379452,1.114519,5.299051,4941.067796,513.379424,0.138440,0.942913,0.786049,1
13,1,2,-0.116662,433.400000,-2121.373640,5989.372179,66.545455,0.395603,1.574844,3.992526,3737.493466,458.979911,0.138440,0.959791,0.840727,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25945,2162,2,-0.011198,450.100000,-1142.633178,3343.033188,58.272727,0.391226,1.584090,3.212130,3125.760542,430.052537,0.133304,0.989151,0.950095,0
25946,2162,3,0.224792,514.571429,1482.322294,966.165778,77.125000,0.201624,3.504933,0.637457,391.069545,173.950753,0.116602,1.174211,0.829396,0
25956,2163,1,-0.054141,414.272727,-2329.281213,8778.893590,48.833333,0.352425,3.521925,10.374062,7905.975939,845.097061,0.144832,1.063577,1.035460,0
25957,2163,2,0.456030,428.900000,-1717.653835,-1420.246937,87.181818,0.620827,0.403618,0.474798,3695.413494,225.542573,0.139893,1.652199,1.513601,0


In [None]:
merged_df.columns

Index(['patient_id', 'lead_id', 'pr_ratio', 'rr_distance', 'p_energy',
       't_energy', 'pq_distance', 'qt_interval', 'st_slope', 'pr_slope',
       'qrs_energy', 'rsq', 'heart_rate', 'edr_time_ratio', 'edr_energy_ratio',
       'target'],
      dtype='object')

In [None]:
pivoted_df = merged_df.pivot(index='patient_id', columns='lead_id',\
                             values=['pr_ratio', 'rr_distance', 'p_energy',\
                                     't_energy', 'pq_distance', 'qt_interval',\
                                     'st_slope', 'pr_slope', 'qrs_energy',\
                                     'rsq', 'heart_rate', 'edr_time_ratio', 'edr_energy_ratio'])
pivoted_df.sort_index(axis=1, level=1, inplace=True)


In [None]:
pivoted_df

Unnamed: 0_level_0,edr_energy_ratio,edr_time_ratio,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1.011732,1.007608,0.159116,-3297.249202,57.000000,-0.054317,8.642640,8564.971669,0.404318,377.083333,...,3226.989388,46.833333,0.803948,0.085836,-1520.994131,0.373397,411.090909,447.789323,5.337444,861.597898
1,0.786049,0.942913,0.138440,-1001.362997,54.090909,-0.040611,5.299051,4941.067796,0.379452,433.400000,...,,,,,,,,,,
2,1.089178,1.237727,0.148791,-1269.905400,54.692308,-0.023139,9.273605,7967.382615,0.362249,403.250000,...,,,,,,,,,,
3,1.149821,1.001017,0.118838,-553.935848,33.100000,0.072957,6.661890,4107.674542,0.327201,504.888889,...,-544.047169,45.818182,0.033780,9.495492,10909.892147,0.370118,454.400000,834.657670,1.057267,1834.405712
4,0.748991,0.741321,0.111784,609.882044,49.555556,0.076716,4.808076,3653.719432,0.284635,536.750000,...,1035.058073,50.000000,0.145537,5.048859,6852.051662,0.324587,536.750000,521.583953,0.960575,2359.788690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,0.722308,0.725112,0.140695,-189.504576,66.083333,0.034634,7.767104,8672.426609,0.335518,426.454545,...,667.231788,83.363636,0.047764,4.905699,6861.427506,0.494786,420.200000,657.597423,0.388304,781.622078
2160,0.839059,0.805867,0.149694,1087.134766,40.250000,0.205243,3.522456,2614.849116,0.513117,400.818182,...,-916.645080,53.416667,-0.012808,2.816251,2741.448363,0.474030,400.818182,341.909914,0.871615,1748.550437
2161,1.142280,1.028140,0.138857,-172.382407,98.636364,0.013340,7.412361,11220.377567,0.365235,432.100000,...,2099.909185,94.800000,0.269638,1.856591,-2455.329029,0.357126,480.222222,1106.582946,8.095787,3069.727779
2162,0.582093,0.665277,0.133274,-325.481920,55.363636,0.043881,5.365004,5763.817482,0.369331,450.200000,...,1482.322294,77.125000,0.224792,0.637457,391.069545,0.201624,514.571429,173.950753,3.504933,966.165778


In [None]:
pivoted_df = pivoted_df.reset_index()
pivoted_df

Unnamed: 0_level_0,index,patient_id,edr_energy_ratio,edr_time_ratio,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,Unnamed: 1_level_1,Unnamed: 2_level_1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
0,0,0,1.011732,1.007608,0.159116,-3297.249202,57.000000,-0.054317,8.642640,8564.971669,...,3226.989388,46.833333,0.803948,0.085836,-1520.994131,0.373397,411.090909,447.789323,5.337444,861.597898
1,1,1,0.786049,0.942913,0.138440,-1001.362997,54.090909,-0.040611,5.299051,4941.067796,...,,,,,,,,,,
2,2,2,1.089178,1.237727,0.148791,-1269.905400,54.692308,-0.023139,9.273605,7967.382615,...,,,,,,,,,,
3,3,3,1.149821,1.001017,0.118838,-553.935848,33.100000,0.072957,6.661890,4107.674542,...,-544.047169,45.818182,0.033780,9.495492,10909.892147,0.370118,454.400000,834.657670,1.057267,1834.405712
4,4,4,0.748991,0.741321,0.111784,609.882044,49.555556,0.076716,4.808076,3653.719432,...,1035.058073,50.000000,0.145537,5.048859,6852.051662,0.324587,536.750000,521.583953,0.960575,2359.788690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2118,2118,2159,0.722308,0.725112,0.140695,-189.504576,66.083333,0.034634,7.767104,8672.426609,...,667.231788,83.363636,0.047764,4.905699,6861.427506,0.494786,420.200000,657.597423,0.388304,781.622078
2119,2119,2160,0.839059,0.805867,0.149694,1087.134766,40.250000,0.205243,3.522456,2614.849116,...,-916.645080,53.416667,-0.012808,2.816251,2741.448363,0.474030,400.818182,341.909914,0.871615,1748.550437
2120,2120,2161,1.142280,1.028140,0.138857,-172.382407,98.636364,0.013340,7.412361,11220.377567,...,2099.909185,94.800000,0.269638,1.856591,-2455.329029,0.357126,480.222222,1106.582946,8.095787,3069.727779
2121,2121,2162,0.582093,0.665277,0.133274,-325.481920,55.363636,0.043881,5.365004,5763.817482,...,1482.322294,77.125000,0.224792,0.637457,391.069545,0.201624,514.571429,173.950753,3.504933,966.165778


In [None]:
merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')

  merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')


In [None]:
merged_df.dropna(inplace=True)

In [None]:
merged_df.columns

Index([           'patient_id',           ('index', ''),
            ('patient_id', ''), ('edr_energy_ratio', 1),
         ('edr_time_ratio', 1),       ('heart_rate', 1),
               ('p_energy', 1),      ('pq_distance', 1),
               ('pr_ratio', 1),         ('pr_slope', 1),
             ('qrs_energy', 1),      ('qt_interval', 1),
            ('rr_distance', 1),              ('rsq', 1),
               ('st_slope', 1),         ('t_energy', 1),
       ('edr_energy_ratio', 2),   ('edr_time_ratio', 2),
             ('heart_rate', 2),         ('p_energy', 2),
            ('pq_distance', 2),         ('pr_ratio', 2),
               ('pr_slope', 2),       ('qrs_energy', 2),
            ('qt_interval', 2),      ('rr_distance', 2),
                    ('rsq', 2),         ('st_slope', 2),
               ('t_energy', 2), ('edr_energy_ratio', 3),
         ('edr_time_ratio', 3),       ('heart_rate', 3),
               ('p_energy', 3),      ('pq_distance', 3),
               ('pr_ratio', 3),

In [None]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

merged_df = merged_df.iloc[:,2:]

X = merged_df.drop(["target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,"(patient_id, )","(edr_energy_ratio, 1)","(edr_time_ratio, 1)","(heart_rate, 1)","(p_energy, 1)","(pq_distance, 1)","(pr_ratio, 1)","(pr_slope, 1)","(qrs_energy, 1)","(qt_interval, 1)",...,"(p_energy, 3)","(pq_distance, 3)","(pr_ratio, 3)","(pr_slope, 3)","(qrs_energy, 3)","(qt_interval, 3)","(rr_distance, 3)","(rsq, 3)","(st_slope, 3)","(t_energy, 3)"
0,0.341334,-0.220273,-0.214428,-0.849329,1.148803,-0.062932,0.010175,0.357425,0.981308,-0.455400,...,-1.706357,-0.243314,0.563315,-0.795070,-0.400136,-1.190940,0.356384,-0.963840,-0.476663,-0.196169
1,1.181725,-0.081520,-0.208822,-0.478126,0.513504,0.801148,0.082042,-0.214410,1.640345,0.105167,...,-0.944458,1.211420,0.372335,-0.701393,1.406257,1.510336,0.010343,-0.776535,-0.448805,0.406007
2,-1.666267,0.770936,0.868549,1.432870,0.702118,-1.510266,0.557006,-0.538492,-0.775771,0.251513,...,0.109609,0.132416,0.624068,-0.824355,-0.070570,0.522835,-0.316255,-1.024807,-0.403859,-0.337130
3,0.339724,-0.309824,-0.296390,-1.252567,0.700847,0.470717,0.255249,-0.898403,-0.763552,-0.845439,...,-0.452118,0.714649,-0.229997,-0.146878,0.036405,-0.577335,0.785164,-0.487244,-0.357418,0.297424
4,1.339500,-0.674739,-0.643785,-0.551614,-2.026886,0.285973,-0.535074,2.034747,0.865942,-0.840207,...,-2.026698,-2.064019,0.188784,0.166234,-1.759964,1.251248,0.069674,3.199130,0.109079,0.473895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,-0.442709,-0.435402,-0.449073,0.739202,-0.208910,-1.021728,-0.151326,1.600590,0.923769,-0.361715,...,0.328385,0.957681,0.194829,-0.698435,-0.283627,1.024499,-0.446984,-1.042392,-0.140141,-1.290064
1638,1.709787,0.017180,-0.028569,-0.186894,-0.134929,0.304302,-0.116597,0.123304,-0.430203,0.559298,...,0.902726,-0.343956,0.503740,-0.816431,-0.736421,0.202251,-0.332569,-0.525038,0.234811,-0.148071
1639,0.532917,2.292252,2.566927,0.513547,0.251829,-1.132508,-0.008047,0.267985,-0.179090,-0.149350,...,0.243213,0.025065,-0.051945,-0.337138,-0.374393,0.940009,-0.706106,-0.481326,-0.544074,-0.316830
1640,-0.788847,-0.292205,-0.277503,-0.247726,0.961299,0.090682,0.332232,-0.652805,-0.622248,0.465241,...,0.408760,1.624154,-0.083501,-0.147274,-0.304651,0.293736,-0.442820,0.437513,-0.398738,-0.076649


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(1313, 329)

In [None]:
y.value_counts()

0    826
1    816
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.97338403 0.98098859 0.98859316 0.98854962 0.98473282]
Mean CV Accuracy: 0.9832


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.9787234042553191
F1 Score: 0.9768976897689768
Precision: 0.9932885906040269
Recall: 0.961038961038961


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0000


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [1.         1.         1.         1.         0.99618321]
Mean CV Accuracy: 0.9992


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [1.         1.         1.         1.         0.99236641]
Mean CV Accuracy: 0.9985


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.93155894 0.93155894 0.94296578 0.94274809 0.96183206]
Mean CV Accuracy: 0.9421


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.9209726443768997
F1 Score: 0.915032679738562
Precision: 0.9210526315789473
Recall: 0.9090909090909091


## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Cross-Validation Scores: [0.90494297 0.91634981 0.94296578 0.91984733 0.90839695]
Mean CV Accuracy: 0.9185
Naive Bayes Accuracy: 0.8936170212765957
Naive Bayes F1 Score: 0.8844884488448844
Naive Bayes Precision: 0.8993288590604027
Naive Bayes Recall: 0.8701298701298701


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.79087452 0.86311787 0.85171103 0.83206107 0.84732824]
Mean CV Accuracy: 0.8370
KNN Accuracy: 0.817629179331307
KNN F1 Score: 0.8076923076923078
KNN Precision: 0.7974683544303798
KNN Recall: 0.8181818181818182


# Models trained on Lead 1+2+3 ...+12 = 132 features using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/new_features_merged.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df.dropna(inplace=True)

In [None]:
# merged_df = merged_df[merged_df["lead_id"] < 4]
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,edr_time_ratio,edr_energy_ratio,target
0,0,1,-0.054317,377.083333,-3297.249202,5756.845777,57.000000,0.404318,3.122865,8.642640,8564.971669,913.046274,0.159116,1.007608,1.011732,1
1,0,2,0.173118,377.166667,9.640745,2769.939236,57.692308,0.441143,1.907872,2.284969,1435.067840,444.574330,0.159081,0.876788,0.744761,1
2,0,3,0.803948,411.090909,3226.989388,861.597898,46.833333,0.373397,5.337444,0.085836,-1520.994131,447.789323,0.145953,0.959859,0.946903,1
3,0,4,0.763358,383.000000,2931.324322,2988.418158,66.461538,0.271741,8.208612,0.346546,48.321557,401.243304,0.156658,1.140411,1.100461,1
4,0,5,-0.058833,411.272727,-3019.002055,4359.392590,43.916667,0.360466,2.422612,9.218017,8780.695173,786.025130,0.145889,1.045801,0.947137,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23027,2163,8,0.488058,455.800000,162.286435,2295.213058,67.909091,0.162152,5.545811,0.418477,-4305.583086,635.646834,0.131637,0.953257,0.937737,0
23028,2163,9,-0.016470,471.625000,-9889.188938,-2982.607416,52.888889,0.537857,-1.348335,3.153742,16201.075260,511.905208,0.127220,1.879499,1.736153,0
23029,2163,10,0.477778,428.800000,-3090.423671,-302.615363,64.272727,0.633480,0.746142,1.198969,12511.021144,368.125365,0.139925,1.033777,1.018536,0
23030,2163,11,-0.089861,428.500000,-2825.421094,10080.342482,52.363636,0.384428,4.580217,7.207576,4125.364610,804.815869,0.140023,1.602314,1.578354,0


In [None]:
merged_df.columns

Index(['patient_id', 'lead_id', 'pr_ratio', 'rr_distance', 'p_energy',
       't_energy', 'pq_distance', 'qt_interval', 'st_slope', 'pr_slope',
       'qrs_energy', 'rsq', 'heart_rate', 'edr_time_ratio', 'edr_energy_ratio',
       'target'],
      dtype='object')

In [None]:
pivoted_df = merged_df.pivot(index='patient_id', columns='lead_id',\
                             values=['pr_ratio', 'rr_distance', 'p_energy',\
                                     't_energy', 'pq_distance', 'qt_interval',\
                                     'st_slope', 'pr_slope', 'qrs_energy',\
                                     'rsq', 'heart_rate', 'edr_time_ratio', 'edr_energy_ratio'])
pivoted_df.sort_index(axis=1, level=1, inplace=True)


In [None]:
pivoted_df

Unnamed: 0_level_0,edr_energy_ratio,edr_time_ratio,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,1,1,1,1,1,1,1,1,1,1,...,12,12,12,12,12,12,12,12,12,12
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1.011732,1.007608,0.159116,-3297.249202,57.000000,-0.054317,8.642640,8564.971669,0.404318,377.083333,...,-2098.180632,60.769231,-0.055422,5.886681,6770.553099,0.423604,377.166667,599.476494,1.221051,3837.602229
1,0.786049,0.942913,0.138440,-1001.362997,54.090909,-0.040611,5.299051,4941.067796,0.379452,433.400000,...,-2152.072656,52.000000,-0.060623,8.317649,7397.271479,0.393716,433.400000,709.301096,1.606917,6137.914844
2,1.089178,1.237727,0.148791,-1269.905400,54.692308,-0.023139,9.273605,7967.382615,0.362249,403.250000,...,,,,,,,,,,
3,1.149821,1.001017,0.118838,-553.935848,33.100000,0.072957,6.661890,4107.674542,0.327201,504.888889,...,-2151.372463,38.272727,-0.031627,12.522358,9223.532965,0.368318,454.400000,938.302902,1.805229,3623.569420
4,0.748991,0.741321,0.111784,609.882044,49.555556,0.076716,4.808076,3653.719432,0.284635,536.750000,...,-815.088405,41.555556,0.012516,16.558843,14588.307639,0.302231,536.750000,1351.676935,3.293137,9034.145288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,0.722308,0.725112,0.140695,-189.504576,66.083333,0.034634,7.767104,8672.426609,0.335518,426.454545,...,-897.038099,72.909091,-0.005981,8.518245,11093.410126,0.374153,420.100000,1071.009842,2.013878,1834.558665
2160,0.839059,0.805867,0.149694,1087.134766,40.250000,0.205243,3.522456,2614.849116,0.513117,400.818182,...,-381.743527,48.750000,0.026281,6.119403,5792.983361,0.487960,400.818182,610.356882,1.061287,1522.876237
2161,1.142280,1.028140,0.138857,-172.382407,98.636364,0.013340,7.412361,11220.377567,0.365235,432.100000,...,-439.268862,97.200000,0.010766,5.340615,7199.084107,0.362545,432.222222,975.550223,3.473676,5389.586808
2162,0.582093,0.665277,0.133274,-325.481920,55.363636,0.043881,5.365004,5763.817482,0.369331,450.200000,...,-1492.569572,63.454545,-0.024453,6.869237,6756.122534,0.377896,450.100000,851.385491,3.137813,7244.218699


In [None]:
pivoted_df = pivoted_df.reset_index()
pivoted_df

Unnamed: 0_level_0,patient_id,edr_energy_ratio,edr_time_ratio,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,Unnamed: 1_level_1,1,1,1,1,1,1,1,1,1,...,12,12,12,12,12,12,12,12,12,12
0,0,1.011732,1.007608,0.159116,-3297.249202,57.000000,-0.054317,8.642640,8564.971669,0.404318,...,-2098.180632,60.769231,-0.055422,5.886681,6770.553099,0.423604,377.166667,599.476494,1.221051,3837.602229
1,1,0.786049,0.942913,0.138440,-1001.362997,54.090909,-0.040611,5.299051,4941.067796,0.379452,...,-2152.072656,52.000000,-0.060623,8.317649,7397.271479,0.393716,433.400000,709.301096,1.606917,6137.914844
2,2,1.089178,1.237727,0.148791,-1269.905400,54.692308,-0.023139,9.273605,7967.382615,0.362249,...,,,,,,,,,,
3,3,1.149821,1.001017,0.118838,-553.935848,33.100000,0.072957,6.661890,4107.674542,0.327201,...,-2151.372463,38.272727,-0.031627,12.522358,9223.532965,0.368318,454.400000,938.302902,1.805229,3623.569420
4,4,0.748991,0.741321,0.111784,609.882044,49.555556,0.076716,4.808076,3653.719432,0.284635,...,-815.088405,41.555556,0.012516,16.558843,14588.307639,0.302231,536.750000,1351.676935,3.293137,9034.145288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2157,2159,0.722308,0.725112,0.140695,-189.504576,66.083333,0.034634,7.767104,8672.426609,0.335518,...,-897.038099,72.909091,-0.005981,8.518245,11093.410126,0.374153,420.100000,1071.009842,2.013878,1834.558665
2158,2160,0.839059,0.805867,0.149694,1087.134766,40.250000,0.205243,3.522456,2614.849116,0.513117,...,-381.743527,48.750000,0.026281,6.119403,5792.983361,0.487960,400.818182,610.356882,1.061287,1522.876237
2159,2161,1.142280,1.028140,0.138857,-172.382407,98.636364,0.013340,7.412361,11220.377567,0.365235,...,-439.268862,97.200000,0.010766,5.340615,7199.084107,0.362545,432.222222,975.550223,3.473676,5389.586808
2160,2162,0.582093,0.665277,0.133274,-325.481920,55.363636,0.043881,5.365004,5763.817482,0.369331,...,-1492.569572,63.454545,-0.024453,6.869237,6756.122534,0.377896,450.100000,851.385491,3.137813,7244.218699


In [None]:
merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')

  merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')


In [None]:
merged_df.dropna(inplace=True)

In [None]:
merged_df.columns

Index([           'patient_id',      ('patient_id', ''),
       ('edr_energy_ratio', 1),   ('edr_time_ratio', 1),
             ('heart_rate', 1),         ('p_energy', 1),
            ('pq_distance', 1),         ('pr_ratio', 1),
               ('pr_slope', 1),       ('qrs_energy', 1),
       ...
           ('pq_distance', 12),        ('pr_ratio', 12),
              ('pr_slope', 12),      ('qrs_energy', 12),
           ('qt_interval', 12),     ('rr_distance', 12),
                   ('rsq', 12),        ('st_slope', 12),
              ('t_energy', 12),                'target'],
      dtype='object', length=159)

In [None]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

merged_df = merged_df.iloc[:,2:]

X = merged_df.drop(["target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,"(edr_energy_ratio, 1)","(edr_time_ratio, 1)","(heart_rate, 1)","(p_energy, 1)","(pq_distance, 1)","(pr_ratio, 1)","(pr_slope, 1)","(qrs_energy, 1)","(qt_interval, 1)","(rr_distance, 1)",...,"(p_energy, 12)","(pq_distance, 12)","(pr_ratio, 12)","(pr_slope, 12)","(qrs_energy, 12)","(qt_interval, 12)","(rr_distance, 12)","(rsq, 12)","(st_slope, 12)","(t_energy, 12)"
0,-0.052437,-0.169453,-0.226711,0.575978,0.301798,0.365454,-0.851534,-0.191470,1.607898,0.061044,...,0.139993,0.240780,-0.075732,-0.629336,0.170443,1.107921,0.093897,-0.773542,-1.105739,-1.183359
1,0.072125,0.070788,0.096530,-0.600759,-0.727688,-0.330033,2.150838,1.157525,0.167555,-0.247788,...,-0.508644,-0.308052,-0.398602,1.000170,1.291414,0.368140,-0.215760,0.549814,-1.352634,-1.513150
2,0.718580,0.635588,0.890006,-0.051293,-1.124423,-0.297968,5.118199,4.072044,0.890689,-0.885730,...,-0.581827,0.151502,-0.447472,0.353416,0.820688,0.798266,-0.851799,0.068125,-0.819590,-0.972558
3,-0.404368,-0.366754,0.077091,0.070914,-0.540516,-0.141982,-0.202072,-0.331211,-0.620194,-0.230120,...,-0.678634,-0.140047,-0.436305,1.091086,0.650633,-0.592718,-0.196936,1.162322,0.879088,0.237093
4,-0.362130,-0.365963,1.191969,0.887806,-0.620996,0.038335,3.406976,2.215789,1.377089,-1.092307,...,0.347184,-0.702686,0.104814,1.950351,1.110581,1.212606,-1.058140,2.497790,1.043103,-0.628115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,-0.139728,-0.261660,-0.047225,0.446313,-0.252844,0.336413,-1.009084,-0.978827,-0.373751,-0.114477,...,0.575765,-0.510025,0.865068,-1.103794,-1.094796,-0.336285,-0.082707,-1.025185,0.216633,0.504011
1215,-0.499064,-0.575375,-0.248819,0.244905,1.197505,0.060578,-0.996696,-0.688005,-0.381909,0.083402,...,0.204779,0.969885,-0.059485,-0.659265,-0.294369,-0.381882,0.117513,-0.699719,-1.011779,-0.630932
1216,-0.521792,-0.531412,1.255615,0.090628,-1.415520,-0.073085,0.495891,0.067850,0.369859,-1.133700,...,0.115938,-1.374101,-0.089178,2.425008,1.333080,0.326618,-1.099486,1.172574,-0.178767,-0.248521
1217,-0.168234,0.419328,-0.777573,-0.023316,-1.341205,-0.436018,-0.167387,-0.285032,0.187801,0.672899,...,-0.597174,-0.183012,-1.591641,-0.986466,-0.950445,0.012784,0.706623,-1.121599,-0.847228,-0.699299


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(975, 244)

In [None]:
y.value_counts()

0    611
1    608
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Scores: [0.58461538 0.63076923 0.60512821 0.61025641 0.64615385]
Mean CV Accuracy: 0.6154


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6434426229508197
F1 Score: 0.6233766233766234
Precision: 0.6428571428571429
Recall: 0.6050420168067226


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.53846154 0.59487179 0.58974359 0.57948718 0.54871795]
Mean CV Accuracy: 0.5703


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5573770491803278
F1 Score: 0.5462184873949579
Precision: 0.5462184873949579
Recall: 0.5462184873949579


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [0.61538462 0.67692308 0.64102564 0.62564103 0.58461538]
Mean CV Accuracy: 0.6287


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6721311475409836
F1 Score: 0.6666666666666667
Precision: 0.6611570247933884
Recall: 0.6722689075630253


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.57435897 0.62564103 0.60512821 0.63076923 0.67692308]
Mean CV Accuracy: 0.6226


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6311475409836066
F1 Score: 0.6370967741935483
Precision: 0.6124031007751938
Recall: 0.6638655462184874


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.62051282 0.68717949 0.62051282 0.65128205 0.68717949]
Mean CV Accuracy: 0.6533


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6598360655737705
F1 Score: 0.6612244897959184
Precision: 0.6428571428571429
Recall: 0.680672268907563


## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Cross-Validation Scores: [0.60512821 0.6        0.57948718 0.52307692 0.58461538]
Mean CV Accuracy: 0.5785
Naive Bayes Accuracy: 0.639344262295082
Naive Bayes F1 Score: 0.6589147286821706
Naive Bayes Precision: 0.6115107913669064
Naive Bayes Recall: 0.7142857142857143


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.53333333 0.58974359 0.60512821 0.55897436 0.56923077]
Mean CV Accuracy: 0.5713
KNN Accuracy: 0.5327868852459017
KNN F1 Score: 0.5511811023622047
KNN Precision: 0.5185185185185185
KNN Recall: 0.5882352941176471
