In [None]:
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df1 = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/features1.csv")

df2 = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/features2.csv")
df2 = df2.sort_values(by=["Patient ID", "Lead ID"])

In [None]:
df1 = df1[df1["Patient ID"] < 25000]
df2 = df2[df2["Patient ID"] >= 25000]

In [None]:
df = pd.concat([df1, df2])
df.to_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/complete_dataset.csv")

## Run from Here

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/complete_dataset.csv")

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/features_full_list.csv")
df

Unnamed: 0,Patient ID,Lead ID,PR Ratio,RR Distance,P Energy,T Energy,PQ Distance,QT Interval,ST Slope,PR Slope,QRS Energy,RSQ,Heart Rate (HRS)
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403
...,...,...,...,...,...,...,...,...,...,...,...,...,...
570307,47525,8,0.832771,309.928571,-615.995863,649.065766,57.533333,0.616916,3.484673,0.164939,-1052.721749,395.867634,0.193593
570308,47525,9,0.462888,310.200000,-1552.638686,410.644315,59.562500,0.563749,3.611382,0.563236,-816.523270,411.403809,0.193424
570309,47525,10,0.233699,310.750000,2358.516977,4603.752741,54.000000,0.451596,4.218765,3.601259,2590.356548,678.898562,0.193081
570310,47525,11,0.056936,310.200000,129.063937,1902.409096,57.125000,0.443665,2.467003,5.799644,5176.252797,745.473912,0.193424


In [None]:
# Assuming df is your DataFrame
df.rename(columns={
    'Patient ID': 'patient_id',
    'Lead ID': 'lead_id',
    'PR Ratio': 'pr_ratio',
    'RR Distance': 'rr_distance',
    'P Energy': 'p_energy',
    'T Energy': 't_energy',
    'PQ Distance': 'pq_distance',
    'QT Interval': 'qt_interval',
    'ST Slope': 'st_slope',
    'PR Slope': 'pr_slope',
    'QRS Energy': 'qrs_energy',
    'RSQ': 'rsq',
    'Heart Rate (HRS)': 'heart_rate'
}, inplace=True)

In [None]:
# df.drop(["Unnamed: 0"], axis=1, inplace=True)
df["patient_id"] = df["patient_id"].astype("int")
df["lead_id"] = df["lead_id"].astype("int")

In [None]:
df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403
...,...,...,...,...,...,...,...,...,...,...,...,...,...
570307,47525,8,0.832771,309.928571,-615.995863,649.065766,57.533333,0.616916,3.484673,0.164939,-1052.721749,395.867634,0.193593
570308,47525,9,0.462888,310.200000,-1552.638686,410.644315,59.562500,0.563749,3.611382,0.563236,-816.523270,411.403809,0.193424
570309,47525,10,0.233699,310.750000,2358.516977,4603.752741,54.000000,0.451596,4.218765,3.601259,2590.356548,678.898562,0.193081
570310,47525,11,0.056936,310.200000,129.063937,1902.409096,57.125000,0.443665,2.467003,5.799644,5176.252797,745.473912,0.193424


In [None]:
label_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/label_downsampled.csv")
label_df.rename(columns={
    'id': 'patient_id'
  }, inplace = True
)
label_df

Unnamed: 0,patient_id,Obstructed
0,5,1
1,13,1
2,18,1
3,19,1
4,22,1
...,...,...
21639,22586,0
21640,31037,0
21641,14349,0
21642,45166,0


In [None]:
y = label_df["Obstructed"]
target_df = pd.DataFrame()
target_df["patient_id"] = list(label_df["patient_id"])
target_df["target"] = y
target_df["patient_id"] = target_df["patient_id"].astype("int")
target_df["target"] = target_df["target"].astype("int")

In [None]:
target_df["target"].value_counts()

1    10822
0    10822
Name: target, dtype: int64

In [None]:
# we have 2164 patients
# 20% of 2164 is approximately equals to 433 patients
# randomly generating 433 patients
import random
all_patients = list(label_df["patient_id"])
selected_patients = random.sample(all_patients, 433)

In [None]:
merged_df = pd.merge(df, target_df, on='patient_id', how='inner')

In [None]:
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403,0
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365,0
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259723,47524,8,0.111678,430.600000,-4708.010207,-3063.668425,28.454545,0.543850,0.563244,2.769434,15809.095292,849.619643,0.139340,0
259724,47524,9,-0.041636,430.600000,-4035.145231,19962.916163,51.272727,0.308238,6.358528,7.913448,6120.676380,811.098478,0.139340,0
259725,47524,10,-0.074027,430.500000,-7557.568669,18712.825254,51.909091,0.321825,7.217530,25.992243,25932.049990,2170.559253,0.139373,0
259726,47524,11,-0.051680,430.600000,-5577.490950,10692.711810,47.545455,0.322805,4.882626,27.577812,26714.108482,2191.407082,0.139340,0


In [None]:
merged_df.to_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")

# Run From Here

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [None]:
merged_df.head()

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,31770,7,0.489886,408.636364,-1555.769978,1310.823075,26.5,0.322618,3.223835,2.561711,2609.042727,153.029799,0.14683,0
1,17647,9,0.425142,304.5,-15601.454263,-18343.203043,50.333333,0.860427,2.061886,2.783323,21713.701376,797.639062,0.197044,0
2,9027,5,-0.024703,428.2,-1248.684162,-658.906301,44.363636,0.647319,0.869945,7.915466,19742.347798,847.189022,0.140121,1
3,40588,3,-0.413965,437.454545,-846.773679,4488.792182,81.0,0.269552,1.113399,1.796548,-5641.960259,1070.401507,0.137157,0
4,38755,7,-0.585446,426.363636,-9227.368443,-38.991778,70.25,0.291613,0.825473,0.420236,-1662.990885,304.706473,0.140725,0


In [None]:
merged_df.isna().sum()

patient_id         0
lead_id            0
pr_ratio        4692
rr_distance    10280
p_energy        4692
t_energy        4692
pq_distance        0
qt_interval    10280
st_slope        4692
pr_slope        4692
qrs_energy      4692
rsq             4692
heart_rate     10280
target             0
dtype: int64

In [None]:
merged_df.dropna(inplace=True)
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,31770,7,0.489886,408.636364,-1555.769978,1310.823075,26.500000,0.322618,3.223835,2.561711,2609.042727,153.029799,0.146830,0
1,17647,9,0.425142,304.500000,-15601.454263,-18343.203043,50.333333,0.860427,2.061886,2.783323,21713.701376,797.639062,0.197044,0
2,9027,5,-0.024703,428.200000,-1248.684162,-658.906301,44.363636,0.647319,0.869945,7.915466,19742.347798,847.189022,0.140121,1
3,40588,3,-0.413965,437.454545,-846.773679,4488.792182,81.000000,0.269552,1.113399,1.796548,-5641.960259,1070.401507,0.137157,0
4,38755,7,-0.585446,426.363636,-9227.368443,-38.991778,70.250000,0.291613,0.825473,0.420236,-1662.990885,304.706473,0.140725,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259723,5886,10,0.683731,498.222222,-5141.421529,13751.853560,56.100000,0.214563,2.065833,0.666093,4812.003225,9.386853,0.120428,1
259724,13133,1,0.221189,317.066667,647.189823,1084.190674,47.750000,0.391282,2.214913,1.021584,148.808936,257.244810,0.189235,1
259725,24115,9,-0.068022,319.928571,-1833.775275,334.188177,76.533333,0.409050,5.663669,4.831058,823.780915,1080.825610,0.187542,1
259726,14652,5,0.226930,353.545455,425.936012,884.498837,40.916667,0.428752,2.258475,2.290105,408.653367,370.141722,0.169709,1


In [None]:
train_data = merged_df[~merged_df['patient_id'].isin(selected_patients)]
test_data = merged_df[merged_df['patient_id'].isin(selected_patients)]

to_drop = ["patient_id", "lead_id"]
train_data.drop(to_drop, axis=1, inplace=True)
# test_data.drop(to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(to_drop, axis=1, inplace=True)


In [None]:
train_data = train_data.sample(frac=1).reset_index(drop=True)

In [None]:
X_train = train_data.drop("target", axis=1)
y_train = train_data.target
X_test = test_data.drop("target", axis=1)
y_test = test_data.target

In [None]:
len(X_train), len(X_test)

(19988, 4986)

In [None]:
columns_to_train = X_train.columns

## Using Random Search to find the best params to train the model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

# For Logistic Regression
logistic_regression_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],              # Regularization type
}

# For Decision Tree
decision_tree_params = {
    'criterion': ['gini', 'entropy'],       # Split criterion
    'max_depth': [None, 10, 20, 30, 40],    # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]           # Minimum samples required at a leaf node
}

# For Random Forest
random_forest_params = {
    'n_estimators': [100, 200, 300],         # Number of trees in the forest
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# For XGBoost
xgboost_params = {
    'learning_rate': [0.01, 0.1, 0.2],     # Step size shrinkage to prevent overfitting
    'max_depth': [3, 4, 5, 6],            # Maximum depth of the tree
    'subsample': [0.8, 0.9, 1.0],         # Fraction of samples used for training
    'colsample_bytree': [0.8, 0.9, 1.0],  # Fraction of features used for training
}

# # For SVM
# svm_params = {
#     'C': [0.1, 1, 10],                 # Regularization parameter
#     'kernel': ['linear', 'rbf'],       # Kernel type
#     'gamma': ['scale', 'auto']         # Kernel coefficient (scale or auto)
# }

models = {
    "Logistic Regression": (LogisticRegression(), logistic_regression_params),
    "Decision Tree": (DecisionTreeClassifier(), decision_tree_params),
    "Random Forest": (RandomForestClassifier(), random_forest_params),
    "XGBoost": (xgb.XGBClassifier(), xgboost_params),
    "SVM": (SVC(), svm_params),
}

best_models = {}
best_params = {}

for model_name, (model, params) in models.items():
    print(f"Searching for the best hyperparameters for {model_name}...")

    random_search = RandomizedSearchCV(
        model,
        params,
        n_iter=10,       # Number of random combinations to try
        cv=5,            # Number of cross-validation folds
        n_jobs=-1        # Use all available CPU cores
    )

    random_search.fit(X_train, y_train)

    best_models[model_name] = random_search.best_estimator_
    best_params[model_name] = random_search.best_params_

    print(f"Best hyperparameters for {model_name}:")
    print(random_search.best_params_)
    print()

Searching for the best hyperparameters for Logistic Regression...


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.52045375        nan 0.52027931     

Best hyperparameters for Logistic Regression:
{'penalty': 'l2', 'C': 1}

Searching for the best hyperparameters for Decision Tree...
Best hyperparameters for Decision Tree:
{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 40, 'criterion': 'entropy'}

Searching for the best hyperparameters for Random Forest...
Best hyperparameters for Random Forest:
{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'criterion': 'gini'}

Searching for the best hyperparameters for XGBoost...
Best hyperparameters for XGBoost:
{'subsample': 0.8, 'max_depth': 6, 'learning_rate': 0.2, 'colsample_bytree': 1.0}

Searching for the best hyperparameters for SVM...


KeyboardInterrupt: ignored

## Logistic Regression

In [None]:
logress_model = LogisticRegression(C=1,penalty='l2')
logress_model.fit(X_train, y_train)

In [None]:
log_pred = []
log_actual = []
for pt in selected_patients:
  smp = X_test[X_test["patient_id"] == pt]
  ones = 0
  zeros = 0
  for index, row in smp.iterrows():
    current_row_features = row[columns_to_train]
    prediction = logress_model.predict([current_row_features])[0]
    if ones == 0 and zeros == 0:
      first_lead = prediction
    if prediction == 1:
      ones += 1
    else:
      zeros += 1
  if ones == zeros:
    log_pred.append(first_lead)
  else:
    if ones > zeros:
      log_pred.append(1)
    else:
      log_pred.append(0)
  log_actual.append(test_data[test_data["patient_id"]==pt]["target"].iloc[0])

In [None]:
print("Testing Performance")
accuracy = accuracy_score(log_actual, log_pred)
f1 = f1_score(log_actual, log_pred)
precision = precision_score(log_actual, log_pred)
recall = recall_score(log_actual, log_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5011547344110855
F1 Score: 0.5909090909090909
Precision: 0.46846846846846846
Recall: 0.8


## Decision Trees

In [None]:
# modeling
decision_tree_model = DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=1, max_depth=40, criterion='entropy')
decision_tree_model.fit(X_train, y_train)

In [None]:
# modeling
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

In [None]:
dt_pred = []
dt_actual = []
for pt in selected_patients:
  smp = X_test[X_test["patient_id"] == pt]
  ones = 0
  zeros = 0
  for index, row in smp.iterrows():
    current_row_features = row[columns_to_train]
    prediction = decision_tree_model.predict([current_row_features])[0]
    if ones == 0 and zeros == 0:
      first_lead = prediction
    if prediction == 1:
      ones += 1
    else:
      zeros += 1
  if ones == zeros:
    dt_pred.append(first_lead)
  else:
    if ones > zeros:
      dt_pred.append(1)
    else:
      dt_pred.append(0)
  dt_actual.append(test_data[test_data["patient_id"]==pt]["target"].iloc[0])

In [None]:
print("Testing Performance")
accuracy = accuracy_score(dt_actual, dt_pred)
f1 = f1_score(dt_actual, dt_pred)
precision = precision_score(dt_actual, dt_pred)
recall = recall_score(dt_actual, dt_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5196304849884527
F1 Score: 0.4975845410628019
Precision: 0.4703196347031963
Recall: 0.5282051282051282


## Random Forest

In [None]:
random_forest_model = RandomForestClassifier(n_estimators= 300, min_samples_split= 5, min_samples_leaf= 2, max_depth= None, criterion='gini')
random_forest_model.fit(X_train, y_train)

In [None]:
rf_pred = []
rf_actual = []
for pt in selected_patients:
  smp = X_test[X_test["patient_id"] == pt]
  ones = 0
  zeros = 0
  for index, row in smp.iterrows():
    current_row_features = row[columns_to_train]
    prediction = random_forest_model.predict([current_row_features])[0]
    if ones == 0 and zeros == 0:
      first_lead = prediction
    if prediction == 1:
      ones += 1
    else:
      zeros += 1
  if ones == zeros:
    rf_pred.append(first_lead)
  else:
    if ones > zeros:
      rf_pred.append(1)
    else:
      rf_pred.append(0)
  rf_actual.append(test_data[test_data["patient_id"]==pt]["target"].iloc[0])

In [None]:
print("Testing Performance")
accuracy = accuracy_score(rf_actual, rf_pred)
f1 = f1_score(rf_actual, rf_pred)
precision = precision_score(rf_actual, rf_pred)
recall = recall_score(rf_actual, rf_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5080831408775982
F1 Score: 0.5058004640371229
Precision: 0.461864406779661
Recall: 0.558974358974359


## XGBoost

In [None]:
# modeling
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)
xgboost_model.fit(X_train, y_train)

In [None]:
xgb_pred = []
xgb_actual = []
for pt in selected_patients:
  smp = X_test[X_test["patient_id"] == pt]
  ones = 0
  zeros = 0
  for index, row in smp.iterrows():
    current_row_features = row[columns_to_train]
    prediction = xgboost_model.predict([current_row_features])[0]
    if ones == 0 and zeros == 0:
      first_lead = prediction
    if prediction == 1:
      ones += 1
    else:
      zeros += 1
  if ones == zeros:
    xgb_pred.append(first_lead)
  else:
    if ones > zeros:
      xgb_pred.append(1)
    else:
      xgb_pred.append(0)
  xgb_actual.append(test_data[test_data["patient_id"]==pt]["target"].iloc[0])

In [None]:
print("Testing Performance")
accuracy = accuracy_score(xgb_actual, xgb_pred)
f1 = f1_score(xgb_actual, xgb_pred)
precision = precision_score(xgb_actual, xgb_pred)
recall = recall_score(xgb_actual, xgb_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5496535796766744
F1 Score: 0.5496535796766744
Precision: 0.5409090909090909
Recall: 0.5586854460093896


## SVM

In [None]:
# modeling
svm_model = SVC(kernel= 'rbf', gamma= 'auto', C= 1)
svm_model.fit(X_train, y_train)

In [None]:
svm_pred = []
svm_actual = []
for pt in selected_patients:
  smp = X_test[X_test["patient_id"] == pt]
  ones = 0
  zeros = 0
  for index, row in smp.iterrows():
    current_row_features = row[columns_to_train]
    prediction = svm_model.predict([current_row_features])[0]
    if ones == 0 and zeros == 0:
      first_lead = prediction
    if prediction == 1:
      ones += 1
    else:
      zeros += 1
  if ones == zeros:
    svm_pred.append(first_lead)
  else:
    if ones > zeros:
      svm_pred.append(1)
    else:
      svm_pred.append(0)
  svm_actual.append(test_data[test_data["patient_id"]==pt]["target"].iloc[0])

In [None]:
print("Testing Performance")
accuracy = accuracy_score(svm_actual, svm_pred)
f1 = f1_score(svm_actual, svm_pred)
precision = precision_score(svm_actual, svm_pred)
recall = recall_score(svm_actual, svm_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.49191685912240185
F1 Score: 0.6594427244582043
Precision: 0.49191685912240185
Recall: 1.0


# Training and Testing on Complete Data

In [None]:
X = merged_df.drop("target", axis=1)
y = merged_df.target
X.drop(["patient_id", "lead_id"], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(19979, 4995)

## Logistic Regression

In [None]:
logress_model = LogisticRegression()
logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5277277277277277
F1 Score: 0.5288595965648094
Precision: 0.5336557839580814
Recall: 0.5241488519398259


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.606006006006006
F1 Score: 0.6065573770491803
Precision: 0.6126817447495961
Recall: 0.60055423594616


## Random Forest

In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.648048048048048
F1 Score: 0.6400491400491399
Precision: 0.6628498727735369
Recall: 0.6187648456057007


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)
xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6088088088088088
F1 Score: 0.6049332794177114
Precision: 0.6181818181818182
Recall: 0.5922406967537609


## SVM

In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5541541541541541
F1 Score: 0.49740464906341675
Precision: 0.5784776902887139
Recall: 0.43626286619160726


## Standard Normalization

In [None]:
scaler = StandardScaler()

In [None]:
X = merged_df.drop("target", axis=1)
y = merged_df.target
X.drop(["patient_id", "lead_id"], axis=1, inplace=True)

# Correctly scale the features using MinMaxScaler
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [None]:
logress_model = LogisticRegression()
logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

print("Testing Performance")
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5245245245245245
F1 Score: 0.47767758961952933
Precision: 0.537357743691242
Recall: 0.42992874109263657


## Decision Tree

In [None]:
# modeling
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

# predictions
y_pred = decision_tree_model.predict(X_test)
y_train_pred = decision_tree_model.predict(X_train)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.607007007007007
F1 Score: 0.6088862323171946
Precision: 0.6129161652627356
Recall: 0.6049089469517023


## Random Forest

In [None]:
# modeling
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

# predictions
y_pred = random_forest_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6616616616616616
F1 Score: 0.6515463917525773
Precision: 0.6798623063683304
Recall: 0.6254948535233571


## XGBoost

In [None]:
# modeling
xgboost_model = xgb.XGBClassifier()
xgboost_model.fit(X_train, y_train)

# predictions
y_pred = xgboost_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.624024024024024
F1 Score: 0.6221327967806841
Precision: 0.632569558101473
Recall: 0.6120348376880443


## SVM

In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5541541541541541
F1 Score: 0.49740464906341675
Precision: 0.5784776902887139
Recall: 0.43626286619160726


## Minmax Normalization

In [None]:
scaler = MinMaxScaler()

In [None]:
X = merged_df.drop("target", axis=1)
y = merged_df.target
X.drop(["patient_id", "lead_id"], axis=1, inplace=True)

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [None]:
logress_model = LogisticRegression()
logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

print("Testing Performance")
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5255255255255256
F1 Score: 0.493373236425823
Precision: 0.5362453531598513
Recall: 0.4568487727632621


## Decision Tree

In [None]:
# modeling
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

# predictions
y_pred = decision_tree_model.predict(X_test)
y_train_pred = decision_tree_model.predict(X_train)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.61001001001001
F1 Score: 0.6141045958795562
Precision: 0.6145915939730373
Recall: 0.613618368962787


## Random Forest

In [None]:
# modeling
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

# predictions
y_pred = random_forest_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6474474474474474
F1 Score: 0.6393610485357363
Precision: 0.6622825625795503
Recall: 0.6179730799683294


## XGBoost

In [None]:
# modeling
xgboost_model = xgb.XGBClassifier()
xgboost_model.fit(X_train, y_train)

# predictions
y_pred = xgboost_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6044044044044043
F1 Score: 0.6020942408376964
Precision: 0.6127049180327869
Recall: 0.5918448139350753


## SVM

In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5349349349349349
F1 Score: 0.480661748267382
Precision: 0.5521314843348741
Recall: 0.4255740300870942


# Models on all data using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df.dropna(inplace=True)

In [None]:
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403,0
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365,0
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259723,47524,8,0.111678,430.600000,-4708.010207,-3063.668425,28.454545,0.543850,0.563244,2.769434,15809.095292,849.619643,0.139340,0
259724,47524,9,-0.041636,430.600000,-4035.145231,19962.916163,51.272727,0.308238,6.358528,7.913448,6120.676380,811.098478,0.139340,0
259725,47524,10,-0.074027,430.500000,-7557.568669,18712.825254,51.909091,0.321825,7.217530,25.992243,25932.049990,2170.559253,0.139373,0
259726,47524,11,-0.051680,430.600000,-5577.490950,10692.711810,47.545455,0.322805,4.882626,27.577812,26714.108482,2191.407082,0.139340,0


In [None]:
merged_df["target"].value_counts()

0    125209
1    124239
Name: target, dtype: int64

In [None]:
# merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate
0,-0.021959,-0.632198,0.030636,0.391091,-0.323556,0.072251,0.204661,-0.084032,-0.332012,-0.157438,1.277397
1,-0.034509,-0.145963,-0.625410,-1.118367,0.051764,0.077811,-0.805693,0.795716,1.326562,-0.069169,-0.026489
2,-0.015083,-0.631458,0.619298,-0.534475,2.211873,2.101917,-0.597180,-0.136907,0.390455,-0.087491,1.274814
3,-0.036482,0.018358,-1.138662,2.313572,1.089435,-0.391232,1.610026,0.969630,-0.128147,1.875189,-0.339754
4,0.015563,-0.024404,0.846229,0.058588,-1.066170,-0.193872,-0.401265,-0.338727,-0.156150,-0.572025,-0.262720
...,...,...,...,...,...,...,...,...,...,...,...
249443,-0.025302,0.092777,0.252038,-0.440426,0.378544,-0.004996,-0.337793,0.400275,0.272237,-0.092808,-0.467061
249444,-0.008748,-0.218557,-0.114648,0.381037,0.452117,0.108601,-0.336941,-0.613323,-0.050839,-0.750718,0.128191
249445,-0.023831,-0.074319,0.191360,-0.224785,0.189355,0.186193,0.014364,0.117104,-0.046560,0.228276,-0.168938
249446,-0.010359,-0.118576,0.796083,-0.612869,-0.045697,0.469236,0.562702,0.113323,-0.538718,0.965054,-0.082082


# K-Fold

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(199558, 49890)

In [None]:
y.value_counts()

0    125209
1    124239
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.53359892 0.53106835 0.53207056 0.52722307 0.53108166]
Mean CV Accuracy: 0.5310


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5304670274604129
F1 Score: 0.5003519399355844
Precision: 0.5297651309846432
Recall: 0.47403305985531263


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

KeyboardInterrupt: ignored

In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.61722488 0.62260766 0.61064593 0.61363636 0.61812201]
Mean CV Accuracy: 0.6164


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6142071274814638
F1 Score: 0.6127250900360144
Precision: 0.610818573480134
Recall: 0.6146435452793835


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.55419423 0.55028563 0.5575015  0.55250432 0.55012403]
Mean CV Accuracy: 0.5529


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5542794147123672
F1 Score: 0.5376057890249735
Precision: 0.5572222940643994
Recall: 0.5193234774224651


## Naive Bayes

In [None]:
# Create Naive Bayes model
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

Cross-Validation Scores: [0.50939567 0.50298156 0.50884446 0.50580041 0.50610107]
Mean CV Accuracy: 0.5066


In [None]:
# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Naive Bayes Accuracy: 0.5065143315293646
Naive Bayes F1 Score: 0.1273835684411994
Naive Bayes Precision: 0.5177182368193605
Naive Bayes Recall: 0.07262660146304005


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.53462618 0.53257166 0.53079274 0.5297537  0.53243467]
Mean CV Accuracy: 0.5320
KNN Accuracy: 0.538264181198637
KNN F1 Score: 0.532624573932803
KNN Precision: 0.5347728661641882
KNN Recall: 0.5304934729014267


# Models on only Lead 1 using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df.dropna(inplace=True)

In [None]:
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403,0
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365,0
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259723,47524,8,0.111678,430.600000,-4708.010207,-3063.668425,28.454545,0.543850,0.563244,2.769434,15809.095292,849.619643,0.139340,0
259724,47524,9,-0.041636,430.600000,-4035.145231,19962.916163,51.272727,0.308238,6.358528,7.913448,6120.676380,811.098478,0.139340,0
259725,47524,10,-0.074027,430.500000,-7557.568669,18712.825254,51.909091,0.321825,7.217530,25.992243,25932.049990,2170.559253,0.139373,0
259726,47524,11,-0.051680,430.600000,-5577.490950,10692.711810,47.545455,0.322805,4.882626,27.577812,26714.108482,2191.407082,0.139340,0


In [None]:
merged_df["target"].value_counts()

0    125209
1    124239
Name: target, dtype: int64

In [None]:
merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate
0,-0.179562,0.103146,-1.305091,0.779974,-1.596412,-0.577864,-0.013399,2.485075,0.976448,0.783038,-0.482557
1,-0.188358,-0.248703,-0.553777,-0.291508,0.484000,0.107614,-0.366425,-0.465152,-0.560475,-0.658229,0.235811
2,-0.168633,-0.062389,-0.882314,1.374243,0.473725,-0.387032,0.936605,1.377436,0.984808,1.788023,-0.170582
3,-0.194070,1.070046,-0.146856,-0.648573,0.148325,-0.903802,-1.019622,-1.006021,-0.609826,-1.340604,-1.724283
4,-0.020590,0.338979,0.120028,1.618830,-0.172336,-0.198012,0.199238,0.708614,0.569698,0.775007,-0.863839
...,...,...,...,...,...,...,...,...,...,...,...
20896,-0.076170,-0.118846,-0.181683,-1.149890,0.006484,0.221341,-0.996654,0.157611,0.386078,-0.183388,-0.054196
20897,0.149971,-0.263153,-0.591277,-0.757209,0.085439,-0.272219,-1.254914,-0.327532,0.588491,-0.553824,0.270180
20898,-0.206109,-0.282587,-1.062786,2.177935,2.494650,-0.337626,1.506141,0.214523,0.585417,1.172471,0.317112
20899,-0.071387,-0.636201,-0.601570,0.142946,-1.292272,0.368314,-0.017484,0.549049,-0.062587,-0.120235,1.343427


# K-Fold

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(16720, 4181)

In [None]:
y.value_counts()

0    10487
1    10414
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.62200957 0.6312799  0.63456938 0.63277512 0.63696172]
Mean CV Accuracy: 0.6315


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6331021286773499
F1 Score: 0.6465437788018432
Precision: 0.6196996466431095
Recall: 0.6758188824662813


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.55801435 0.55173445 0.5583134  0.56130383 0.56399522]
Mean CV Accuracy: 0.5587


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5654149724946185
F1 Score: 0.5607928450568045
Precision: 0.5628335759340126
Recall: 0.558766859344894


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [0.63277512 0.62649522 0.61483254 0.61692584 0.62978469]
Mean CV Accuracy: 0.6242


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6125328868691701
F1 Score: 0.6062226543509965
Precision: 0.611874386653582
Recall: 0.600674373795761


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.61722488 0.62260766 0.61064593 0.61363636 0.61812201]
Mean CV Accuracy: 0.6164


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6142071274814638
F1 Score: 0.6127250900360144
Precision: 0.610818573480134
Recall: 0.6146435452793835


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

Cross-Validation Scores: [0.55419423 0.55028563 0.5575015  0.55250432 0.55012403]
Mean CV Accuracy: 0.5529


In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.5542794147123672
F1 Score: 0.5376057890249735
Precision: 0.5572222940643994
Recall: 0.5193234774224651


## Naive Bayes

In [None]:
# Create Naive Bayes model
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

Cross-Validation Scores: [0.61752392 0.62799043 0.60675837 0.61004785 0.63187799]
Mean CV Accuracy: 0.6188


In [None]:
# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Naive Bayes Accuracy: 0.6142071274814638
Naive Bayes F1 Score: 0.5767515087903439
Naive Bayes Precision: 0.6334293948126801
Naive Bayes Recall: 0.529383429672447


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.59748804 0.5882177  0.58313397 0.58492823 0.59300239]
Mean CV Accuracy: 0.5894
KNN Accuracy: 0.5852666826118154
KNN F1 Score: 0.5877318116975748
KNN Precision: 0.5802816901408451
KNN Recall: 0.5953757225433526


# Models on only Lead 1 using Minmax Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df.dropna(inplace=True)

In [None]:
merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate
0,0.290659,0.037517,0.427689,0.264342,0.430223,0.096547,0.502070,0.063044,0.175424,0.127022,0.474014
1,0.287616,0.080380,0.361093,0.338398,0.293513,0.065945,0.495871,0.162210,0.227163,0.151043,0.286679
2,0.288044,0.073971,0.384451,0.397976,0.736673,0.088727,0.522997,0.086007,0.181264,0.167651,0.305438
3,0.289138,0.060534,0.420337,0.276854,0.461902,0.092741,0.481098,0.101081,0.230698,0.129709,0.352822
4,0.288779,0.075399,0.400101,0.264198,0.457932,0.084228,0.480883,0.097749,0.239405,0.134714,0.301074
...,...,...,...,...,...,...,...,...,...,...,...
20896,0.287518,0.055912,0.360924,0.374617,0.456647,0.082648,0.522391,0.155248,0.217953,0.207587,0.372308
20897,0.290757,0.052231,0.405962,0.318400,0.348748,0.080723,0.500952,0.050724,0.171047,0.103248,0.389285
20898,0.288564,0.092417,0.419775,0.328252,0.360308,0.054702,0.530226,0.240749,0.283585,0.251422,0.256491
20899,0.287724,0.059457,0.373661,0.319576,0.841303,0.079047,0.521614,0.116826,0.268448,0.211118,0.357195


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(16720, 4181)

In [None]:
y.value_counts()

0    10487
1    10414
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()
logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6170772542453958
F1 Score: 0.6244428806005161
Precision: 0.6187819618781962
Recall: 0.6302083333333334


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5680459220282229
F1 Score: 0.5708174904942966
Precision: 0.5729961832061069
Recall: 0.568655303030303


## Random Forest

In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6187514948576895
F1 Score: 0.61083984375
Precision: 0.6305443548387096
Recall: 0.5923295454545454


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)
xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6151638364027745
F1 Score: 0.6084205402774396
Precision: 0.6259389083625438
Recall: 0.5918560606060606


## SVM

In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, zero_division=1)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Testing Performance
Accuracy: 0.6266443434585027
F1 Score: 0.6193611314313582
Precision: 0.6385118149824032
Recall: 0.6013257575757576


## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Naive Bayes Accuracy: 0.5845491509208324
Naive Bayes F1 Score: 0.4133738601823708
Naive Bayes Precision: 0.6954545454545454
Naive Bayes Recall: 0.2940893801057184


## KNN

In [None]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

KNN Accuracy: 0.5845491509208324
KNN F1 Score: 0.5823515268093292
KNN Precision: 0.582771896053898
KNN Recall: 0.5819317635752043


# Models trained on Lead 1+2+3 = 33 features using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
# merged_df.dropna(inplace=True)

In [None]:
merged_df = merged_df[merged_df["lead_id"] < 4]
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403,0
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
12,4,1,-0.091575,332.000000,-3999.382812,-2061.834799,71.333333,0.532731,1.491566,8.076431,20535.170781,1033.509926,0.180723,0
13,4,2,0.296147,331.769231,-7167.827790,1566.689852,36.928571,0.816402,-1.145948,2.028150,10852.507374,968.754767,0.180849,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259705,47521,2,0.128247,459.000000,370.909700,2833.342624,58.818182,0.509606,-0.197633,1.995816,6469.176096,348.359882,0.130719,1
259706,47521,3,0.399139,417.181818,-55.789648,1504.684087,61.500000,0.597461,0.274089,0.959021,4836.448568,76.747656,0.143822,1
259716,47524,1,-0.015424,430.600000,-1914.514844,6371.740016,45.818182,0.320272,3.076070,15.075074,13590.186861,1245.427496,0.139340,0
259717,47524,2,-0.001471,430.600000,-2186.688464,7622.288910,49.363636,0.327028,3.367934,13.725599,15021.787317,1211.220556,0.139340,0


In [None]:
merged_df.columns

Index(['patient_id', 'lead_id', 'pr_ratio', 'rr_distance', 'p_energy',
       't_energy', 'pq_distance', 'qt_interval', 'st_slope', 'pr_slope',
       'qrs_energy', 'rsq', 'heart_rate', 'target'],
      dtype='object')

In [None]:
pivoted_df = merged_df.pivot(index='patient_id', columns='lead_id',\
                             values=['pr_ratio', 'rr_distance', 'p_energy',\
                                     't_energy', 'pq_distance', 'qt_interval',\
                                     'st_slope', 'pr_slope', 'qrs_energy',\
                                     'rsq', 'heart_rate'])
pivoted_df.sort_index(axis=1, level=1, inplace=True)


In [None]:
pivoted_df

Unnamed: 0_level_0,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.165403,-1487.032658,42.153846,0.003317,11.827294,10227.841732,0.374914,362.750000,979.097871,2.724239,...,689.017622,70.777778,0.287826,0.495643,-1191.790501,0.141953,544.000000,214.415848,0.136529,2233.798276
4,0.180723,-3999.382812,71.333333,-0.091575,8.076431,20535.170781,0.532731,332.000000,1033.509926,1.491566,...,-5940.218272,41.857143,0.475299,1.916687,18051.306553,0.967074,332.076923,1747.880389,1.438517,-576.275159
5,0.200624,36.091650,50.375000,0.108335,3.502207,3550.596526,0.415250,299.066667,372.220884,0.872515,...,3894.547168,50.000000,0.683585,1.217461,2371.776144,0.760948,299.133333,447.791532,0.995743,3189.392215
7,0.157446,-162.183585,50.307692,0.063845,7.744824,8720.964097,0.372824,381.083333,784.383705,1.223054,...,3875.693810,29.538462,0.702479,1.547171,1699.281508,0.397537,381.000000,-7.480529,0.567869,-681.318716
8,0.141358,1080.434961,45.166667,0.050815,12.549471,18074.595536,0.510459,424.454545,1006.832571,1.215399,...,2092.451116,37.923077,0.564450,0.993899,3826.855537,0.223501,388.916667,-63.982332,-1.057665,3874.589784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47511,0.116631,1785.341462,64.400000,0.364875,2.062872,-1548.806607,0.330842,514.444444,875.351027,6.380494,...,-1469.583962,84.900000,0.060790,4.420605,8282.794040,0.139762,514.444444,265.329397,-13.107481,5718.991161
47515,0.149524,-493.991546,49.083333,0.026206,4.726638,2053.999423,0.348682,401.272727,650.805283,4.243753,...,1117.181138,61.583333,0.290929,0.797010,1669.768964,0.462247,393.909091,225.186551,1.489929,1024.368276
47520,0.176248,-75.453028,46.266667,0.063902,9.228074,8703.798854,0.364247,340.428571,850.242574,1.664502,...,1017.120894,56.142857,0.770993,0.181743,-107.816861,0.378365,366.615385,151.022513,1.692909,-15.382207
47521,0.156658,-165.705194,87.307692,-0.014284,1.445051,393.744334,0.374172,383.000000,351.533843,3.065497,...,-55.789648,61.500000,0.399139,0.959021,4836.448568,0.597461,417.181818,76.747656,0.274089,1504.684087


In [None]:
pivoted_df = pivoted_df.reset_index()
pivoted_df

Unnamed: 0_level_0,patient_id,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,Unnamed: 1_level_1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
0,0,0.165403,-1487.032658,42.153846,0.003317,11.827294,10227.841732,0.374914,362.750000,979.097871,...,689.017622,70.777778,0.287826,0.495643,-1191.790501,0.141953,544.000000,214.415848,0.136529,2233.798276
1,4,0.180723,-3999.382812,71.333333,-0.091575,8.076431,20535.170781,0.532731,332.000000,1033.509926,...,-5940.218272,41.857143,0.475299,1.916687,18051.306553,0.967074,332.076923,1747.880389,1.438517,-576.275159
2,5,0.200624,36.091650,50.375000,0.108335,3.502207,3550.596526,0.415250,299.066667,372.220884,...,3894.547168,50.000000,0.683585,1.217461,2371.776144,0.760948,299.133333,447.791532,0.995743,3189.392215
3,7,0.157446,-162.183585,50.307692,0.063845,7.744824,8720.964097,0.372824,381.083333,784.383705,...,3875.693810,29.538462,0.702479,1.547171,1699.281508,0.397537,381.000000,-7.480529,0.567869,-681.318716
4,8,0.141358,1080.434961,45.166667,0.050815,12.549471,18074.595536,0.510459,424.454545,1006.832571,...,2092.451116,37.923077,0.564450,0.993899,3826.855537,0.223501,388.916667,-63.982332,-1.057665,3874.589784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21639,47511,0.116631,1785.341462,64.400000,0.364875,2.062872,-1548.806607,0.330842,514.444444,875.351027,...,-1469.583962,84.900000,0.060790,4.420605,8282.794040,0.139762,514.444444,265.329397,-13.107481,5718.991161
21640,47515,0.149524,-493.991546,49.083333,0.026206,4.726638,2053.999423,0.348682,401.272727,650.805283,...,1117.181138,61.583333,0.290929,0.797010,1669.768964,0.462247,393.909091,225.186551,1.489929,1024.368276
21641,47520,0.176248,-75.453028,46.266667,0.063902,9.228074,8703.798854,0.364247,340.428571,850.242574,...,1017.120894,56.142857,0.770993,0.181743,-107.816861,0.378365,366.615385,151.022513,1.692909,-15.382207
21642,47521,0.156658,-165.705194,87.307692,-0.014284,1.445051,393.744334,0.374172,383.000000,351.533843,...,-55.789648,61.500000,0.399139,0.959021,4836.448568,0.597461,417.181818,76.747656,0.274089,1504.684087


In [None]:
merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')

  merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')


In [None]:
merged_df.dropna(inplace=True)

In [None]:
merged_df.columns

Index([      'patient_id', ('patient_id', ''),  ('heart_rate', 1),
          ('p_energy', 1), ('pq_distance', 1),    ('pr_ratio', 1),
          ('pr_slope', 1),  ('qrs_energy', 1), ('qt_interval', 1),
       ('rr_distance', 1),         ('rsq', 1),    ('st_slope', 1),
          ('t_energy', 1),  ('heart_rate', 2),    ('p_energy', 2),
       ('pq_distance', 2),    ('pr_ratio', 2),    ('pr_slope', 2),
        ('qrs_energy', 2), ('qt_interval', 2), ('rr_distance', 2),
               ('rsq', 2),    ('st_slope', 2),    ('t_energy', 2),
        ('heart_rate', 3),    ('p_energy', 3), ('pq_distance', 3),
          ('pr_ratio', 3),    ('pr_slope', 3),  ('qrs_energy', 3),
       ('qt_interval', 3), ('rr_distance', 3),         ('rsq', 3),
          ('st_slope', 3),    ('t_energy', 3),           'target'],
      dtype='object')

In [None]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

merged_df = merged_df.iloc[:,2:]

X = merged_df.drop(["target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,"(heart_rate, 1)","(p_energy, 1)","(pq_distance, 1)","(pr_ratio, 1)","(pr_slope, 1)","(qrs_energy, 1)","(qt_interval, 1)","(rr_distance, 1)","(rsq, 1)","(st_slope, 1)",...,"(p_energy, 3)","(pq_distance, 3)","(pr_ratio, 3)","(pr_slope, 3)","(qrs_energy, 3)","(qt_interval, 3)","(rr_distance, 3)","(rsq, 3)","(st_slope, 3)","(t_energy, 3)"
0,0.523034,-0.812381,0.981284,-0.331000,-0.992322,-0.757664,0.423502,-0.387831,-1.158431,-0.649839,...,-0.152505,1.069540,-0.014506,-0.211816,-0.094397,-0.319282,-0.153809,-0.358222,-0.239254,0.358891
1,0.834087,-1.135405,-0.753005,-0.172118,3.292096,2.346079,0.437364,-0.511423,2.580251,0.164160,...,1.207782,-0.967424,0.000886,-0.491058,-1.190964,-0.135046,-0.523218,1.065033,2.698497,-0.006835
2,-0.732273,-1.028199,-0.259538,-0.162750,0.106277,0.943999,-0.343269,0.288604,-0.018618,-0.369930,...,0.717678,-0.437923,0.001761,-0.471759,-0.499012,-1.101995,-0.021021,-0.036447,-0.130953,1.238337
3,0.569162,-2.465627,1.948140,0.432739,-1.472063,-0.521674,0.897072,-0.407012,-2.093720,-2.725149,...,-0.322425,-0.444403,-0.014901,0.511619,0.378248,-0.038079,-0.394073,-0.033298,-0.288029,-0.161370
4,-0.006519,2.624326,0.345757,0.161518,0.507894,0.337917,0.539059,-0.142899,1.693233,0.777469,...,-0.443945,-0.385495,-0.009691,-0.240348,-0.803253,-0.459188,-0.308636,0.662329,1.902295,-0.505850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19941,-1.258109,-0.408785,-1.547640,-0.083777,2.800708,1.131607,-0.905737,0.702313,1.331923,-0.226370,...,-0.822762,-0.890797,-0.019003,0.086359,-0.003667,-0.654644,0.239323,-0.701542,-0.324786,-0.049244
19942,-1.251237,-0.846818,2.862580,-0.194453,0.297878,0.645408,-1.098462,0.696183,1.467213,0.460800,...,-0.119009,-0.768398,-0.015354,-0.273343,-0.713471,-0.818583,0.251840,-0.044645,0.046525,-0.501483
19943,0.196117,0.303844,-0.544414,0.933407,-1.572812,-0.690296,1.216155,-0.242377,-2.169598,-1.243995,...,0.467919,2.066563,-0.010179,-0.534902,-0.888706,-0.371126,-0.355238,0.992161,-0.509347,3.233598
19944,2.241309,1.940915,-0.559489,0.300001,-0.512338,-0.842442,1.172805,-0.940218,0.189226,0.768428,...,-0.277874,-0.282405,-0.014400,0.342191,0.176229,2.403747,-0.716499,-0.090082,-0.415601,-0.631674


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(15956, 3990)

In [None]:
y.value_counts()

0    10024
1     9922
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.61967419 0.6295832  0.64431213 0.64556565 0.64619242]
Mean CV Accuracy: 0.6371


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6446115288220552
F1 Score: 0.6472636815920398
Precision: 0.6340155945419104
Recall: 0.6610772357723578


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.57393484 0.54277656 0.56345973 0.56377311 0.5756816 ]
Mean CV Accuracy: 0.5639


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.562155388471178
F1 Score: 0.5622650964670509
Precision: 0.5546218487394958
Recall: 0.5701219512195121


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

KeyboardInterrupt: ignored

In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Cross-Validation Scores: [0.56328321 0.52209339 0.5656534  0.52742087 0.53118145]
Mean CV Accuracy: 0.5419
Naive Bayes Accuracy: 0.5340852130325815
Naive Bayes F1 Score: 0.6693935621554329
Naive Bayes Precision: 0.5149110807113543
Naive Bayes Recall: 0.9563008130081301


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.59116541 0.58602319 0.58790348 0.5976183  0.60075212]
Mean CV Accuracy: 0.5927
KNN Accuracy: 0.5927318295739349
KNN F1 Score: 0.5903705570960424
KNN Precision: 0.5857928964482241
KNN Recall: 0.595020325203252


# Models trained on Lead 1+2+3 ...+12 = 132 features using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
# merged_df.dropna(inplace=True)

In [None]:
# merged_df = merged_df[merged_df["lead_id"] < 4]
merged_df

Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,1,0.003317,362.750000,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,0,2,-0.029219,362.750000,-1827.990737,2519.254928,46.230769,0.379791,2.025500,9.056950,7096.289801,774.923781,0.165403,0
2,0,3,0.287826,544.000000,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.197590,22.108417,0.979721,-3155.604773,705.472184,0.165365,0
4,0,5,0.033344,362.750000,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259723,47524,8,0.111678,430.600000,-4708.010207,-3063.668425,28.454545,0.543850,0.563244,2.769434,15809.095292,849.619643,0.139340,0
259724,47524,9,-0.041636,430.600000,-4035.145231,19962.916163,51.272727,0.308238,6.358528,7.913448,6120.676380,811.098478,0.139340,0
259725,47524,10,-0.074027,430.500000,-7557.568669,18712.825254,51.909091,0.321825,7.217530,25.992243,25932.049990,2170.559253,0.139373,0
259726,47524,11,-0.051680,430.600000,-5577.490950,10692.711810,47.545455,0.322805,4.882626,27.577812,26714.108482,2191.407082,0.139340,0


In [None]:
merged_df.columns

Index(['patient_id', 'lead_id', 'pr_ratio', 'rr_distance', 'p_energy',
       't_energy', 'pq_distance', 'qt_interval', 'st_slope', 'pr_slope',
       'qrs_energy', 'rsq', 'heart_rate', 'target'],
      dtype='object')

In [None]:
pivoted_df = merged_df.pivot(index='patient_id', columns='lead_id',\
                             values=['pr_ratio', 'rr_distance', 'p_energy',\
                                     't_energy', 'pq_distance', 'qt_interval',\
                                     'st_slope', 'pr_slope', 'qrs_energy',\
                                     'rsq', 'heart_rate'])
pivoted_df.sort_index(axis=1, level=1, inplace=True)


In [None]:
pivoted_df

Unnamed: 0_level_0,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,1,1,1,1,1,1,1,1,1,1,...,12,12,12,12,12,12,12,12,12,12
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.165403,-1487.032658,42.153846,0.003317,11.827294,10227.841732,0.374914,362.750000,979.097871,2.724239,...,-1525.377198,46.846154,-0.016007,10.582440,9493.335105,0.380427,362.750000,882.649794,1.566013,1981.515161
4,0.180723,-3999.382812,71.333333,-0.091575,8.076431,20535.170781,0.532731,332.000000,1033.509926,1.491566,...,-8481.757693,51.266667,0.467205,1.031990,633.005789,0.303614,332.000000,359.459836,5.616999,16214.963795
5,0.200624,36.091650,50.375000,0.108335,3.502207,3550.596526,0.415250,299.066667,372.220884,0.872515,...,501.389153,47.250000,0.086495,9.623550,8335.146910,0.483271,299.133333,995.452288,2.179332,101.716769
7,0.157446,-162.183585,50.307692,0.063845,7.744824,8720.964097,0.372824,381.083333,784.383705,1.223054,...,1368.491775,50.692308,0.080108,9.501572,9486.827378,0.399670,381.083333,1056.452679,2.555984,827.486204
8,0.141358,1080.434961,45.166667,0.050815,12.549471,18074.595536,0.510459,424.454545,1006.832571,1.215399,...,-105.566789,54.071429,0.019773,10.992872,14615.976746,0.604924,359.076923,960.467586,1.068786,1863.862564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47511,0.116631,1785.341462,64.400000,0.364875,2.062872,-1548.806607,0.330842,514.444444,875.351027,6.380494,...,-1236.556205,69.900000,0.024962,4.429886,1759.900692,0.332786,514.444444,920.249353,5.553257,15228.660033
47515,0.149524,-493.991546,49.083333,0.026206,4.726638,2053.999423,0.348682,401.272727,650.805283,4.243753,...,770.100233,54.333333,0.155896,3.792664,2997.687612,0.348682,401.272727,569.290774,2.871507,6151.539165
47520,0.176248,-75.453028,46.266667,0.063902,9.228074,8703.798854,0.364247,340.428571,850.242574,1.664502,...,-983.349033,49.666667,-0.000896,11.247536,9571.013065,0.347601,340.428571,1061.442247,2.798689,-10.529457
47521,0.156658,-165.705194,87.307692,-0.014284,1.445051,393.744334,0.374172,383.000000,351.533843,3.065497,...,-217.639142,53.500000,0.173418,1.948655,1698.935124,0.478024,353.538462,328.710587,2.256374,3109.561910


In [None]:
pivoted_df = pivoted_df.reset_index()
pivoted_df

Unnamed: 0_level_0,patient_id,heart_rate,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,...,p_energy,pq_distance,pr_ratio,pr_slope,qrs_energy,qt_interval,rr_distance,rsq,st_slope,t_energy
lead_id,Unnamed: 1_level_1,1,1,1,1,1,1,1,1,1,...,12,12,12,12,12,12,12,12,12,12
0,0,0.165403,-1487.032658,42.153846,0.003317,11.827294,10227.841732,0.374914,362.750000,979.097871,...,-1525.377198,46.846154,-0.016007,10.582440,9493.335105,0.380427,362.750000,882.649794,1.566013,1981.515161
1,4,0.180723,-3999.382812,71.333333,-0.091575,8.076431,20535.170781,0.532731,332.000000,1033.509926,...,-8481.757693,51.266667,0.467205,1.031990,633.005789,0.303614,332.000000,359.459836,5.616999,16214.963795
2,5,0.200624,36.091650,50.375000,0.108335,3.502207,3550.596526,0.415250,299.066667,372.220884,...,501.389153,47.250000,0.086495,9.623550,8335.146910,0.483271,299.133333,995.452288,2.179332,101.716769
3,7,0.157446,-162.183585,50.307692,0.063845,7.744824,8720.964097,0.372824,381.083333,784.383705,...,1368.491775,50.692308,0.080108,9.501572,9486.827378,0.399670,381.083333,1056.452679,2.555984,827.486204
4,8,0.141358,1080.434961,45.166667,0.050815,12.549471,18074.595536,0.510459,424.454545,1006.832571,...,-105.566789,54.071429,0.019773,10.992872,14615.976746,0.604924,359.076923,960.467586,1.068786,1863.862564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21639,47511,0.116631,1785.341462,64.400000,0.364875,2.062872,-1548.806607,0.330842,514.444444,875.351027,...,-1236.556205,69.900000,0.024962,4.429886,1759.900692,0.332786,514.444444,920.249353,5.553257,15228.660033
21640,47515,0.149524,-493.991546,49.083333,0.026206,4.726638,2053.999423,0.348682,401.272727,650.805283,...,770.100233,54.333333,0.155896,3.792664,2997.687612,0.348682,401.272727,569.290774,2.871507,6151.539165
21641,47520,0.176248,-75.453028,46.266667,0.063902,9.228074,8703.798854,0.364247,340.428571,850.242574,...,-983.349033,49.666667,-0.000896,11.247536,9571.013065,0.347601,340.428571,1061.442247,2.798689,-10.529457
21642,47521,0.156658,-165.705194,87.307692,-0.014284,1.445051,393.744334,0.374172,383.000000,351.533843,...,-217.639142,53.500000,0.173418,1.948655,1698.935124,0.478024,353.538462,328.710587,2.256374,3109.561910


In [None]:
merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')

  merged_df = pd.merge(pivoted_df, target_df, on='patient_id', how='inner')


In [None]:
merged_df.dropna(inplace=True)

In [None]:
merged_df.columns

Index([       'patient_id',  ('patient_id', ''),   ('heart_rate', 1),
           ('p_energy', 1),  ('pq_distance', 1),     ('pr_ratio', 1),
           ('pr_slope', 1),   ('qrs_energy', 1),  ('qt_interval', 1),
        ('rr_distance', 1),
       ...
       ('pq_distance', 12),    ('pr_ratio', 12),    ('pr_slope', 12),
        ('qrs_energy', 12), ('qt_interval', 12), ('rr_distance', 12),
               ('rsq', 12),    ('st_slope', 12),    ('t_energy', 12),
                  'target'],
      dtype='object', length=135)

In [None]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

merged_df = merged_df.iloc[:,2:]

X = merged_df.drop(["target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,"(heart_rate, 1)","(p_energy, 1)","(pq_distance, 1)","(pr_ratio, 1)","(pr_slope, 1)","(qrs_energy, 1)","(qt_interval, 1)","(rr_distance, 1)","(rsq, 1)","(st_slope, 1)",...,"(p_energy, 12)","(pq_distance, 12)","(pr_ratio, 12)","(pr_slope, 12)","(qrs_energy, 12)","(qt_interval, 12)","(rr_distance, 12)","(rsq, 12)","(st_slope, 12)","(t_energy, 12)"
0,-0.103354,-0.440280,-0.555132,-0.085192,1.351302,0.931919,-0.355755,-0.090568,0.832510,-0.131154,...,-0.492108,-0.672732,-0.048255,0.207230,-0.154556,-0.371780,-0.090997,-0.044946,-0.078015,0.096086
1,1.049698,-0.624571,-0.864470,-0.124955,0.558535,-0.231710,-0.037746,-0.607749,0.461357,1.248138,...,-0.852259,-0.810495,-0.065161,2.023347,1.264425,0.003257,-0.574499,1.605917,0.693341,0.138787
2,-0.001339,-1.808980,-1.272582,-0.235125,2.889365,1.099287,-0.330450,-0.145596,1.576174,0.692785,...,-1.375041,-0.049494,-0.151608,1.025913,1.020806,-0.350759,-0.142441,0.914157,0.054003,0.652342
3,-0.609747,0.181029,0.230647,-0.082906,0.952843,1.061109,-0.861492,0.218875,0.995635,0.081856,...,-1.236213,0.020640,-0.117488,1.418489,1.160305,-0.877365,0.198296,1.477873,0.689271,1.469211
4,1.657451,0.019974,-1.004907,-0.110009,-1.066690,-1.126361,0.069685,-0.810237,-1.175406,0.034699,...,-0.180991,-1.580093,-0.055100,0.000136,-0.785650,0.087473,-0.765721,-0.256204,1.047664,-0.039778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18968,-0.487986,-2.306266,0.920441,-0.260700,1.159197,1.100662,-0.831183,0.138382,1.387909,0.308935,...,-1.720291,1.225673,-0.162726,0.849830,1.180161,-0.827856,0.122443,1.218783,-0.153584,0.509656
18969,-0.581744,1.515493,2.905875,0.036587,-0.491410,0.034045,-0.136428,0.199986,0.551880,-0.110610,...,0.716717,2.914632,0.048854,-0.930865,-0.586754,-0.214139,0.180637,-0.059178,0.076762,-0.257013
18970,0.313345,0.593608,0.464917,0.866656,-1.262946,-0.120590,1.474060,-0.302484,-1.073564,-0.424648,...,-0.358056,-0.281149,-0.056028,0.262766,0.097306,-0.332345,-0.289660,0.072027,0.127590,0.333238
18971,0.081897,-0.167467,0.870550,-0.084790,-0.233415,-0.190693,-0.030023,-0.188916,0.053499,0.152031,...,-0.460566,0.712418,-0.060735,0.203625,0.175092,-0.078156,-0.183487,0.651734,0.043874,0.275021


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(15178, 3795)

In [None]:
y.value_counts()

0    9547
1    9426
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Scores: [0.67094862 0.64163373 0.66501976 0.65700165 0.65667216]
Mean CV Accuracy: 0.6583


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6600790513833992
F1 Score: 0.6569148936170213
Precision: 0.6562167906482466
Recall: 0.6576144834930777


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.57444005 0.57378129 0.56785244 0.57166392 0.5815486 ]
Mean CV Accuracy: 0.5739


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5744400527009222
F1 Score: 0.5712768781523759
Precision: 0.5630559916274201
Recall: 0.5797413793103449


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

Cross-Validation Scores: [0.66600791 0.65316206 0.65349144 0.64481054 0.65634267]
Mean CV Accuracy: 0.6548


In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6484848484848484
F1 Score: 0.6423592493297587
Precision: 0.6392742796157951
Recall: 0.6454741379310345


## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

Cross-Validation Scores: [0.66699605 0.65283267 0.64657444 0.63789127 0.64942339]
Mean CV Accuracy: 0.6507


In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6540184453227932
F1 Score: 0.6456140350877192
Precision: 0.6468361276365603
Recall: 0.6443965517241379


## SVM

In [None]:
svm_model = SVC()

k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

Cross-Validation Scores: [0.5326087  0.53557312 0.57509881 0.56573311 0.60658979]
Mean CV Accuracy: 0.5631
Naive Bayes Accuracy: 0.5470355731225296
Naive Bayes F1 Score: 0.676453980801807
Naive Bayes Precision: 0.5231441048034935
Naive Bayes Recall: 0.9568690095846646


## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

Cross-Validation Scores: [0.60111989 0.59782609 0.61791831 0.60032949 0.60724876]
Mean CV Accuracy: 0.6049
KNN Accuracy: 0.6076416337285903
KNN F1 Score: 0.6057717765422292
KNN Precision: 0.6024223275408109
KNN Recall: 0.6091586794462194


# Models on all data using Standard Normalization

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.drop("Unnamed: 0", axis=1, inplace=True)
merged_df.dropna(inplace=True)

In [None]:
# merged_df = merged_df[merged_df["lead_id"] == 1]
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

X = merged_df.drop(["patient_id", "lead_id", "target"], axis=1)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate
0,0.033478,-0.403574,-2.062318,-2.022480,-1.081605,2.942311,-0.156134,-0.719880,0.742807,0.486332,0.577951
1,-0.020684,-0.262673,0.204174,-0.456211,-1.713042,-0.179232,-0.125740,2.109357,0.483752,0.610819,0.227760
2,-0.012659,-0.160377,0.416775,-0.507302,1.404750,0.278695,-0.601723,-0.616834,-0.235472,-0.824997,0.003361
3,-0.027192,-0.588955,-0.561479,-0.639279,-0.765597,0.702212,-0.302789,3.113318,1.775446,1.628513,1.130177
4,-0.029059,-0.456642,-0.468998,1.241088,-0.598931,-0.144646,2.150946,2.816421,0.363699,2.713107,0.724251
...,...,...,...,...,...,...,...,...,...,...,...
249443,-0.029697,-0.360951,-0.311076,-0.013348,-0.388721,0.383035,-0.283535,-0.015823,-0.100914,-0.457656,0.466453
249444,-0.032939,0.004905,-0.384794,1.010542,0.504670,-0.132976,0.953083,0.400180,-0.332144,0.935973,-0.315836
249445,-0.024524,0.208624,0.128317,-0.380595,0.140306,-0.083563,-0.475478,2.181970,1.703293,1.349422,-0.649787
249446,-0.020091,-0.385957,0.264883,-0.731115,0.427054,0.158807,-0.679581,-0.517953,-0.187578,-0.805119,0.531242


## Training and Testing on Complete Data

In [None]:
y = merged_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

(199558, 49890)

In [None]:
y.value_counts()

0    125209
1    124239
Name: target, dtype: int64

## Logistic Regression

In [None]:
logress_model = LogisticRegression()

k_fold = 5
cv_scores = cross_val_score(logress_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

logress_model.fit(X_train, y_train)

log1_pred = logress_model.predict(X_test)

Cross-Validation Scores: [0.52948988 0.52540589 0.53344859 0.53363734 0.52652151]
Mean CV Accuracy: 0.5297


In [None]:
accuracy = accuracy_score(y_test, log1_pred)
f1 = f1_score(y_test, log1_pred)
precision = precision_score(y_test, log1_pred)
recall = recall_score(y_test, log1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5305472038484667
F1 Score: 0.4933918799074215
Precision: 0.5344674070949904
Recall: 0.4581793347260164


## Decision Trees

In [None]:
decision_tree_model = DecisionTreeClassifier()

k_fold = 5
cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

decision_tree_model.fit(X_train, y_train)

dt1_pred = decision_tree_model.predict(X_test)

Cross-Validation Scores: [0.53026659 0.52643315 0.5298657  0.52273809 0.52574478]
Mean CV Accuracy: 0.5270


In [None]:
accuracy = accuracy_score(y_test, dt1_pred)
f1 = f1_score(y_test, dt1_pred)
precision = precision_score(y_test, dt1_pred)
recall = recall_score(y_test, dt1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5309280416917218
F1 Score: 0.5286416371253626
Precision: 0.5300937146550331
Recall: 0.5271974931704966


## Random Forest

In [None]:
rf_model = RandomForestClassifier()

k_fold = 5
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

rf_model.fit(X_train, y_train)

rf1_pred = rf_model.predict(X_test)

KeyboardInterrupt: ignored

In [None]:
accuracy = accuracy_score(y_test, rf1_pred)
f1 = f1_score(y_test, rf1_pred)
precision = precision_score(y_test, rf1_pred)
recall = recall_score(y_test, rf1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Xgboost

In [None]:
xgboost_model = xgb.XGBClassifier(subsample=0.8, max_depth= 6, learning_rate= 0.2, colsample_bytree= 1.0)

k_fold = 5
cv_scores = cross_val_score(xgboost_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

xgboost_model.fit(X_train, y_train)

xgb1_pred = xgboost_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, xgb1_pred)
f1 = f1_score(y_test, xgb1_pred)
precision = precision_score(y_test, xgb1_pred)
recall = recall_score(y_test, xgb1_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## SVM

In [None]:
svm_model = SVC()
k_fold = 5
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

## Naive Bayes

In [None]:
naive_bayes_model = GaussianNB()

k_fold = 5
cv_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

naive_bayes_model.fit(X_train, y_train)

# Make predictions
naive_bayes_pred = naive_bayes_model.predict(X_test)

# Calculate metrics
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)
naive_bayes_f1 = f1_score(y_test, naive_bayes_pred)
naive_bayes_precision = precision_score(y_test, naive_bayes_pred)
naive_bayes_recall = recall_score(y_test, naive_bayes_pred)

# Print the results for Naive Bayes
print("Naive Bayes Accuracy:", naive_bayes_accuracy)
print("Naive Bayes F1 Score:", naive_bayes_f1)
print("Naive Bayes Precision:", naive_bayes_precision)
print("Naive Bayes Recall:", naive_bayes_recall)

## KNN

In [None]:
knn_model = KNeighborsClassifier()

k_fold = 5
cv_scores = cross_val_score(knn_model, X_train, y_train, cv=k_fold, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f}')

knn_model.fit(X_train, y_train)

# Make predictions
knn_pred = knn_model.predict(X_test)

# Calculate metrics
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)

# Print the results for K-nearest Neighbors
print("KNN Accuracy:", knn_accuracy)
print("KNN F1 Score:", knn_f1)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)

# ANN

In [None]:
import tensorflow as tf

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/DIS_Dr_Liu/Datasets/downsampled_feature_labels.csv")
merged_df.dropna(inplace=True)
merged_df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,lead_id,pr_ratio,rr_distance,p_energy,t_energy,pq_distance,qt_interval,st_slope,pr_slope,qrs_energy,rsq,heart_rate,target
0,0,0,1,0.003317,362.75,-1487.032658,4866.090041,42.153846,0.374914,2.724239,11.827294,10227.841732,979.097871,0.165403,0
1,1,0,2,-0.029219,362.75,-1827.990737,2519.254928,46.230769,0.379791,2.0255,9.05695,7096.289801,774.923781,0.165403,0
2,2,0,3,0.287826,544.0,689.017622,2233.798276,70.777778,0.141953,0.136529,0.495643,-1191.790501,214.415848,0.110294,0
3,3,0,4,0.493832,362.833333,2528.446025,3658.739131,56.615385,0.19759,22.108417,0.979721,-3155.604773,705.472184,0.165365,0
4,4,0,5,0.033344,362.75,-567.678786,3634.570304,38.153846,0.373217,2.044115,7.386252,6170.779988,609.334444,0.165403,0


In [None]:
X = merged_df.drop(["Unnamed: 0", "patient_id", "lead_id", "target"], axis=1)
y = merged_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(200, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [None]:
# Compile the model with categorical crossentropy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Convert labels to one-hot encoding for categorical crossentropy
y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=2)

# Train the model
model.fit(X_train, y_train_one_hot, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
# Make predictions on the test set
y_pred_probabilities = model.predict(X_test)
y_pred_binary = np.argmax(y_pred_probabilities, axis=1)  # Convert probabilities to binary predictions



In [None]:
print("Testing Performance")
accuracy = accuracy_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy on test set: 0.5027460412908399
