In [1]:
from sklearn.neighbors import KNeighborsClassifier
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import csv
import numpy as np
import pandas as pd

In [2]:
class KNN:
    def __init__(self, x_train, y_train, x_test, y_test, n):
        self.knn = KNeighborsClassifier(n_neighbors=n)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def train(self):
        self.knn.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.knn.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        return f1_macro, f1_micro

In [3]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']
column_headings = ["Morbidity Class", "KNN1_Macro F1", "KNN1_Micro F1", "KNN5_Macro F1", "KNN5_Micro F1"]

with open("./results/word-embeddings-features/performance_KNN_W2V.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

In [4]:
all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).word2vec_matrix_gen()
    X = np.average(X, axis=1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list1 = []
    f1_micro_list1 = []
    f1_macro_list5 = []
    f1_micro_list5 = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training KNN using TF-IDF Representation
        knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1)
        knn1_obj.train()

        f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

        f1_macro_list1.append(f1_macro1)
        f1_micro_list1.append(f1_micro1)

        knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5)
        knn5_obj.train()

        f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

        f1_macro_list5.append(f1_macro5)
        f1_micro_list5.append(f1_micro5)

    f1_macro1 = np.mean(f1_macro_list1)
    f1_micro1 = np.mean(f1_micro_list1)
    f1_macro5 = np.mean(f1_macro_list5)
    f1_micro5 = np.mean(f1_micro_list5)

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/word-embeddings-features/performance_KNN_W2V.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_KNN_W2V.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
For n=1, Macro F1 score: 0.8332750582750583 and Micro F1 Score 0.9404761904761905
For n=5, Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
For n=1, Macro F1 score: 0.47404761904761905 and Micro F1 Score 0.5
For n=5, Macro F1 score: 0.5053174603174604 and Micro F1 Score 0.55
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
For n=1, Macro F1 score: 1.0 and Micro F1 Score 1.0
For n=5, Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(66, 495, 300) (66,) Counter({0.0: 58, 1.0: 8})
For n=1, Macro F1 score: 0.7015501165501166 and Micro F1 Score 0.8666666666666666
For n=5, Macro F1 score: 0.7444289044289044 and Micro F1 Score 0.8976190476190474
Diabetes
(63, 495, 300) (63,) Counter({1.0: 35, 0.0: 28})
For n=1, Macro F1 score: 0.5213095238095238 and Micro F1 Score 0.5428571428571429
For n=5, Macro F1 score: 0.4730952380952381 and Micro F1 Score 0.5214285714285716


In [5]:
with open("./results/word-embeddings-features/performance_KNN_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).glove_matrix_gen()
    X = np.average(X, axis=1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list1 = []
    f1_micro_list1 = []
    f1_macro_list5 = []
    f1_micro_list5 = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training KNN using TF-IDF Representation
        knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1)
        knn1_obj.train()

        f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

        f1_macro_list1.append(f1_macro1)
        f1_micro_list1.append(f1_micro1)

        knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5)
        knn5_obj.train()

        f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

        f1_macro_list5.append(f1_macro5)
        f1_micro_list5.append(f1_micro5)

    f1_macro1 = np.mean(f1_macro_list1)
    f1_micro1 = np.mean(f1_micro_list1)
    f1_macro5 = np.mean(f1_macro_list5)
    f1_micro5 = np.mean(f1_micro_list5)

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/word-embeddings-features/performance_KNN_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_KNN_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
For n=1, Macro F1 score: 0.6755244755244754 and Micro F1 Score 0.9095238095238095
For n=5, Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
For n=1, Macro F1 score: 0.6946031746031744 and Micro F1 Score 0.7238095238095237
For n=5, Macro F1 score: 0.7132142857142856 and Micro F1 Score 0.7547619047619047
CHF
For n=1, Macro F1 score: 1.0 and Micro F1 Score 1.0
For n=5, Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
For n=1, Macro F1 score: 0.5121639471639471 and Micro F1 Score 0.7285714285714285
For n=5, Macro F1 score: 0.661095571095571 and Micro F1 Score 0.8642857142857142
Diabetes
For n=1, Macro F1 score: 0.511547619047619 and Micro F1 Score 0.5333333333333334
For n=5, Macro F1 score: 0.4453968253968254 and Micro F1 Score 0.46904761904761905
Gallstones
For n=1, Macro F1 score: 0.5452447552447552 and Micro F1 Score 0.7333333333333333
For n=5, Macro F1 score: 0.571037296037296 and Micro F1 Score 0.8023809523809522
GERD
For n=1, Macro F1 score: 0.6

In [6]:
with open("./results/word-embeddings-features/performance_KNN_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).fasttext_matrix_gen()
    X = np.average(X, axis=1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list1 = []
    f1_micro_list1 = []
    f1_macro_list5 = []
    f1_micro_list5 = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training KNN using TF-IDF Representation
        knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1)
        knn1_obj.train()

        f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

        f1_macro_list1.append(f1_macro1)
        f1_micro_list1.append(f1_micro1)

        knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5)
        knn5_obj.train()

        f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

        f1_macro_list5.append(f1_macro5)
        f1_micro_list5.append(f1_micro5)

    f1_macro1 = np.mean(f1_macro_list1)
    f1_micro1 = np.mean(f1_micro_list1)
    f1_macro5 = np.mean(f1_macro_list5)
    f1_micro5 = np.mean(f1_micro_list5)

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/word-embeddings-features/performance_KNN_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_KNN_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
For n=1, Macro F1 score: 0.7248834498834499 and Micro F1 Score 0.9095238095238096
For n=5, Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
For n=1, Macro F1 score: 0.6869444444444445 and Micro F1 Score 0.7095238095238094
For n=5, Macro F1 score: 0.530595238095238 and Micro F1 Score 0.5666666666666667
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
For n=1, Macro F1 score: 1.0 and Micro F1 Score 1.0
For n=5, Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(66, 495, 300) (66,) Counter({0.0: 58, 1.0: 8})
For n=1, Macro F1 score: 0.7404817404817404 and Micro F1 Score 0.9095238095238095
For n=5, Macro F1 score: 0.636037296037296 and Micro F1 Score 0.8666666666666668
Diabetes
(63, 495, 300) (63,) Counter({1.0: 35, 0.0: 28})
For n=1, Macro F1 score: 0.41234126984126984 and Micro F1 Score 0.43809523809523815
For n=5, Macro F1 score: 0.48880952380952375 and Micro

In [7]:
with open("./results/word-embeddings-features/performance_KNN_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).use_matrix_gen()
    X = np.average(X, axis=1)
    X = X.reshape(-1, 1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list1 = []
    f1_micro_list1 = []
    f1_macro_list5 = []
    f1_micro_list5 = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training KNN using TF-IDF Representation
        knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1)
        knn1_obj.train()

        f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

        f1_macro_list1.append(f1_macro1)
        f1_micro_list1.append(f1_micro1)

        knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5)
        knn5_obj.train()

        f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

        f1_macro_list5.append(f1_macro5)
        f1_micro_list5.append(f1_micro5)

    f1_macro1 = np.mean(f1_macro_list1)
    f1_micro1 = np.mean(f1_micro_list1)
    f1_macro5 = np.mean(f1_macro_list5)
    f1_micro5 = np.mean(f1_micro_list5)

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/word-embeddings-features/performance_KNN_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_KNN_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma


2023-05-01 14:09:26.499097: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


(66, 512) (66,) Counter({0.0: 63, 1.0: 3})
For n=1, Macro F1 score: 0.6209790209790209 and Micro F1 Score 0.8928571428571427
For n=5, Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(62, 512) (62,) Counter({0.0: 35, 1.0: 27})
For n=1, Macro F1 score: 0.6442857142857142 and Micro F1 Score 0.6619047619047618
For n=5, Macro F1 score: 0.5983080808080808 and Micro F1 Score 0.6333333333333334
CHF
(12, 512) (12,) Counter({1.0: 12})
For n=1, Macro F1 score: 1.0 and Micro F1 Score 1.0
For n=5, Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(66, 512) (66,) Counter({0.0: 58, 1.0: 8})
For n=1, Macro F1 score: 0.6224592074592075 and Micro F1 Score 0.8476190476190476
For n=5, Macro F1 score: 0.6075174825174824 and Micro F1 Score 0.85
Diabetes
(63, 512) (63,) Counter({1.0: 35, 0.0: 28})
For n=1, Macro F1 score: 0.5700793650793651 and Micro F1 Score 0.588095238095238
For n=5, Macro F1 score: 0.5931926406926407 and Micro F1 Score 0.6571428571428571
Gallstones
(66, 512