In [1]:
from sklearn.neighbors import KNeighborsClassifier
import os

# Move into the project root before the local import and the relative
# './dataset/...' and './results/...' paths used further down.
# The root can be overridden with the CS598DLH_ROOT environment variable so the
# notebook is not tied to one machine; when unset, the original path is used.
os.chdir(os.environ.get('CS598DLH_ROOT', '/Users/shreygupta/Documents/Classes/CS598DLH/'))

# Imported after the chdir above (presumably so the module is found in the
# project root -- confirm before reordering).
from feature_generation import FeatureGeneration
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import csv
import numpy as np
import pandas as pd
import collections
from imblearn.over_sampling import SMOTE

In [2]:
class KNN:
    """A KNeighborsClassifier bundled with the train/test split it is run on."""

    def __init__(self, x_train, y_train, x_test, y_test, n):
        """Create an unfitted classifier with ``n`` neighbours and keep the split.

        Args:
            x_train: features passed to ``fit``.
            y_train: labels passed to ``fit``.
            x_test: features passed to ``predict``.
            y_test: reference labels the predictions are scored against.
            n: value forwarded as ``n_neighbors``.
        """
        self.knn = KNeighborsClassifier(n_neighbors=n)
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test

    def train(self):
        """Fit the classifier on the stored training split."""
        self.knn.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        """Predict on the stored test split and score the predictions.

        Returns:
            A ``(macro_f1, micro_f1)`` tuple.
        """
        predictions = self.knn.predict(self.x_test)
        macro_score = f1_score(self.y_test, predictions, average='macro')
        micro_score = f1_score(self.y_test, predictions, average='micro')
        return macro_score, micro_score

In [3]:
# Label columns evaluated one at a time by the cells below.
morbidities = [
    'Asthma', 'CAD', 'CHF', 'Depression',
    'Diabetes', 'Gallstones', 'GERD', 'Gout',
    'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA',
    'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency',
]

# Header row shared by every results CSV written in this notebook.
column_headings = [
    "Morbidity Class",
    "KNN1_Macro F1",
    "KNN1_Micro F1",
    "KNN5_Macro F1",
    "KNN5_Micro F1",
]

# Opening in "w" mode truncates any previous results, leaving just the header.
with open("./results/word-embedding/performance_KNN_W2V.csv", "w", newline="") as header_file:
    csv.writer(header_file).writerow(column_headings)

In [4]:
# Evaluate 1-NN and 5-NN on averaged Word2Vec features, one binary task per
# morbidity, and append the scores to performance_KNN_W2V.csv.

# Per-morbidity scores, accumulated for the final "Average" row.
all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

# The file is the same for every morbidity, so read it once instead of once
# per loop iteration.
train_all_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_all_df[train_all_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).word2vec()
    # Average along axis 1 (presumably the token axis -- TODO confirm against
    # FeatureGeneration) so each sample has one fewer dimension.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Fewer than two classes present: cross-validation is skipped and every
        # score is hard-coded to 1.
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    else:
        # Second print of the name marks that the cross-validation branch ran.
        print(morbidity)
        # NOTE(review): min(1, len(X)-1) is 1 whenever there are at least two
        # samples -- confirm that a single SMOTE neighbour is intended.
        # NOTE(review): oversampling happens before the fold split, so synthetic
        # points can end up in a different fold than the samples they were
        # interpolated from; scores are likely optimistic. Consider resampling
        # each training fold only.
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # Shuffled (non-stratified) 10-fold cross-validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        # n_neighbors -> (macro F1 per fold, micro F1 per fold)
        fold_scores = {1: ([], []), 5: ([], [])}
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train and score KNN on the Word2Vec representation of this fold.
            for n_neighbors, (macro_scores, micro_scores) in fold_scores.items():
                knn_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, n_neighbors)
                knn_obj.train()
                fold_macro, fold_micro = knn_obj.test_and_evaluate()
                macro_scores.append(fold_macro)
                micro_scores.append(fold_micro)

        f1_macro1 = np.mean(fold_scores[1][0])
        f1_micro1 = np.mean(fold_scores[1][1])
        f1_macro5 = np.mean(fold_scores[5][0])
        f1_micro5 = np.mean(fold_scores[5][1])

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)

    # Append this morbidity's row immediately so partial results survive an
    # interrupted run.
    with open("./results/word-embedding/performance_KNN_W2V.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro1, f1_micro1, f1_macro5, f1_micro5])

# Final row: unweighted mean of each score column over all morbidities.
with open("./results/word-embedding/performance_KNN_W2V.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores)
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.8379080847772797 and Micro F1 Score 0.8447619047619048
For n=5, Macro F1 score: 0.7799212625180119 and Micro F1 Score 0.7876190476190476
CAD
CAD
For n=1, Macro F1 score: 0.5876587301587302 and Micro F1 Score 0.5964285714285714
For n=5, Macro F1 score: 0.5848015873015874 and Micro F1 Score 0.5982142857142858
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.8048914230764076 and Micro F1 Score 0.8186813186813188
For n=5, Macro F1 score: 0.8034926034577736 and Micro F1 Score 0.8192307692307692
Diabetes
Diabetes
For n=1, Macro F1 score: 0.4117099567099567 and Micro F1 Score 0.4333333333333333
For n=5, Macro F1 score: 0.501991341991342 and Micro F1 Score 0.5277777777777778
Gallstones
Gallstones
For n=1, Macro F1 score: 0.8553228307640073 and Micro F1 Score 0.8587912087912087
For n=5, Macro F1 score: 0.7165867139396551 and Micro F1 Score 0.73131868131868

In [5]:
# Evaluate 1-NN and 5-NN on averaged GloVe features, one binary task per
# morbidity, and write the scores to performance_KNN_Glove.csv.

# Opening in "w" mode truncates any previous results, leaving just the header.
with open("./results/word-embedding/performance_KNN_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# Reset ALL four accumulators. Previously only the n=5 lists were reset, so the
# n=1 lists still held the scores appended by the preceding embedding's cell
# and the "Average" row mixed results from different embeddings.
all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

# The file is the same for every morbidity, so read it once instead of once
# per loop iteration.
train_all_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_all_df[train_all_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).gloVe()
    # Average along axis 1 (presumably the token axis -- TODO confirm against
    # FeatureGeneration) so each sample has one fewer dimension.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Fewer than two classes present: cross-validation is skipped and every
        # score is hard-coded to 1.
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    else:
        # Second print of the name marks that the cross-validation branch ran.
        print(morbidity)
        # NOTE(review): min(1, len(X)-1) is 1 whenever there are at least two
        # samples -- confirm that a single SMOTE neighbour is intended.
        # NOTE(review): oversampling happens before the fold split, so synthetic
        # points can end up in a different fold than the samples they were
        # interpolated from; scores are likely optimistic. Consider resampling
        # each training fold only.
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # Shuffled (non-stratified) 10-fold cross-validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        # n_neighbors -> (macro F1 per fold, micro F1 per fold)
        fold_scores = {1: ([], []), 5: ([], [])}
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train and score KNN on the GloVe representation of this fold.
            for n_neighbors, (macro_scores, micro_scores) in fold_scores.items():
                knn_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, n_neighbors)
                knn_obj.train()
                fold_macro, fold_micro = knn_obj.test_and_evaluate()
                macro_scores.append(fold_macro)
                micro_scores.append(fold_micro)

        f1_macro1 = np.mean(fold_scores[1][0])
        f1_micro1 = np.mean(fold_scores[1][1])
        f1_macro5 = np.mean(fold_scores[5][0])
        f1_micro5 = np.mean(fold_scores[5][1])

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)

    # Append this morbidity's row immediately so partial results survive an
    # interrupted run.
    with open("./results/word-embedding/performance_KNN_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro1, f1_micro1, f1_macro5, f1_micro5])

# Final row: unweighted mean of each score column over all morbidities.
with open("./results/word-embedding/performance_KNN_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores)
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.8972188024393908 and Micro F1 Score 0.9028571428571428
For n=5, Macro F1 score: 0.8071921727223585 and Micro F1 Score 0.8252380952380951
CAD
CAD
For n=1, Macro F1 score: 0.7259271284271284 and Micro F1 Score 0.7357142857142857
For n=5, Macro F1 score: 0.7119841269841269 and Micro F1 Score 0.7196428571428571
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.8254166666666667 and Micro F1 Score 0.8406593406593407
For n=5, Macro F1 score: 0.7677587853323147 and Micro F1 Score 0.7895604395604396
Diabetes
Diabetes
For n=1, Macro F1 score: 0.6117424242424242 and Micro F1 Score 0.6263888888888889
For n=5, Macro F1 score: 0.653961038961039 and Micro F1 Score 0.6597222222222222
Gallstones
Gallstones
For n=1, Macro F1 score: 0.8460410422910423 and Micro F1 Score 0.8582417582417582
For n=5, Macro F1 score: 0.7096711458476165 and Micro F1 Score 0.75329670329670

In [6]:
# Evaluate 1-NN and 5-NN on averaged FastText features, one binary task per
# morbidity, and write the scores to performance_KNN_FastText.csv.

# Opening in "w" mode truncates any previous results, leaving just the header.
with open("./results/word-embedding/performance_KNN_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# Reset ALL four accumulators. Previously only the n=5 lists were reset, so the
# n=1 lists still held the scores appended by the preceding embeddings' cells
# and the "Average" row mixed results from different embeddings.
all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

# The file is the same for every morbidity, so read it once instead of once
# per loop iteration.
train_all_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_all_df[train_all_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).fastText()
    # Average along axis 1 (presumably the token axis -- TODO confirm against
    # FeatureGeneration) so each sample has one fewer dimension.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Fewer than two classes present: cross-validation is skipped and every
        # score is hard-coded to 1.
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    else:
        # Second print of the name marks that the cross-validation branch ran.
        print(morbidity)
        # NOTE(review): min(1, len(X)-1) is 1 whenever there are at least two
        # samples -- confirm that a single SMOTE neighbour is intended.
        # NOTE(review): oversampling happens before the fold split, so synthetic
        # points can end up in a different fold than the samples they were
        # interpolated from; scores are likely optimistic. Consider resampling
        # each training fold only.
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # Shuffled (non-stratified) 10-fold cross-validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        # n_neighbors -> (macro F1 per fold, micro F1 per fold)
        fold_scores = {1: ([], []), 5: ([], [])}
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train and score KNN on the FastText representation of this fold.
            for n_neighbors, (macro_scores, micro_scores) in fold_scores.items():
                knn_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, n_neighbors)
                knn_obj.train()
                fold_macro, fold_micro = knn_obj.test_and_evaluate()
                macro_scores.append(fold_macro)
                micro_scores.append(fold_micro)

        f1_macro1 = np.mean(fold_scores[1][0])
        f1_micro1 = np.mean(fold_scores[1][1])
        f1_macro5 = np.mean(fold_scores[5][0])
        f1_micro5 = np.mean(fold_scores[5][1])

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)

    # Append this morbidity's row immediately so partial results survive an
    # interrupted run.
    with open("./results/word-embedding/performance_KNN_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro1, f1_micro1, f1_macro5, f1_micro5])

# Final row: unweighted mean of each score column over all morbidities.
with open("./results/word-embedding/performance_KNN_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores)
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.8130639346893215 and Micro F1 Score 0.8238095238095238
For n=5, Macro F1 score: 0.7378229360079205 and Micro F1 Score 0.7523809523809524
CAD
CAD
For n=1, Macro F1 score: 0.5658874458874459 and Micro F1 Score 0.5821428571428571
For n=5, Macro F1 score: 0.6610317460317461 and Micro F1 Score 0.675
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.7926565664297862 and Micro F1 Score 0.8038461538461539
For n=5, Macro F1 score: 0.7487416015357191 and Micro F1 Score 0.7664835164835166
Diabetes
Diabetes
For n=1, Macro F1 score: 0.5695093795093795 and Micro F1 Score 0.5847222222222223
For n=5, Macro F1 score: 0.4702092352092352 and Micro F1 Score 0.5013888888888889
Gallstones
Gallstones
For n=1, Macro F1 score: 0.8314054001554002 and Micro F1 Score 0.8516483516483516
For n=5, Macro F1 score: 0.8256996271702153 and Micro F1 Score 0.8445054945054945
GERD
GERD

In [7]:
# Evaluate 1-NN and 5-NN on averaged Universal Sentence Encoder features, one
# binary task per morbidity, and write the scores to performance_KNN_USE.csv.

# Opening in "w" mode truncates any previous results, leaving just the header.
with open("./results/word-embedding/performance_KNN_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# Reset ALL four accumulators. Previously only the n=5 lists were reset, so the
# n=1 lists still held the scores appended by the preceding embeddings' cells
# and the "Average" row mixed results from different embeddings.
all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

# The file is the same for every morbidity, so read it once instead of once
# per loop iteration.
train_all_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_all_df[train_all_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).universal_sentence_encoder()
    # Average along axis 1 (NOTE(review): confirm what axis 1 represents for
    # the sentence-encoder output) so each sample has one fewer dimension.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Fewer than two classes present: cross-validation is skipped and every
        # score is hard-coded to 1.
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    else:
        # Second print of the name marks that the cross-validation branch ran.
        print(morbidity)
        # NOTE(review): min(1, len(X)-1) is 1 whenever there are at least two
        # samples -- confirm that a single SMOTE neighbour is intended.
        # NOTE(review): oversampling happens before the fold split, so synthetic
        # points can end up in a different fold than the samples they were
        # interpolated from; scores are likely optimistic. Consider resampling
        # each training fold only.
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # Shuffled (non-stratified) 10-fold cross-validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        # n_neighbors -> (macro F1 per fold, micro F1 per fold)
        fold_scores = {1: ([], []), 5: ([], [])}
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train and score KNN on the sentence-encoder representation of this fold.
            for n_neighbors, (macro_scores, micro_scores) in fold_scores.items():
                knn_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, n_neighbors)
                knn_obj.train()
                fold_macro, fold_micro = knn_obj.test_and_evaluate()
                macro_scores.append(fold_macro)
                micro_scores.append(fold_micro)

        f1_macro1 = np.mean(fold_scores[1][0])
        f1_micro1 = np.mean(fold_scores[1][1])
        f1_macro5 = np.mean(fold_scores[5][0])
        f1_micro5 = np.mean(fold_scores[5][1])

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)

    # Append this morbidity's row immediately so partial results survive an
    # interrupted run.
    with open("./results/word-embedding/performance_KNN_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro1, f1_micro1, f1_macro5, f1_micro5])

# Final row: unweighted mean of each score column over all morbidities.
with open("./results/word-embedding/performance_KNN_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores)
                ])
    writer.writerow(row)

Asthma
Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-05-06 18:30:14.924538: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Asthma
For n=1, Macro F1 score: 0.8889355600539812 and Micro F1 Score 0.8952380952380954
For n=5, Macro F1 score: 0.7992821123104472 and Micro F1 Score 0.8314285714285715
CAD
CAD
For n=1, Macro F1 score: 0.7121212121212122 and Micro F1 Score 0.7321428571428571
For n=5, Macro F1 score: 0.7443650793650795 and Micro F1 Score 0.7535714285714286
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.8621091490209137 and Micro F1 Score 0.8714285714285716
For n=5, Macro F1 score: 0.6946850535085829 and Micro F1 Score 0.7274725274725274
Diabetes
Diabetes
For n=1, Macro F1 score: 0.6966253191253191 and Micro F1 Score 0.7097222222222223
For n=5, Macro F1 score: 0.6520329670329671 and Micro F1 Score 0.6625
Gallstones
Gallstones
For n=1, Macro F1 score: 0.7769029581529582 and Micro F1 Score 0.8131868131868132
For n=5, Macro F1 score: 0.7304315713526239 and Micro F1 Score 0.7681318681318681
GERD
GERD
For n