In [3]:
from sklearn import svm
import os

os.chdir('/Users/shreygupta/Documents/Classes/CS598DLH/')
os.getcwd()

from feature_generation import FeatureGeneration
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import pandas as pd
import csv
import numpy as np
import collections
from imblearn.over_sampling import SMOTE



In [4]:
class SVM:
    def __init__(self, x_train, y_train, x_test, y_test):
        self.svm = svm.SVC(kernel='linear')
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
    def train(self):
        
        self.svm.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.svm.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')

        return f1_macro, f1_micro

In [5]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency']

In [6]:
column_headings = ["Morbidity Class", "SVM_Macro F1", "SVM_Micro F1"]

with open("./results/word-embedding/performance_SVM_W2V.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).word2vec()
    X = np.average(X, axis=1)
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/word-embedding/performance_SVM_W2V.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_SVM_W2V.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

Asthma
Macro F1 score: 0.6962666912550813 and Micro F1 Score 0.7166666666666668
CAD
Macro F1 score: 0.3704978354978355 and Micro F1 Score 0.4196428571428571
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.6060402097902099 and Micro F1 Score 0.6142857142857144
Diabetes
Macro F1 score: 0.3147752247752248 and Micro F1 Score 0.40694444444444444
Gallstones
Macro F1 score: 0.5197640921170332 and Micro F1 Score 0.5285714285714286
GERD
Macro F1 score: 0.6059426684426684 and Micro F1 Score 0.6272727272727273
Gout
Macro F1 score: 0.6751655108737152 and Micro F1 Score 0.6885714285714285
Hypercholesterolemia
Macro F1 score: 0.3299045399045399 and Micro F1 Score 0.38
Hypertension
Macro F1 score: 0.515017205017205 and Micro F1 Score 0.53
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.3806807571513454 and Micro F1 Score 0.43910256410256404
Obesity
Macro F1 score: 0.44934343434343427 and Micro F1 Score 0.4708333333333333
OSA
Macro F1 score: 0.33709736685742875 and Micr

In [7]:
with open("./results/word-embedding/performance_SVM_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).gloVe()

    X = np.abs(np.average(X, axis=1))

    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_SVM_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_SVM_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
Macro F1 score: 0.919940296631473 and Micro F1 Score 0.9223809523809525
CAD
CAD
Macro F1 score: 0.7361255411255411 and Micro F1 Score 0.7446428571428572
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
Macro F1 score: 0.7992169431875314 and Micro F1 Score 0.8115384615384615
Diabetes
Diabetes
Macro F1 score: 0.6073160173160174 and Micro F1 Score 0.6152777777777778
Gallstones
Gallstones
Macro F1 score: 0.9183071095571096 and Micro F1 Score 0.9258241758241759
GERD
GERD
Macro F1 score: 0.8050100389806272 and Micro F1 Score 0.8181818181818181
Gout
Gout
Macro F1 score: 0.9932126696832579 and Micro F1 Score 0.9933333333333334
Hypercholesterolemia
Hypercholesterolemia
Macro F1 score: 0.7294354256854256 and Micro F1 Score 0.75
Hypertension
Hypertension
Macro F1 score: 0.7496844821844822 and Micro F1 Score 0.7566666666666667
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
OA
Macro F1 score: 0.8503155740887939 and Micro F1 Score 0.8589743589743589
Obes

In [8]:
with open("./results/word-embedding/performance_SVM_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).fastText()

    X = np.abs(np.average(X, axis=1))

    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        # print(morbidity)
        # smote = SMOTE(random_state=42,k_neighbors=2)
        # X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        # X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_SVM_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_SVM_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Macro F1 score: 0.6794871794871795 and Micro F1 Score 0.9232142857142858
CAD
Macro F1 score: 0.3487049062049062 and Micro F1 Score 0.4375
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.5634981684981686 and Micro F1 Score 0.8678571428571427
Diabetes
Macro F1 score: 0.3614119214119214 and Micro F1 Score 0.5821428571428571
Gallstones
Macro F1 score: 0.6140476190476191 and Micro F1 Score 0.8714285714285713
GERD
Macro F1 score: 0.6012470862470861 and Micro F1 Score 0.8333333333333333
Gout
Macro F1 score: 0.729010989010989 and Micro F1 Score 0.9232142857142858
Hypercholesterolemia
Macro F1 score: 0.5157575757575759 and Micro F1 Score 0.7238095238095239
Hypertension
Macro F1 score: 0.5227700077700077 and Micro F1 Score 0.7595238095238095
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.5513553113553114 and Micro F1 Score 0.8339285714285716
Obesity
Macro F1 score: 0.36040792540792543 and Micro F1 Score 0.5678571428571428
OSA
Macro

In [9]:
with open("./results/word-embedding/performance_SVM_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).universal_sentence_encoder()

    X = np.abs(np.average(X, axis=1))
    #X = X.reshape(-1, 1)
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        # print(morbidity)
        # smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        # X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        # X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_SVM_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_SVM_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-05-06 18:43:42.111695: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Macro F1 score: 0.6789743589743591 and Micro F1 Score 0.9214285714285715
CAD
Macro F1 score: 0.7528751803751804 and Micro F1 Score 0.7678571428571428
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.5880219780219781 and Micro F1 Score 0.8410714285714285
Diabetes
Macro F1 score: 0.6622061272061273 and Micro F1 Score 0.7017857142857143
Gallstones
Macro F1 score: 0.6795937395937396 and Micro F1 Score 0.8732142857142857
GERD
Macro F1 score: 0.5961810411810412 and Micro F1 Score 0.85
Gout
Macro F1 score: 0.6706776556776558 and Micro F1 Score 0.8946428571428571
Hypercholesterolemia
Macro F1 score: 0.5223953823953824 and Micro F1 Score 0.6095238095238096
Hypertension
Macro F1 score: 0.45479020979020984 and Micro F1 Score 0.6833333333333333
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.5387545787545788 and Micro F1 Score 0.7964285714285715
Obesity
Macro F1 score: 0.6590223665223665 and Micro F1 Score 0.6910714285714286
OSA
Macro F1 scor