In [1]:
from sklearn import svm
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import pandas as pd
import csv
import numpy as np
import collections
from imblearn.over_sampling import SMOTE



In [2]:
class SVM:
    def __init__(self, x_train, y_train, x_test, y_test):
        self.svm = svm.SVC(kernel='linear')
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        # print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)
        
    def train(self):
        
        self.svm.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.svm.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        # print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [3]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']

In [4]:
column_headings = ["Morbidity Class", "SVM_Macro F1", "SVM_Micro F1"]

with open("./results/word-embeddings-features/performance_SVM_W2V.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    X = np.average(X, axis=1)
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/word-embeddings-features/performance_SVM_W2V.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_SVM_W2V.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
Asthma
Macro F1 score: 0.7417165350988879 and Micro F1 Score 0.7698717948717948
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
CAD
Macro F1 score: 0.3607936507936508 and Micro F1 Score 0.4428571428571428
(12, 494, 300) (12,) Counter({1.0: 12})
Macro F1 score: 1 and Micro F1 Score 1
(66, 495, 300) (66,) Counter({0.0: 58, 1.0: 8})
Depression
Macro F1 score: 0.7691737510855157 and Micro F1 Score 0.7856060606060606
(63, 495, 300) (63,) Counter({1.0: 35, 0.0: 28})
Diabetes
Macro F1 score: 0.34277777777777774 and Micro F1 Score 0.3999999999999999
(66, 495, 300) (66,) Counter({0.0: 56, 1.0: 10})
Gallstones
Macro F1 score: 0.35837024087024083 and Micro F1 Score 0.421969696969697
(56, 495, 300) (56,) Counter({0.0: 49, 1.0: 7})
GERD
Macro F1 score: 0.5697502497502498 and Micro F1 Score 0.6211111111111111
(67, 495, 300) (67,) Counter({0.0: 62, 1.0: 5})
Gout
Macro F1 score: 0.5620278916989443 and Micro F1 Score 0.5987179487179488
(59, 495, 300) (59

In [5]:
with open("./results/word-embeddings-features/performance_SVM_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))

    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_SVM_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_SVM_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
Macro F1 score: 0.9745098039215687 and Micro F1 Score 0.976923076923077
CAD
CAD
Macro F1 score: 0.7602561327561327 and Micro F1 Score 0.7857142857142857
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
Macro F1 score: 0.7825955988455988 and Micro F1 Score 0.8098484848484848
Diabetes
Diabetes
Macro F1 score: 0.5025793650793651 and Micro F1 Score 0.5285714285714286
Gallstones
Gallstones
Macro F1 score: 0.9085836385836386 and Micro F1 Score 0.9106060606060605
GERD
GERD
Macro F1 score: 0.8068537018537019 and Micro F1 Score 0.8155555555555555
Gout
Gout
Macro F1 score: 0.9748251748251748 and Micro F1 Score 0.975
Hypercholesterolemia
Hypercholesterolemia
Macro F1 score: 0.7888816738816739 and Micro F1 Score 0.8055555555555556
Hypertension
Hypertension
Macro F1 score: 0.8038023088023089 and Micro F1 Score 0.8222222222222222
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
OA
Macro F1 score: 0.8459182484182485 and Micro F1 Score 0.8537878787878789
Obe

In [6]:
with open("./results/word-embeddings-features/performance_SVM_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))

    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        # print(morbidity)
        # smote = SMOTE(random_state=42,k_neighbors=2)
        # X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        # X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_SVM_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_SVM_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
Macro F1 score: 0.3577272727272727 and Micro F1 Score 0.5642857142857143
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.6665501165501164 and Micro F1 Score 0.880952380952381
Diabetes
Macro F1 score: 0.3206493506493507 and Micro F1 Score 0.48809523809523814
Gallstones
Macro F1 score: 0.655128205128205 and Micro F1 Score 0.85
GERD
Macro F1 score: 0.6662626262626263 and Micro F1 Score 0.8800000000000001
Gout
Macro F1 score: 0.7777622377622377 and Micro F1 Score 0.9214285714285715
Hypercholesterolemia
Macro F1 score: 0.4647727272727272 and Micro F1 Score 0.7266666666666667
Hypertension
Macro F1 score: 0.5195202020202021 and Micro F1 Score 0.74
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.6110372960372961 and Micro F1 Score 0.8619047619047618
Obesity
Macro F1 score: 0.47785714285714287 and Micro F1 Score 0.5666666666666667
OSA
Macro F1 score: 0.6037

In [7]:
with open("./results/word-embeddings-features/performance_SVM_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))
    #X = X.reshape(-1, 1)
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        # print(morbidity)
        # smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        # X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        # X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_SVM_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_SVM_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma


2023-05-04 20:06:23.976907: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
Macro F1 score: 0.8006349206349206 and Micro F1 Score 0.8214285714285715
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.6072494172494173 and Micro F1 Score 0.85
Diabetes
Macro F1 score: 0.6358730158730158 and Micro F1 Score 0.669047619047619
Gallstones
Macro F1 score: 0.6679720279720279 and Micro F1 Score 0.819047619047619
GERD
Macro F1 score: 0.5983080808080808 and Micro F1 Score 0.8233333333333333
Gout
Macro F1 score: 0.7777622377622377 and Micro F1 Score 0.9214285714285715
Hypercholesterolemia
Macro F1 score: 0.4292640692640693 and Micro F1 Score 0.6133333333333333
Hypertension
Macro F1 score: 0.5072979797979797 and Micro F1 Score 0.6799999999999999
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.595944055944056 and Micro F1 Score 0.819047619047619
Obesity
Macro F1 score: 0.622478354978355 and Micro F1 Score 0.6761904761904762
OSA
Macro F1 score: 0.69