In [21]:
from sklearn import svm
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.data_preprocessing import DataPreprocessing
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import pandas as pd
import csv
import numpy as np


In [27]:
class SVM:
    """Train and score a scikit-learn ``SVC`` on a single train/test split.

    Parameters
    ----------
    x_train, y_train
        Training features and labels; handed unchanged to ``SVC.fit``.
    x_test, y_test
        Held-out features and labels used by ``test_and_evaluate``.
    kernel
        Kernel name forwarded to ``svm.SVC``. Defaults to ``'linear'``, which
        is what this class used before the parameter existed.

    All four data arguments must expose a ``.shape`` attribute, because the
    constructor prints the shapes as a sanity check.
    """

    def __init__(self, x_train, y_train, x_test, y_test, kernel='linear'):
        self.svm = svm.SVC(kernel=kernel)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        # Sanity check: show the shapes of the split this instance will use.
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)

    def train(self):
        """Fit the wrapped classifier on the training split."""
        self.svm.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        """Predict on the test split and report F1 scores.

        Returns
        -------
        tuple
            ``(f1_macro, f1_micro)`` as computed by ``f1_score`` with
            ``average='macro'`` and ``average='micro'`` respectively.
        """
        y_pred = self.svm.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [28]:
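# Preprocessing step, currently disabled. Judging by its arguments it appears to
# produce 'preprocessed_train.csv' and 'preprocessed_test.csv', the files the next
# cell loads. Uncomment to regenerate them (TODO confirm against DataPreprocessing).
# dataPreprocessing = DataPreprocessing()
# dataPreprocessing.preprocess_data('./dataset/train/train_data_intuitive.csv', 'preprocessed_train.csv')
# dataPreprocessing.preprocess_data('./dataset/test/test_data_intuitive.csv', 'preprocessed_test.csv')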


In [29]:
# Load the preprocessed train and test CSVs (paths are relative to the current
# working directory); the train file is read first, then the test file.
train_preprocessed_df, test_preprocessed_df = (
    pd.read_csv(csv_name)
    for csv_name in ('preprocessed_train.csv', 'preprocessed_test.csv')
)

In [30]:
# Morbidity classes evaluated in this notebook; one model is trained per entry.
morbidities = [
    'Asthma',
    'CAD',
    'CHF',
    'Depression',
    'Diabetes',
    'Gallstones',
    'GERD',
    'Gout',
    'Hypercholesterolemia',
    'Hypertension',
    'Hypertriglyceridemia',
    'OA',
    'Obesity',
    'OSA',
    'PVD',
    'Venous-Insufficiency',
]

In [31]:
# Column titles of the results file: the morbidity name followed by the two
# cross-validated SVM scores.
column_headings = ["Morbidity Class", "SVM_Macro F1", "SVM_Micro F1"]

# Start a fresh results file ("w" truncates any previous run) containing only
# the header row; the per-morbidity rows are appended afterwards.
with open("performance_SVM.csv", "w", newline="") as header_file:
    writer = csv.writer(header_file)
    writer.writerow(column_headings)
    
# 10-fold cross-validation splitter, built once and shared by every morbidity.
# With shuffle=True and a fixed integer random_state, each split() call yields the
# same fold assignment, so rebuilding it per morbidity is unnecessary.
# NOTE(review): despite the `skf` name this is plain KFold, not StratifiedKFold;
# with imbalanced labels some folds may miss a class entirely - confirm this is intended.
skf = KFold(n_splits=10, shuffle=True, random_state=42)

for morbidity in morbidities:
    # TF-IDF representation for this morbidity. Presumably X is the feature matrix,
    # Y the label vector and words the vocabulary - verify against
    # TFIDFFeatureGeneration.tf_idf_matrix_gen.
    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Train an SVM on this fold's TF-IDF features and score it on the held-out part.
        svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
        svm_obj.train()

        # Per-fold scores get their own names so they are not confused with the
        # fold-averaged f1_macro / f1_micro computed below.
        fold_f1_macro, fold_f1_micro = svm_obj.test_and_evaluate()

        f1_macro_list.append(fold_f1_macro)
        f1_micro_list.append(fold_f1_micro)

    # Average the per-fold scores to get one macro and one micro F1 per morbidity.
    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    # Results row: morbidity name followed by the two averaged scores.
    row_heading = morbidity
    data = [f1_macro, f1_micro]
    row = [row_heading, *data]

    # Append right away so finished morbidities are saved even if a later one fails.
    with open("performance_SVM.csv", "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(row)


    

(611, 600) (611,) Counter({0.0: 502, 1.0: 70, -1.0: 39})
(549, 600) (549,) (62, 600) (62,)
Macro F1 score: 0.3067846607669616 and Micro F1 Score 0.8387096774193549
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.2830188679245283 and Micro F1 Score 0.7377049180327869
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.30357142857142855 and Micro F1 Score 0.8360655737704918
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.40154440154440146 and Micro F1 Score 0.8524590163934426
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.30357142857142855 and Micro F1 Score 0.8360655737704918
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.29696969696969694 and Micro F1 Score 0.8032786885245902
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.3896396396396396 and Micro F1 Score 0.8524590163934426
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.453958545701665 and Micro F1 Score 0.8524590163934426
(550, 600) (550,) (61, 600) (61,)
Macro F1 score: 0.31304347826086953 and Micro F1 S