In [1]:
from sklearn import svm
# `collections` is used further down (class counting) but was previously only
# reachable through the star imports below; import it explicitly so the
# notebook does not depend on what those modules happen to re-export.
import collections
import os
# Switch to the project root before importing the project modules: the star
# imports below and the relative './dataset/...' and './performance_SVM.csv'
# paths used later are resolved against the current working directory.
# Set the DLH_PROJ_DIR environment variable to run from another checkout;
# the default is the original hardcoded location.
os.chdir(os.environ.get('DLH_PROJ_DIR', '/Users/kshitijphulare/DLH-Proj/'))
from pre_processing import *
from main import *
# from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
import csv
import pandas as pd


In [2]:
class SVM:
    """Linear-kernel SVC bundled with one train/test split.

    The constructor builds ``svm.SVC(kernel='linear')`` and stores the four
    splits; ``train`` fits on the training split and ``test_and_evaluate``
    scores predictions on the test split with macro and micro F1.
    """

    def __init__(self, x_train, y_train, x_test, y_test):
        """Create the classifier, keep the splits and print their shapes.

        Every argument must expose a ``.shape`` attribute, since all four
        shapes are printed on construction.
        """
        self.svm = svm.SVC(kernel='linear')
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        split_shapes = (
            self.x_train.shape,
            self.y_train.shape,
            self.x_test.shape,
            self.y_test.shape,
        )
        print(*split_shapes)

    def train(self):
        """Fit the wrapped classifier on the stored training split."""
        self.svm.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        """Predict on the test split; print and return ``(macro F1, micro F1)``."""
        predictions = self.svm.predict(self.x_test)
        # Macro is computed first, then micro.
        macro_score, micro_score = (
            f1_score(self.y_test, predictions, average=mode)
            for mode in ('macro', 'micro')
        )
        print(f"Macro F1 score: {macro_score} and Micro F1 Score {micro_score}")
        return macro_score, micro_score

In [3]:
# Label columns evaluated one at a time; each name is used as a column of the
# training CSV and as the row label in the results file.
morbidities = [
    'Asthma', 'CAD', 'CHF', 'Depression',
    'Diabetes', 'Gallstones', 'GERD', 'Gout',
    'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA',
    'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency',
]

In [4]:
# Evaluate a linear SVM on TF-IDF features for every morbidity using 10-fold
# cross-validation, and log the per-morbidity and average macro/micro F1
# scores to ./performance_SVM.csv.
column_headings = ["Morbidity Class", "SVM_Macro F1", "SVM_Micro F1"]

# Start a fresh results file that contains only the header row.
with open("./performance_SVM.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# The training CSV is the same for every morbidity, so read it once here
# instead of re-reading it from disk on each loop iteration.
train_raw_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

total_macro = []
total_micro = []
for morbidity in morbidities:
    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_raw_df[train_raw_df[morbidity].isin([1.0, 0.0])]

    # tf_idf comes from the project's star imports; presumably it returns the
    # feature matrix, the label vector and the vocabulary -- TODO confirm.
    X, Y, words = tf_idf(train_preprocessed_df, morbidity)
    if len(set(Y)) < 2:
        # Only one label value is present: cross-validation is skipped and
        # both scores are recorded as 1.
        f1_macro = 1
        f1_micro = 1
    else:
        # 10-fold cross-validation with a fixed seed for reproducible folds.
        # NOTE(review): despite the name `skf`, this is a plain KFold, not a
        # StratifiedKFold, so folds are not class-balanced. Left unchanged to
        # keep the reported numbers comparable.
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train the SVM on this fold's TF-IDF representation and score it
            # on the held-out part.
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        # The morbidity's score is the mean over the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    total_macro.append(f1_macro)
    total_micro.append(f1_micro)

    # Append immediately so the results gathered so far are already on disk
    # if a later morbidity fails.
    with open("./performance_SVM.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [morbidity, f1_macro, f1_micro]
        writer.writerow(row)

# Final row: unweighted average of the per-morbidity scores.
with open("./performance_SVM.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ['Average', sum(total_macro)/len(total_macro), sum(total_micro)/len(total_micro)]
    writer.writerow(row)


(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
(514, 600) (514,) (58, 600) (58,)
Macro F1 score: 0.46296296296296297 and Micro F1 Score 0.8620689655172413
(514, 600) (514,) (58, 600) (58,)
Macro F1 score: 0.6199213630406291 and Micro F1 Score 0.9137931034482759
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.46226415094339623 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.5615384615384615 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.4770642201834862 and Micro F1 Score 0.9122807017543859
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.4770642201834862 and Micro F1 Score 0.9122807017543859
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.6481481481481481 and Micro F1 Score 0.9298245614035088
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.46226415094339623 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.5966981132075473 and Micro F1 Score 0.8947

Macro F1 score: 0.6802507836990597 and Micro F1 Score 0.6862745098039216
(451, 600) (451,) (51, 600) (51,)
Macro F1 score: 0.7166666666666667 and Micro F1 Score 0.7254901960784313
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.7584541062801933 and Micro F1 Score 0.76
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.5798319327731093 and Micro F1 Score 0.58
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.7799119647859143 and Micro F1 Score 0.78
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.8798076923076923 and Micro F1 Score 0.88
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.7398959583833533 and Micro F1 Score 0.74
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.7267759562841531 and Micro F1 Score 0.74
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.52 and Micro F1 Score 0.52
(452, 600) (452,) (50, 600) (50,)
Macro F1 score: 0.6969696969696968 and Micro F1 Score 0.7
Macro F1 score: 0.7078564758149838 and Micro F1 Score 0.7111764705882353
(531, 600) (531,) 

Macro F1 score: 0.49019607843137253 and Micro F1 Score 0.9615384615384616
(474, 600) (474,) (52, 600) (52,)
Macro F1 score: 0.4639175257731959 and Micro F1 Score 0.8653846153846154
(474, 600) (474,) (52, 600) (52,)
Macro F1 score: 0.48514851485148514 and Micro F1 Score 0.9423076923076923
Macro F1 score: 0.47801077753999144 and Micro F1 Score 0.9164368650217707
