In [1]:
import os
os.chdir(os.path.join(os.getcwd(), '..','..', '..'))

In [12]:
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
import pandas as pd
import csv
import numpy as np
import collections
from sklearn import svm
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter
from weka.attribute_selection import ASEvaluation, AttributeSelection
from weka.classifiers import Classifier, Evaluation
from imblearn.over_sampling import SMOTE

jvm.start()

In [6]:
class SVM:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.svm = svm.SVC(kernel='linear')
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.k = k
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)

    def feature_selection_SelectKBest(self):
        k_best = SelectKBest(mutual_info_classif, k=self.k)
        k_best.fit(self.x_train, self.y_train)
        self.x_train = k_best.transform(self.x_train)
        self.x_test = k_best.transform(self.x_test)
        
    def feature_selection_ExtraTreesClassifier(self):
        clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
        clf.fit(self.x_train, self.y_train)
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        self.x_train = self.x_train[:, indices[:self.k]]
        self.x_test = self.x_test[:, indices[:self.k]]
    
    def feature_selection_InfoGainAttributeEval(self, morbidity):
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train_data = loader.load_file(f"./dataset/train/train_{morbidity}_tfidf.arff")
        train_data.class_is_last()

        # Initialize attribute selection
        eval = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
        search = AttributeSelection()
        search.evaluator = eval
        search.select_attributes(train_data)
        selected_attributes = search.selected_attributes
        filtered_attributes = np.delete(selected_attributes, [-1])
        # print("Selected attributes:", type(filtered_attributes), filtered_attributes.shape)

        # Apply selected attributes to the training and testing sets
        self.x_train = self.x_train[:, filtered_attributes]
        self.x_test = self.x_test[:, filtered_attributes]
        
    def train(self):
        self.svm.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.svm.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [7]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']
column_headings = ["Morbidity Class", "SVM_Macro F1", "SVM_Micro F1"]

In [8]:
with open("./results/tf-idf-features/performance_SVM_AllFeatures.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 0)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/tf-idf-features/performance_SVM_AllFeatures.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_SVM_AllFeatures.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Asthma
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.9306658821221928 and Micro F1 Score 0.9306930693069307
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.9504950495049506 and Micro F1 Score 0.9504950495049505
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.9501824997533788 and Micro F1 Score 0.9504950495049505
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.9305841924398626 and Micro F1 Score 0.9306930693069307
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.9499949994999499 and Micro F1 Score 0.9500000000000001
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.98 and Micro F1 Score 0.98
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.949753793588584 and Micro F1 Score 0.9500000000000001
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.968324358568261 and Micro F1 Score 0.97
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.9495916927109587 and Micro F1 Score 0.9500000000000001
(904, 600

In [9]:
with open("./results/tf-idf-features/performance_SVM_SelectKBest.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            svm_obj.feature_selection_SelectKBest()
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/tf-idf-features/performance_SVM_SelectKBest.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_SVM_SelectKBest.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Asthma
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.85630081300813 and Micro F1 Score 0.8613861386138614
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.8708312838170191 and Micro F1 Score 0.8712871287128713
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.8904014994574332 and Micro F1 Score 0.8910891089108911
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.8806146572104019 and Micro F1 Score 0.8811881188118812
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.7241086587436332 and Micro F1 Score 0.74
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8891017239641092 and Micro F1 Score 0.89
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.7406369955389562 and Micro F1 Score 0.75
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8443821973233737 and Micro F1 Score 0.85
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8594941790445605 and Micro F1 Score 0.8599999999999999
(904, 600) (904,) (100,

In [10]:
with open("./results/tf-idf-features/performance_SVM_ExtraTreesClassifier.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            svm_obj.feature_selection_ExtraTreesClassifier()
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/tf-idf-features/performance_SVM_ExtraTreesClassifier.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_SVM_ExtraTreesClassifier.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Asthma
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.9205974842767295 and Micro F1 Score 0.9207920792079208
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.8811764705882352 and Micro F1 Score 0.8811881188118812
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.87001287001287 and Micro F1 Score 0.8712871287128713
(903, 600) (903,) (101, 600) (101,)
Macro F1 score: 0.8907033939990162 and Micro F1 Score 0.8910891089108911
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8894583458948849 and Micro F1 Score 0.89
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8696741854636592 and Micro F1 Score 0.87
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8999599839935974 and Micro F1 Score 0.9
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8958333333333333 and Micro F1 Score 0.9
(904, 600) (904,) (100, 600) (100,)
Macro F1 score: 0.8498648783905516 and Micro F1 Score 0.85
(904, 600) (904,) (100, 600) (100,)
Mac

In [8]:
with open("./results/tf-idf-features/performance_SVM_InfoGain.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            svm_obj = SVM(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            svm_obj.feature_selection_InfoGainAttributeEval(morbidity)
            svm_obj.train()

            f1_macro, f1_micro = svm_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]

    with open("./results/tf-idf-features/performance_SVM_InfoGain.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_SVM_InfoGain.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)
    

(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
(514, 600) (514,) (58, 600) (58,)
Macro F1 score: 0.46296296296296297 and Micro F1 Score 0.8620689655172413
(514, 600) (514,) (58, 600) (58,)
Macro F1 score: 0.6199213630406291 and Micro F1 Score 0.9137931034482759
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.46226415094339623 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.5615384615384615 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.4770642201834862 and Micro F1 Score 0.9122807017543859
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.4770642201834862 and Micro F1 Score 0.9122807017543859
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.7716955941255007 and Micro F1 Score 0.9473684210526315
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.46226415094339623 and Micro F1 Score 0.8596491228070176
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.5966981132075473 and Micro F1 Score 0.8947