In [1]:
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn import svm
import pandas as pd
import csv
import numpy as np
from statistics import mode
from imblearn.over_sampling import SMOTE
from collections import Counter
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration, FastTextFeatureGeneration, USEFeatureGeneration, GloVeFeatureGeneration


In [2]:
class SVM:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.svm = svm.SVC(kernel='linear')
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.k = k
        #print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)

    def feature_selection_ExtraTreesClassifier(self):
        clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
        clf.fit(self.x_train, self.y_train)
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        self.x_train = self.x_train[:, indices[:self.k]]
        self.x_test = self.x_test[:, indices[:self.k]]
    
    def train(self):
        self.svm.fit(self.x_train, self.y_train)

    def pred(self):
        y_pred = self.svm.predict(self.x_test)
        return y_pred

In [3]:
class KNN:
    def __init__(self, x_train, y_train, x_test, y_test, n):
        self.knn = KNeighborsClassifier(n_neighbors=n)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def train(self):
        self.knn.fit(self.x_train, self.y_train)

    def pred(self):
        y_pred = self.knn.predict(self.x_test)
        return y_pred

In [4]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']

In [5]:
column_headings = ["Morbidity Class", "E1_Macro F1", "E1_Micro F1"]

##### Ensemble with:
DL model using GloVe word embeddings  
SVM with ExtraTreesClassifier algorithm  
kNN (k = 1) with fastText word embeddings

In [6]:
with open("./results/ensembles/performance_ensemble2.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    # generate TF-IDF features and labels
    X_tfidf, Y_tfidf, words_tfidf = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    # resample the data using SMOTE if there are at least 2 distinct labels
    if len(Counter(list(Y_tfidf)).keys()) >= 2:
        smote = SMOTE(random_state=42, k_neighbors=2)
        X_tfidf_resampled, Y_tfidf_resampled = smote.fit_resample(X_tfidf, Y_tfidf)
        X_tfidf, Y_tfidf =  X_tfidf_resampled, Y_tfidf_resampled

        # train an SVM model on the TF-IDF features
        svm_obj = SVM(X_tfidf, Y_tfidf, X_tfidf, Y_tfidf, 100)
        svm_obj.feature_selection_ExtraTreesClassifier()
        svm_obj.train()
        y_pred_tfidf_svm = svm_obj.pred()

    # generate FastText features and labels
    X_fasttext, Y_fasttext, words_fasttext = FastTextFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    X_fasttext = np.average(X_fasttext, axis=1)

    # resample the data using SMOTE if there are at least 2 distinct labels
    if len(Counter(list(Y_fasttext)).keys()) >= 2:
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X_fasttext)-1))
        X_fasttext_resampled, Y_fasttext_resampled = smote.fit_resample(X_fasttext, Y_fasttext)
        X_fasttext, Y_fasttext =  X_fasttext_resampled, Y_fasttext_resampled

        # train a KNN model on the FastText features
        knn_obj = KNN(X_fasttext, Y_fasttext, X_fasttext, Y_fasttext, 1)
        knn_obj.train()
        y_pred_fasttext_knn = knn_obj.pred()

    y_preds = [y_pred_tfidf_svm, y_pred_fasttext_knn]
    
    min_length = min(len(y_pred) for y_pred in y_preds)
    # trim all y_pred lists to the shortest length
    y_preds = [y_pred[:min_length] for y_pred in y_preds]

    # perform majority vote
    majority_vote = [mode([y_pred[i] for y_pred in y_preds]) for i in range(min_length)]

    f1_macro = f1_score(Y_tfidf[:min_length], majority_vote, average='macro')
    f1_micro = f1_score(Y_tfidf[:min_length], majority_vote, average='micro')

    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/ensembles/performance_ensemble2.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)


with open("./results/ensembles/performance_ensemble2.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)


Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Macro F1 score: 0.8140678799803247 and Micro F1 Score 0.9047619047619048
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
Macro F1 score: 0.8707692307692307 and Micro F1 Score 0.8714285714285714
CHF
(243, 600) (243,) Counter({1.0: 243})
Macro F1 score: 0.3333333333333333 and Micro F1 Score 0.5
Depression
(582, 600) (582,) Counter({0.0: 460, 1.0: 122})
Macro F1 score: 0.6680565775488512 and Micro F1 Score 0.6982758620689655
Diabetes
(567, 600) (567,) Counter({1.0: 396, 0.0: 171})
Macro F1 score: 0.8652406417112299 and Micro F1 Score 0.8714285714285714
Gallstones
(593, 600) (593,) Counter({0.0: 506, 1.0: 87})
Macro F1 score: 0.5843749999999999 and Micro F1 Score 0.6607142857142857
GERD
(487, 600) (487,) Counter({0.0: 372, 1.0: 115})
Macro F1 score: 0.6250164929410212 and Micro F1 Score 0.7040816326530612
Gout
(596, 600) (596,) Counter({0.0: 518, 1.0: 78})
Macro F1 score: 0.7494949494949494 and Micro F1 Score 0.8387096774193549
Hype