In [1]:
import os
import numpy as np
import csv
import pandas as pd
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import collections
from imblearn.over_sampling import SMOTE


In [2]:
class DecisionTree:
    """Thin wrapper that fits a decision tree on one split and scores it on another.

    Attributes:
        dtc: the underlying ``DecisionTreeClassifier`` (random splitter, seed 42).
        x_train, y_train: features and labels passed to ``fit``.
        x_test, y_test: features and labels used for evaluation.
    """

    def __init__(self, x_train, y_train, x_test, y_test):
        """Store the train/test split and build the (still unfitted) classifier."""
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        # Fixed seed so the randomised split selection is repeatable between runs.
        self.dtc = DecisionTreeClassifier(splitter='random', random_state=42)

    def train(self):
        """Fit the classifier on the stored training split."""
        self.dtc.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        """Predict on the stored test split and return ``(macro_f1, micro_f1)``."""
        predictions = self.dtc.predict(self.x_test)
        macro_score, micro_score = (
            f1_score(self.y_test, predictions, average=mode)
            for mode in ('macro', 'micro')
        )
        return macro_score, micro_score

In [3]:
# Morbidity label columns; the experiment cells iterate over this list.
morbidities = [
    'Asthma', 'CAD', 'CHF', 'Depression',
    'Diabetes', 'Gallstones', 'GERD', 'Gout',
    'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA',
    'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency',
]

# Header row written at the top of each results CSV.
column_headings = ["Morbidity Class", "DT_Macro F1", "DT_Micro F1"]

# Start a fresh Word2Vec results file that contains only the header row.
with open("./results/word-embeddings-features/performance_DT_W2V.csv", "w", newline="") as file:
    header_row = list(column_headings[:3])
    csv.writer(file).writerow(header_row)
    

In [4]:
# Decision Tree on averaged Word2Vec embeddings, scored per morbidity with 10-fold CV.
results_path = "./results/word-embeddings-features/performance_DT_W2V.csv"

# (Re)create the results file with its header inside this cell, so the cell can be
# re-run on its own without appending duplicate rows to an earlier run's output.
with open(results_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

# The preprocessed training set does not depend on the morbidity: read it once.
train_full_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows labelled 0/1 for this morbidity.
    train_preprocessed_df = train_full_df[train_full_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    # Average over axis 1 (presumably the token axis, leaving one vector per
    # document -- TODO confirm against matrix_gen).
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Only one class is present, so nothing is trained; both scores are set to 1.
        # NOTE(review): these hard-coded 1s are included in the overall average below.
        f1_macro = 1
        f1_micro = 1
    else:
        # NOTE(review): oversampling happens BEFORE the folds are split, so synthetic
        # points interpolated from validation samples can end up in the training
        # folds (data leakage); the scores are likely optimistic. Left unchanged to
        # keep the reported numbers reproducible -- consider resampling per fold.
        # NOTE(review): min(1, len(X) - 1) evaluates to 1 whenever this branch runs
        # (two classes imply len(X) >= 2); it does not guard against a minority
        # class with a single sample -- confirm the intended cap.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X) - 1))
        X, Y = smote.fit_resample(X, Y)

        # Plain (non-stratified) shuffled 10-fold cross validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train a decision tree on the averaged Word2Vec features of this fold.
            dt_obj = DecisionTree(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean score across the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Append this morbidity's row immediately so partial results survive a crash.
    with open(results_path, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro, f1_micro])


# Final row: unweighted mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
Macro F1 score: 0.9493505187622835 and Micro F1 Score 0.9525641025641025
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
Macro F1 score: 0.528015873015873 and Micro F1 Score 0.5571428571428572
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
Macro F1 score: 1 and Micro F1 Score 1
Depression
(66, 495, 300) (66,) Counter({0.0: 58, 1.0: 8})
Macro F1 score: 0.9527940115440116 and Micro F1 Score 0.9568181818181818
Diabetes
(63, 495, 300) (63,) Counter({1.0: 35, 0.0: 28})
Macro F1 score: 0.7114033189033189 and Micro F1 Score 0.7428571428571429
Gallstones
(66, 495, 300) (66,) Counter({0.0: 56, 1.0: 10})
Macro F1 score: 0.8259669497169497 and Micro F1 Score 0.8378787878787879
GERD
(56, 495, 300) (56,) Counter({0.0: 49, 1.0: 7})
Macro F1 score: 0.9496969696969696 and Micro F1 Score 0.95
Gout
(67, 495, 300) (67,) Counter({0.0: 62, 1.0: 5})
Macro F1 score: 0.8925067171390701 and Micro F1 Score 0.9038461538461536
Hypercholesterolemia
(59, 495,

In [5]:
# Decision Tree on averaged GloVe embeddings, scored per morbidity with 10-fold CV.
results_path = "./results/word-embeddings-features/performance_DT_Glove.csv"

# Start a fresh results file containing only the header row.
with open(results_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

# The preprocessed training set does not depend on the morbidity: read it once.
train_full_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows labelled 0/1 for this morbidity.
    train_preprocessed_df = train_full_df[train_full_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    # Average over axis 1 (presumably the token axis, leaving one vector per
    # document -- TODO confirm against matrix_gen).
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Only one class is present, so nothing is trained; both scores are set to 1.
        # NOTE(review): these hard-coded 1s are included in the overall average below.
        f1_macro = 1
        f1_micro = 1
    else:
        # NOTE(review): oversampling happens BEFORE the folds are split, so synthetic
        # points interpolated from validation samples can end up in the training
        # folds (data leakage); the scores are likely optimistic. Left unchanged to
        # keep the reported numbers reproducible -- consider resampling per fold.
        # NOTE(review): min(1, len(X) - 1) evaluates to 1 whenever this branch runs
        # (two classes imply len(X) >= 2); it does not guard against a minority
        # class with a single sample -- confirm the intended cap.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X) - 1))
        X, Y = smote.fit_resample(X, Y)

        # Plain (non-stratified) shuffled 10-fold cross validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train a decision tree on the averaged GloVe features of this fold.
            dt_obj = DecisionTree(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean score across the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Append this morbidity's row immediately so partial results survive a crash.
    with open(results_path, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro, f1_micro])


# Final row: unweighted mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Macro F1 score: 0.8681533825651473 and Micro F1 Score 0.875
CAD
Macro F1 score: 0.7368253968253968 and Micro F1 Score 0.7571428571428571
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.9166673604173603 and Micro F1 Score 0.9212121212121211
Diabetes
Macro F1 score: 0.6395238095238096 and Micro F1 Score 0.6714285714285715
Gallstones
Macro F1 score: 0.8969446781288888 and Micro F1 Score 0.9030303030303031
GERD
Macro F1 score: 0.8990284715284715 and Micro F1 Score 0.9066666666666668
Gout
Macro F1 score: 0.943002786686997 and Micro F1 Score 0.9455128205128205
Hypercholesterolemia
Macro F1 score: 0.7187518037518038 and Micro F1 Score 0.7430555555555556
Hypertension
Macro F1 score: 0.8696320346320346 and Micro F1 Score 0.8819444444444444
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.8873632413338296 and Micro F1 Score 0.8962121212121211
Obesity
Macro F1 score: 0.7019227994227994 and Micro F1 Score 0.7571428571428571
OSA
Macro F

In [6]:
# Decision Tree on averaged FastText embeddings, scored per morbidity with 10-fold CV.
results_path = "./results/word-embeddings-features/performance_DT_FastText.csv"

# Start a fresh results file containing only the header row.
with open(results_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

# The preprocessed training set does not depend on the morbidity: read it once.
train_full_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows labelled 0/1 for this morbidity.
    train_preprocessed_df = train_full_df[train_full_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    # Average over axis 1 (presumably the token axis, leaving one vector per
    # document -- TODO confirm against matrix_gen).
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Only one class is present, so nothing is trained; both scores are set to 1.
        # NOTE(review): these hard-coded 1s are included in the overall average below.
        f1_macro = 1
        f1_micro = 1
    else:
        # NOTE(review): oversampling happens BEFORE the folds are split, so synthetic
        # points interpolated from validation samples can end up in the training
        # folds (data leakage); the scores are likely optimistic. Left unchanged to
        # keep the reported numbers reproducible -- consider resampling per fold.
        # NOTE(review): min(1, len(X) - 1) evaluates to 1 whenever this branch runs
        # (two classes imply len(X) >= 2); it does not guard against a minority
        # class with a single sample -- confirm the intended cap.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X) - 1))
        X, Y = smote.fit_resample(X, Y)

        # Plain (non-stratified) shuffled 10-fold cross validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train a decision tree on the averaged FastText features of this fold.
            dt_obj = DecisionTree(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean score across the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Append this morbidity's row immediately so partial results survive a crash.
    with open(results_path, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro, f1_micro])


# Final row: unweighted mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Macro F1 score: 0.8693677237794886 and Micro F1 Score 0.8737179487179487
CAD
Macro F1 score: 0.7200180375180375 and Micro F1 Score 0.7428571428571427
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.835956314926903 and Micro F1 Score 0.8446969696969695
Diabetes
Macro F1 score: 0.6044444444444443 and Micro F1 Score 0.6285714285714286
Gallstones
Macro F1 score: 0.8252970640470642 and Micro F1 Score 0.8393939393939395
GERD
Macro F1 score: 0.7910282772782773 and Micro F1 Score 0.8244444444444445
Gout
Macro F1 score: 0.8404380964752483 and Micro F1 Score 0.848076923076923
Hypercholesterolemia
Macro F1 score: 0.8229792429792429 and Micro F1 Score 0.8375
Hypertension
Macro F1 score: 0.7231060606060605 and Micro F1 Score 0.7430555555555556
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.8305566329095742 and Micro F1 Score 0.843939393939394
Obesity
Macro F1 score: 0.5021428571428571 and Micro F1 Score 0.5428571428571428
OSA
Macro F1

In [8]:
# Decision Tree on averaged USE embeddings, scored per morbidity with 10-fold CV.
results_path = "./results/word-embeddings-features/performance_DT_USE.csv"

# Start a fresh results file containing only the header row.
with open(results_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

# The preprocessed training set does not depend on the morbidity: read it once.
train_full_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows labelled 0/1 for this morbidity.
    train_preprocessed_df = train_full_df[train_full_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()
    # Average over axis 1 of whatever matrix_gen returns.
    # NOTE(review): the meaning of axis 1 for USE features is not visible here -- confirm.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y)).keys()) < 2:
        # Only one class is present, so nothing is trained; both scores are set to 1.
        # NOTE(review): these hard-coded 1s are included in the overall average below.
        f1_macro = 1
        f1_micro = 1
    else:
        # NOTE(review): oversampling happens BEFORE the folds are split, so synthetic
        # points interpolated from validation samples can end up in the training
        # folds (data leakage); the scores are likely optimistic. Left unchanged to
        # keep the reported numbers reproducible -- consider resampling per fold.
        # NOTE(review): min(1, len(X) - 1) evaluates to 1 whenever this branch runs
        # (two classes imply len(X) >= 2); it does not guard against a minority
        # class with a single sample -- confirm the intended cap.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X) - 1))
        X, Y = smote.fit_resample(X, Y)

        # Plain (non-stratified) shuffled 10-fold cross validation.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Train a decision tree on the averaged USE features of this fold.
            dt_obj = DecisionTree(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean score across the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Append this morbidity's row immediately so partial results survive a crash.
    with open(results_path, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro, f1_micro])


# Final row: unweighted mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Macro F1 score: 0.9923076923076923 and Micro F1 Score 0.9923076923076923
CAD
Macro F1 score: 0.7840873015873016 and Micro F1 Score 0.7999999999999999
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.9060853035853036 and Micro F1 Score 0.9136363636363635
Diabetes
Macro F1 score: 0.5643650793650793 and Micro F1 Score 0.6
Gallstones
Macro F1 score: 0.8556424621130503 and Micro F1 Score 0.8659090909090909
GERD
Macro F1 score: 0.8800880165586049 and Micro F1 Score 0.8955555555555555
Gout
Macro F1 score: 0.8806481100598749 and Micro F1 Score 0.8955128205128204
Hypercholesterolemia
Macro F1 score: 0.779383116883117 and Micro F1 Score 0.7930555555555555
Hypertension
Macro F1 score: 0.8359632034632035 and Micro F1 Score 0.8472222222222221
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.8784896231219761 and Micro F1 Score 0.8878787878787879
Obesity
Macro F1 score: 0.6304761904761904 and Micro F1 Score 0.6571428571428571
OSA
Macro F1 