In [1]:
import os

# Switch to the project root so the relative './dataset/...' and './results/...'
# paths used by the later cells resolve.
# The root can be overridden through the CS598DLH_ROOT environment variable;
# when it is unset the original hard-coded checkout location is used.
os.chdir(os.environ.get('CS598DLH_ROOT', '/Users/shreygupta/Documents/Classes/CS598DLH/'))
os.getcwd()

'/Users/shreygupta/Documents/Classes/CS598DLH'

In [2]:

import numpy as np
import csv
import pandas as pd

from feature_generation import FeatureGeneration
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import collections
from imblearn.over_sampling import SMOTE


In [3]:
class DecisionTree:
    """Bundle a scikit-learn decision tree with one train/test split."""

    def __init__(self, x_train, y_train, x_test, y_test):
        """Keep the four split parts and build the (still unfitted) classifier.

        Args:
            x_train: Features handed to ``fit``.
            y_train: Labels handed to ``fit``.
            x_test: Features handed to ``predict``.
            y_test: Reference labels the predictions are scored against.
        """
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        # Randomised split selection with a fixed seed.
        self.dtc = DecisionTreeClassifier(splitter='random', random_state=42)

    def train(self):
        """Fit the classifier on the stored training split."""
        features, labels = self.x_train, self.y_train
        self.dtc.fit(features, labels)

    def test_and_evaluate(self):
        """Predict on the stored test split and score the predictions.

        Returns:
            A ``(macro_f1, micro_f1)`` tuple.
        """
        predictions = self.dtc.predict(self.x_test)
        macro, micro = (
            f1_score(self.y_test, predictions, average=mode)
            for mode in ('macro', 'micro')
        )
        return macro, micro

In [4]:
# Label names evaluated by the later cells; each one is used there as a column
# of the training CSV and as the row heading in the result files.
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency']

# Header row shared by every result CSV.
column_headings = ["Morbidity Class", "DT_Macro F1", "DT_Micro F1"]

# Opening in "w" mode fails if the folder is missing, so create it first.
os.makedirs("./results/word-embedding", exist_ok=True)

# Start a fresh Word2Vec result file containing only the header row.
with open("./results/word-embedding/performance_DT_W2V.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)
    

In [5]:
# Decision-tree evaluation on averaged Word2Vec features, one task per morbidity.
# Rows are appended to the result file whose header row is written by the
# preceding cell.
results_path = "./results/word-embedding/performance_DT_W2V.csv"

all_f1_macro_scores = []
all_f1_micro_scores = []

# The file does not depend on the morbidity, so it is read once rather than
# once per loop iteration.
train_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose value in this morbidity's column is 0 or 1.
    train_preprocessed_df = train_df[train_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).word2vec()
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y))) < 2:
        # Fewer than two distinct labels: no model is trained and both scores
        # are recorded as 1.
        # NOTE(review): these 1s also enter the "Average" row - confirm that
        # this is the intended reporting.
        f1_macro = 1
        f1_micro = 1
    else:
        # min(1, len(X)-1) evaluates to 1 whenever there are at least two
        # samples, so SMOTE is configured with a single neighbour.
        # NOTE(review): resampling is done before the K-fold split, so synthetic
        # samples derived from a validation point can land in a training fold;
        # consider resampling inside each fold instead.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # 10-fold cross validation with a fixed shuffle seed.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            # Decision tree trained on this fold's averaged Word2Vec features.
            dt_obj = DecisionTree(X[train_idx], Y[train_idx], X[val_idx], Y[val_idx])
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean over the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Each row is appended as soon as its scores are available.
    with open(results_path, "a", newline="") as file:
        csv.writer(file).writerow([morbidity, f1_macro, f1_micro])


# Final row: plain arithmetic mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    csv.writer(file).writerow([
        "Average",
        sum(all_f1_macro_scores) / len(all_f1_macro_scores),
        sum(all_f1_micro_scores) / len(all_f1_micro_scores),
    ])

Asthma
Macro F1 score: 0.9050565284388814 and Micro F1 Score 0.9080952380952381
CAD
Macro F1 score: 0.6176587301587302 and Micro F1 Score 0.6214285714285714
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.9026820504916481 and Micro F1 Score 0.9093406593406593
Diabetes
Macro F1 score: 0.5745870795870796 and Micro F1 Score 0.6069444444444445
Gallstones
Macro F1 score: 0.8505521357727241 and Micro F1 Score 0.8714285714285713
GERD
Macro F1 score: 0.8688633588633587 and Micro F1 Score 0.8727272727272727
Gout
Macro F1 score: 0.8860640443148183 and Micro F1 Score 0.8895238095238096
Hypercholesterolemia
Macro F1 score: 0.7509301809301808 and Micro F1 Score 0.76
Hypertension
Macro F1 score: 0.8283580308580308 and Micro F1 Score 0.8377777777777776
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.7953978039504355 and Micro F1 Score 0.8044871794871795
Obesity
Macro F1 score: 0.476998556998557 and Micro F1 Score 0.5152777777777777
OSA
Macro F1

In [6]:
# Decision-tree evaluation on averaged GloVe features, one task per morbidity.
results_path = "./results/word-embedding/performance_DT_Glove.csv"

# Opening in "w" mode fails if the folder is missing, so create it first.
os.makedirs("./results/word-embedding", exist_ok=True)

# Start a fresh result file containing only the header row.
with open(results_path, "w", newline="") as file:
    csv.writer(file).writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

# The file does not depend on the morbidity, so it is read once rather than
# once per loop iteration.
train_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose value in this morbidity's column is 0 or 1.
    train_preprocessed_df = train_df[train_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).gloVe()
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y))) < 2:
        # Fewer than two distinct labels: no model is trained and both scores
        # are recorded as 1.
        # NOTE(review): these 1s also enter the "Average" row - confirm that
        # this is the intended reporting.
        f1_macro = 1
        f1_micro = 1
    else:
        # min(1, len(X)-1) evaluates to 1 whenever there are at least two
        # samples, so SMOTE is configured with a single neighbour.
        # NOTE(review): resampling is done before the K-fold split, so synthetic
        # samples derived from a validation point can land in a training fold;
        # consider resampling inside each fold instead.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # 10-fold cross validation with a fixed shuffle seed.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            # Decision tree trained on this fold's averaged GloVe features.
            dt_obj = DecisionTree(X[train_idx], Y[train_idx], X[val_idx], Y[val_idx])
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean over the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Each row is appended as soon as its scores are available.
    with open(results_path, "a", newline="") as file:
        csv.writer(file).writerow([morbidity, f1_macro, f1_micro])


# Final row: plain arithmetic mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    csv.writer(file).writerow([
        "Average",
        sum(all_f1_macro_scores) / len(all_f1_macro_scores),
        sum(all_f1_micro_scores) / len(all_f1_micro_scores),
    ])

Asthma
Macro F1 score: 0.9475840928936904 and Micro F1 Score 0.950952380952381
CAD
Macro F1 score: 0.7188275613275613 and Micro F1 Score 0.7321428571428571
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.8876053603259486 and Micro F1 Score 0.893956043956044
Diabetes
Macro F1 score: 0.6738492063492064 and Micro F1 Score 0.6791666666666667
Gallstones
Macro F1 score: 0.917681068931069 and Micro F1 Score 0.932967032967033
GERD
Macro F1 score: 0.8517917295123176 and Micro F1 Score 0.8636363636363636
Gout
Macro F1 score: 0.9482685941509471 and Micro F1 Score 0.9514285714285714
Hypercholesterolemia
Macro F1 score: 0.6976391663891663 and Micro F1 Score 0.72
Hypertension
Macro F1 score: 0.800556943056943 and Micro F1 Score 0.8144444444444444
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.8827280236103766 and Micro F1 Score 0.8910256410256411
Obesity
Macro F1 score: 0.4982539682539683 and Micro F1 Score 0.5291666666666667
OSA
Macro F1 sco

In [7]:
# Decision-tree evaluation on averaged FastText features, one task per morbidity.
results_path = "./results/word-embedding/performance_DT_FastText.csv"

# Opening in "w" mode fails if the folder is missing, so create it first.
os.makedirs("./results/word-embedding", exist_ok=True)

# Start a fresh result file containing only the header row.
with open(results_path, "w", newline="") as file:
    csv.writer(file).writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

# The file does not depend on the morbidity, so it is read once rather than
# once per loop iteration.
train_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose value in this morbidity's column is 0 or 1.
    train_preprocessed_df = train_df[train_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).fastText()
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y))) < 2:
        # Fewer than two distinct labels: no model is trained and both scores
        # are recorded as 1.
        # NOTE(review): these 1s also enter the "Average" row - confirm that
        # this is the intended reporting.
        f1_macro = 1
        f1_micro = 1
    else:
        # min(1, len(X)-1) evaluates to 1 whenever there are at least two
        # samples, so SMOTE is configured with a single neighbour.
        # NOTE(review): resampling is done before the K-fold split, so synthetic
        # samples derived from a validation point can land in a training fold;
        # consider resampling inside each fold instead.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # 10-fold cross validation with a fixed shuffle seed.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            # Decision tree trained on this fold's averaged FastText features.
            dt_obj = DecisionTree(X[train_idx], Y[train_idx], X[val_idx], Y[val_idx])
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean over the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Each row is appended as soon as its scores are available.
    with open(results_path, "a", newline="") as file:
        csv.writer(file).writerow([morbidity, f1_macro, f1_micro])


# Final row: plain arithmetic mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    csv.writer(file).writerow([
        "Average",
        sum(all_f1_macro_scores) / len(all_f1_macro_scores),
        sum(all_f1_micro_scores) / len(all_f1_micro_scores),
    ])

Asthma
Macro F1 score: 0.8820925041590675 and Micro F1 Score 0.8933333333333333
CAD
Macro F1 score: 0.5914285714285714 and Micro F1 Score 0.6035714285714284
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.8727092893346763 and Micro F1 Score 0.8796703296703298
Diabetes
Macro F1 score: 0.638611111111111 and Micro F1 Score 0.6541666666666666
Gallstones
Macro F1 score: 0.8602208953756941 and Micro F1 Score 0.868131868131868
GERD
Macro F1 score: 0.8286038961038962 and Micro F1 Score 0.8363636363636363
Gout
Macro F1 score: 0.7375977651519448 and Micro F1 Score 0.7495238095238095
Hypercholesterolemia
Macro F1 score: 0.6860056610056608 and Micro F1 Score 0.72
Hypertension
Macro F1 score: 0.7461161061161061 and Micro F1 Score 0.7633333333333334
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.7923739822269233 and Micro F1 Score 0.8057692307692308
Obesity
Macro F1 score: 0.5972366522366522 and Micro F1 Score 0.6513888888888889
OSA
Macro F1 

In [8]:
# Decision-tree evaluation on averaged Universal Sentence Encoder features,
# one task per morbidity.
results_path = "./results/word-embedding/performance_DT_USE.csv"

# Opening in "w" mode fails if the folder is missing, so create it first.
os.makedirs("./results/word-embedding", exist_ok=True)

# Start a fresh result file containing only the header row.
with open(results_path, "w", newline="") as file:
    csv.writer(file).writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

# The file does not depend on the morbidity, so it is read once rather than
# once per loop iteration.
train_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')

for morbidity in morbidities:
    print(morbidity)
    # Keep only the rows whose value in this morbidity's column is 0 or 1.
    train_preprocessed_df = train_df[train_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).universal_sentence_encoder()
    # NOTE(review): presumably X has at least three dimensions here; if it were
    # 2-D this average would produce a 1-D array that needs reshape(-1, 1)
    # before resampling - confirm against FeatureGeneration.
    X = np.average(X, axis=1)

    if len(collections.Counter(list(Y))) < 2:
        # Fewer than two distinct labels: no model is trained and both scores
        # are recorded as 1.
        # NOTE(review): these 1s also enter the "Average" row - confirm that
        # this is the intended reporting.
        f1_macro = 1
        f1_micro = 1
    else:
        # min(1, len(X)-1) evaluates to 1 whenever there are at least two
        # samples, so SMOTE is configured with a single neighbour.
        # NOTE(review): resampling is done before the K-fold split, so synthetic
        # samples derived from a validation point can land in a training fold;
        # consider resampling inside each fold instead.
        smote = SMOTE(random_state=42, k_neighbors=min(1, len(X)-1))
        X, Y = smote.fit_resample(X, Y)

        # 10-fold cross validation with a fixed shuffle seed.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in kfold.split(X, Y):
            # Decision tree trained on this fold's averaged USE features.
            dt_obj = DecisionTree(X[train_idx], Y[train_idx], X[val_idx], Y[val_idx])
            dt_obj.train()

            fold_macro, fold_micro = dt_obj.test_and_evaluate()
            f1_macro_list.append(fold_macro)
            f1_micro_list.append(fold_micro)

        # Report the mean over the folds.
        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    # Each row is appended as soon as its scores are available.
    with open(results_path, "a", newline="") as file:
        csv.writer(file).writerow([morbidity, f1_macro, f1_micro])


# Final row: plain arithmetic mean of the per-morbidity scores.
with open(results_path, "a", newline="") as file:
    csv.writer(file).writerow([
        "Average",
        sum(all_f1_macro_scores) / len(all_f1_macro_scores),
        sum(all_f1_micro_scores) / len(all_f1_micro_scores),
    ])

Asthma
Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-05-06 18:15:20.030059: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Macro F1 score: 0.9570444884492872 and Micro F1 Score 0.959047619047619
CAD
Macro F1 score: 0.6490656565656565 and Micro F1 Score 0.6625
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Macro F1 score: 0.8783054037465803 and Micro F1 Score 0.8862637362637363
Diabetes
Macro F1 score: 0.7218722943722944 and Micro F1 Score 0.7319444444444445
Gallstones
Macro F1 score: 0.8873330835830835 and Micro F1 Score 0.9027472527472528
GERD
Macro F1 score: 0.9201190476190476 and Micro F1 Score 0.9272727272727274
Gout
Macro F1 score: 0.9093544089209725 and Micro F1 Score 0.9152380952380954
Hypercholesterolemia
Macro F1 score: 0.7375592463092462 and Micro F1 Score 0.76
Hypertension
Macro F1 score: 0.8899314574314573 and Micro F1 Score 0.8988888888888888
Hypertriglyceridemia
Macro F1 score: 1 and Micro F1 Score 1
OA
Macro F1 score: 0.8703449869703739 and Micro F1 Score 0.8756410256410257
Obesity
Macro F1 score: 0.6431945831945832 and Micro F1 Score 0.6972222222222222
OSA
Macro F1 score: 0.920908633