In [1]:
from sklearn.ensemble import RandomForestClassifier
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
import csv
import pandas as pd

In [2]:
class RandomForest:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)
        
    def train(self):
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        #print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [3]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

In [4]:
with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).word2vec_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
(55, 300) (55,) (7, 300) (7,)
(55, 300) (55,) (7, 300) (7,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
Macro F1 score: 0.5377380952380952 and Micro F1 Score 0.5833333333333333
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
(10, 300) (10,) (2, 300) (2,)
(10, 300) (10,) (2, 300) (2,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11

In [5]:
with open("./results/word-embeddings-features/performance_RF_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).glove_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(55, 300) (55,) (7, 300) (7,)
(55, 300) (55,) (7, 300) (7,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
Macro F1 score: 0.6966666666666667 and Micro F1 Score 0.7380952380952381
CHF
(10, 300) (10,) (2, 300) (2,)
(10, 300) (10,) (2, 300) (2,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,)


In [6]:
with open("./results/word-embeddings-features/performance_RF_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).fasttext_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(59, 300) (59,) (7, 300) (7,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
(60, 300) (60,) (6, 300) (6,)
Macro F1 score: 0.837062937062937 and Micro F1 Score 0.9523809523809523
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
(55, 300) (55,) (7, 300) (7,)
(55, 300) (55,) (7, 300) (7,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
(56, 300) (56,) (6, 300) (6,)
Macro F1 score: 0.5311904761904762 and Micro F1 Score 0.6
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
(10, 300) (10,) (2, 300) (2,)
(10, 300) (10,) (2, 300) (2,)
(11, 300) (11,) (1, 300) (1,)
(11, 300) (11,) (1, 300) (1,

In [7]:
with open("./results/word-embeddings-features/performance_RF_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).use_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    X = X.reshape(-1, 1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(66, 512) (66,) Counter({0.0: 63, 1.0: 3})
(59, 1) (59,) (7, 1) (7,)
(59, 1) (59,) (7, 1) (7,)
(59, 1) (59,) (7, 1) (7,)
(59, 1) (59,) (7, 1) (7,)
(59, 1) (59,) (7, 1) (7,)
(59, 1) (59,) (7, 1) (7,)
(60, 1) (60,) (6, 1) (6,)
(60, 1) (60,) (6, 1) (6,)
(60, 1) (60,) (6, 1) (6,)
(60, 1) (60,) (6, 1) (6,)
Macro F1 score: 0.6209790209790209 and Micro F1 Score 0.8928571428571427
CAD
(62, 512) (62,) Counter({0.0: 35, 1.0: 27})
(55, 1) (55,) (7, 1) (7,)
(55, 1) (55,) (7, 1) (7,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
(56, 1) (56,) (6, 1) (6,)
Macro F1 score: 0.6442857142857142 and Micro F1 Score 0.6619047619047618
CHF
(12, 512) (12,) Counter({1.0: 12})
(10, 1) (10,) (2, 1) (2,)
(10, 1) (10,) (2, 1) (2,)
(11, 1) (11,) (1, 1) (1,)
(11, 1) (11,) (1, 1) (1,)
(11, 1) (11,) (1, 1) (1,)
(11, 1) (11,) (1, 1) (1,)
(11, 1) (11,) (1, 1) (1,)
(11, 1) (11,) (