In [1]:
from sklearn.ensemble import RandomForestClassifier
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
import csv
import pandas as pd

In [2]:
class RandomForest:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)
        
    def train(self):
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        #print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [3]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

In [4]:
with open("./results/word-embeddings-features/performance_RF_W2V.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).word2vec_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_W2V.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_W2V.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 3570, 100) (572,) Counter({0.0: 502, 1.0: 70})
(514, 100) (514,) (58, 100) (58,)
(514, 100) (514,) (58, 100) (58,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
Macro F1 score: 0.4672986632039369 and Micro F1 Score 0.8776164549304294
CAD
(548, 3570, 100) (548,) Counter({1.0: 325, 0.0: 223})
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(494, 100) (494,) (54, 100) (54,)
(494, 100) (494,) (54, 100) (54,)
Macro F1 score: 0.6744190438033495 and Micro F1 Score 0.6988888888888889
CHF
(243, 3122, 100) (243,) Counter({1.0: 243})
(218, 10

In [6]:
with open("./results/word-embeddings-features/performance_RF_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).glove_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(514, 100) (514,) (58, 100) (58,)
(514, 100) (514,) (58, 100) (58,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
Macro F1 score: 0.4838850779784251 and Micro F1 Score 0.8776164549304294
CAD
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(494, 100) (494,) (54, 100) (54,)
(494, 100) (494,) (54, 100) (54,)
Macro F1 score: 0.6502808689831712 and Micro F1 Score 0.6842424242424243
CHF
(218, 100) (218,) (25, 100) (25,)
(218, 100) (218,) (25, 100) (25,)
(218, 100) (218,) (25, 100) (25,)
(219, 100) (219,) (24, 100) (24,)
(219, 100) (219,) (24, 

In [7]:
with open("./results/word-embeddings-features/performance_RF_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).fasttext_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 3570, 100) (572,) Counter({0.0: 502, 1.0: 70})
(514, 100) (514,) (58, 100) (58,)
(514, 100) (514,) (58, 100) (58,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
(515, 100) (515,) (57, 100) (57,)
Macro F1 score: 0.4672986632039369 and Micro F1 Score 0.8776164549304294
CAD
(548, 3570, 100) (548,) Counter({1.0: 325, 0.0: 223})
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(493, 100) (493,) (55, 100) (55,)
(494, 100) (494,) (54, 100) (54,)
(494, 100) (494,) (54, 100) (54,)
Macro F1 score: 0.6739018373901517 and Micro F1 Score 0.6971043771043772
CHF
(243, 3122, 100) (243,) Counter({1.0: 243})
(218, 10

In [8]:
with open("./results/word-embeddings-features/performance_RF_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).use_matrix_gen()

    X = np.abs(np.average(X, axis=1))
    X = X.reshape(-1, 1)
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Training RF using TF-IDF Representation
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma


2023-04-30 19:36:05.282774: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


(572, 512) (572,) Counter({0.0: 502, 1.0: 70})
(514, 1) (514,) (58, 1) (58,)
(514, 1) (514,) (58, 1) (58,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
(515, 1) (515,) (57, 1) (57,)
Macro F1 score: 0.4810377605755761 and Micro F1 Score 0.7743496672716275
CAD
(548, 512) (548,) Counter({1.0: 325, 0.0: 223})
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(493, 1) (493,) (55, 1) (55,)
(494, 1) (494,) (54, 1) (54,)
(494, 1) (494,) (54, 1) (54,)
Macro F1 score: 0.5549841575818227 and Micro F1 Score 0.5765656565656566
CHF
(243, 512) (243,) Counter({1.0: 243})
(218, 1) (218,) (25, 1) (25,)
(218, 1) (218,) (25, 1) (25,)
(218, 1) (218,) (25, 1) (25,)
(219, 1) (219,) (24, 1)