In [2]:
from sklearn.ensemble import RandomForestClassifier
import os

os.chdir('/Users/shreygupta/Documents/Classes/CS598DLH/')
os.getcwd()
from feature_generation import FeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
import csv
import pandas as pd
import collections
from imblearn.over_sampling import SMOTE

In [3]:
class RandomForest:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)
        
    def train(self):
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        #print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [4]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency']
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

In [5]:
with open("./results/word-embedding/performance_RF_W2V_new.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).word2vec()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_RF_W2V_new.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_RF_W2V_new.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
(127, 300) (127,) (15, 300) (15,)
(127, 300) (127,) (15, 300) (15,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
Macro F1 score: 0.9276192924722336 and Micro F1 Score 0.9295238095238094
CAD
CAD
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
Macro F1 score: 0.6461507936507936 and Micro F1 Score 0.6625
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(118, 300) (118,) (14, 300) (14,)
(118, 300) (118,) (14, 300) (14,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13, 300) (13,)
(11

In [6]:
with open("./results/word-embedding/performance_RF_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).gloVe()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_RF_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_RF_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
(127, 300) (127,) (15, 300) (15,)
(127, 300) (127,) (15, 300) (15,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
Macro F1 score: 1.0 and Micro F1 Score 1.0
CAD
CAD
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
Macro F1 score: 0.7702777777777778 and Micro F1 Score 0.7749999999999999
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(118, 300) (118,) (14, 300) (14,)
(118, 300) (118,) (14, 300) (14,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13

In [7]:
with open("./results/word-embedding/performance_RF_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).fastText()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_RF_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_RF_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
(127, 300) (127,) (15, 300) (15,)
(127, 300) (127,) (15, 300) (15,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
Macro F1 score: 0.8757326140491776 and Micro F1 Score 0.8799999999999999
CAD
CAD
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
Macro F1 score: 0.6318650793650793 and Micro F1 Score 0.6375
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(118, 300) (118,) (14, 300) (14,)
(118, 300) (118,) (14, 300) (14,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13, 300) (13,)
(11

In [8]:
with open("./results/word-embedding/performance_RF_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow(column_headings)

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).universal_sentence_encoder()

    X = np.abs(np.average(X, axis=1))
    #X = X.reshape(-1, 1)
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embedding/performance_RF_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embedding/performance_RF_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-05-06 18:40:47.216684: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Asthma
(127, 300) (127,) (15, 300) (15,)
(127, 300) (127,) (15, 300) (15,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
(128, 300) (128,) (14, 300) (14,)
Macro F1 score: 1.0 and Micro F1 Score 1.0
CAD
CAD
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(68, 300) (68,) (8, 300) (8,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
(69, 300) (69,) (7, 300) (7,)
Macro F1 score: 0.7410317460317462 and Micro F1 Score 0.7482142857142857
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(118, 300) (118,) (14, 300) (14,)
(118, 300) (118,) (14, 300) (14,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13, 300) (13,)
(119, 300) (119,) (13, 300) 