In [1]:
from sklearn.ensemble import RandomForestClassifier
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import FastTextFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import USEFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import GloVeFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
import csv
import pandas as pd
import collections
from imblearn.over_sampling import SMOTE

In [2]:
class RandomForest:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)
        
    def train(self):
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        #print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [3]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

In [4]:
with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = Word2VecFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_W2V_new.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(66, 494, 300) (66,) Counter({0.0: 63, 1.0: 3})
Asthma
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
Macro F1 score: 0.9572293066410713 and Micro F1 Score 0.9602564102564102
CAD
(62, 495, 300) (62,) Counter({0.0: 35, 1.0: 27})
CAD
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
Macro F1 score: 0.595573593073593 and Micro F1 Score 0.6142857142857142
CHF
(12, 494, 300) (12,) Counter({1.0: 12})
Macro F1 score: 1 and Micro F1 Score 1
Depression
(6

In [5]:
with open("./results/word-embeddings-features/performance_RF_Glove.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = GloVeFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_Glove.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
Macro F1 score: 0.9745098039215687 and Micro F1 Score 0.976923076923077
CAD
CAD
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
Macro F1 score: 0.7551767676767677 and Micro F1 Score 0.7857142857142857
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300

In [6]:
with open("./results/word-embeddings-features/performance_RF_FastText.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FastTextFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_FastText.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
Asthma
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
Macro F1 score: 0.7820041070041069 and Micro F1 Score 0.7923076923076923
CAD
CAD
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
Macro F1 score: 0.6926984126984126 and Micro F1 Score 0.7142857142857142
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 30

In [7]:
with open("./results/word-embeddings-features/performance_RF_USE.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])

all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = USEFeatureGeneration(train_preprocessed_df, morbidity).matrix_gen()

    X = np.abs(np.average(X, axis=1))
    #X = X.reshape(-1, 1)
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=min(1, len(X)-1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/word-embeddings-features/performance_RF_USE.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma


2023-05-04 19:30:31.342654: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Asthma
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(113, 300) (113,) (13, 300) (13,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
(114, 300) (114,) (12, 300) (12,)
Macro F1 score: 0.98375 and Micro F1 Score 0.9846153846153847
CAD
CAD
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
(63, 300) (63,) (7, 300) (7,)
Macro F1 score: 0.7606746031746031 and Micro F1 Score 0.7714285714285714
CHF
Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300) (104,) (12, 300) (12,)
(104, 300