In [27]:
from sklearn.ensemble import RandomForestClassifier
import os
os.chdir('/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/')
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
import csv
import pandas as pd

In [28]:
class RandomForest:
    """Random-forest classifier paired with chi-squared top-k feature selection.

    The constructor keeps references to the supplied data splits and prints
    their shapes; the other methods work on the arrays handed to them.
    """

    def __init__(self, x_train, y_train, x_test, y_test, k):
        """Create the underlying classifier and remember the data splits.

        Args:
            x_train: Training feature matrix (must expose ``.shape``).
            y_train: Training labels (must expose ``.shape``).
            x_test: Evaluation feature matrix (must expose ``.shape``).
            y_test: Evaluation labels (must expose ``.shape``).
            k: Number of features kept by ``feature_selection_SelectKBest``.
        """
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        # Report the four split shapes on a single line.
        split_shapes = [
            split.shape
            for split in (self.x_train, self.y_train, self.x_test, self.y_test)
        ]
        print(*split_shapes)

    def feature_selection_SelectKBest(self, x_train, y_train, x_test):
        """Keep the ``self.k`` best features according to the chi-squared score.

        The selector is fitted on the training data only and then applied to
        both matrices.

        Args:
            x_train: Training feature matrix used to fit the selector.
            y_train: Training labels used to fit the selector.
            x_test: Second feature matrix reduced with the same selector.

        Returns:
            Tuple ``(x_train_reduced, x_test_reduced)``.
        """
        selector = SelectKBest(chi2, k=self.k)
        selector.fit(x_train, y_train)
        reduced_train = selector.transform(x_train)
        reduced_test = selector.transform(x_test)
        return reduced_train, reduced_test

    def train(self, x_train_val, y_train_val):
        """Fit the wrapped classifier on the given features and labels."""
        self.rf.fit(x_train_val, y_train_val)

    def test_and_evaluate(self, x_test, y_test):
        """Predict on ``x_test`` and score the predictions against ``y_test``.

        Returns:
            Tuple ``(f1_macro, f1_micro)``.
        """
        predictions = self.rf.predict(x_test)
        macro_score, micro_score = (
            f1_score(y_test, predictions, average=averaging)
            for averaging in ('macro', 'micro')
        )
        return macro_score, micro_score

In [29]:
# Morbidity classes evaluated by this notebook; each name is used as a label
# column when filtering the preprocessed train/test frames.
morbidities = [
    'Asthma',
    'CAD',
    'CHF',
    'Depression',
    'Diabetes',
    'Gallstones',
    'GERD',
    'Gout',
    'Hypercholesterolemia',
    'Hypertension',
    'Hypertriglyceridemia',
    'OA',
    'Obesity',
    'OSA',
    'PVD',
    'Venous-Insufficiency',
]

In [31]:
# Settings for the evaluation loop below.
RESULTS_CSV = "./results/tf-idf-features/performance_RF_SelectKBest_test.csv"
TRAIN_CSV = './dataset/train/train_data_intuitive_preprocessed.csv'
TEST_CSV = './dataset/test/test_data_intuitive_preprocessed.csv'
N_SPLITS = 10          # number of K-fold splits of the training data
K_BEST_FEATURES = 100  # features kept by SelectKBest

column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

# Start a fresh results file that contains only the header row.
with open(RESULTS_CSV, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# Read each preprocessed CSV once instead of once per morbidity; the
# per-morbidity filtering below builds new frames and leaves these untouched.
train_preprocessed_all = pd.read_csv(TRAIN_CSV)
test_preprocessed_all = pd.read_csv(TEST_CSV)

# Plain (non-stratified) K-fold. With an integer random_state every call to
# split() shuffles the same way, so one splitter serves all morbidities.
skf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for morbidity in morbidities:
    print(morbidity)

    # Keep only the rows whose label for this morbidity is exactly 0 or 1.
    train_preprocessed_df = train_preprocessed_all[train_preprocessed_all[morbidity].isin([1.0, 0.0])]
    test_preprocessed_df = test_preprocessed_all[test_preprocessed_all[morbidity].isin([1.0, 0.0])]

    # NOTE(review): the train and test matrices come from two independent
    # TFIDFFeatureGeneration calls. If that helper fits its vocabulary on the
    # frame it is given, the columns of X_test will not line up with X_train
    # and the selector fitted below picks unrelated test columns -- confirm
    # against the helper's implementation.
    X_train, Y_train, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    X_test, Y_test, words_test = TFIDFFeatureGeneration(test_preprocessed_df, morbidity).tf_idf_matrix_gen()

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X_train, Y_train):
        X_train_fold, Y_train_fold = X_train[train_idx], Y_train[train_idx]
        X_val_fold, Y_val_fold = X_train[val_idx], Y_train[val_idx]

        # NOTE(review): the validation fold is only handed to the constructor
        # (which prints its shape). Selection is fitted on the training fold
        # and every fold's model is scored on the full test set.
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, K_BEST_FEATURES)
        X_train_fold, X_test_fold = rf_obj.feature_selection_SelectKBest(X_train_fold, Y_train_fold, X_test)
        rf_obj.train(X_train_fold, Y_train_fold)
        f1_macro, f1_micro = rf_obj.test_and_evaluate(X_test_fold, Y_test)

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    # Average the per-fold scores for this morbidity.
    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    # Append immediately so finished morbidities survive a later failure.
    with open(RESULTS_CSV, "a", newline="") as file:
        writer = csv.writer(file)
        writer.writerow([morbidity, f1_macro, f1_micro])

Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
(471, 600) (471,) Counter({0.0: 403, 1.0: 68})
(514, 600) (514,) (58, 600) (58,)
(514, 600) (514,) (58, 600) (58,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
(515, 600) (515,) (57, 600) (57,)
Macro F1 score: 0.4859385710869308 and Micro F1 Score 0.6161358811040339
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
(457, 600) (457,) Counter({1.0: 272, 0.0: 185})
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(493, 600) (493,) (55, 600) (55,)
(494, 600) (494,) (54, 600) (54,)
(494, 600) (494,) (54, 600) (54,)
Macro F1 score: 0.541691035035734 and Micro F1