In [1]:

import os
os.chdir(os.path.join(os.getcwd(), '..','..', '..'))

In [2]:
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter
from weka.attribute_selection import ASEvaluation, AttributeSelection
from weka.classifiers import Classifier, Evaluation
import csv
import pandas as pd
import collections
from imblearn.over_sampling import SMOTE
jvm.start()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/Users/ritwikdeshpande/DLH/Project/CS598_DLH_Project/venv/lib/python3.7/site-packages/javabridge/jars/rhino-1.7R4.jar', '/Users/ritwikdeshpande/DLH/Project/CS598_DLH_Project/venv/lib/python3.7/site-packages/javabridge/jars/runnablequeue.jar', '/Users/ritwikdeshpande/DLH/Project/CS598_DLH_Project/venv/lib/python3.7/site-packages/javabridge/jars/cpython.jar', '/Users/ritwikdeshpande/DLH/Project/CS598_DLH_Project/venv/lib/python3.7/site-packages/weka/lib/python-weka-wrapper.jar', '/Users/ritwikdeshpande/DLH/Project/CS598_DLH_Project/venv/lib/python3.7/site-packages/weka/lib/weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [1]:
# !ls -lrt

In [7]:
class RandomForest:
    def __init__(self, x_train, y_train, x_test, y_test, k):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.k = k
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        # print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)

    def feature_selection_SelectKBest(self):
        k_best = SelectKBest(chi2, k=self.k)
        k_best.fit(self.x_train, self.y_train)
        self.x_train = k_best.transform(self.x_train)
        self.x_test = k_best.transform(self.x_test)

    def feature_selection_ExtraTreesClassifier(self):
        clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
        clf.fit(self.x_train, self.y_train)
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        self.x_train = self.x_train[:, indices[:self.k]]
        self.x_test = self.x_test[:, indices[:self.k]]

    def feature_selection_InfoGainAttributeEval(self, morbidity):
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train_data = loader.load_file(f"./dataset/train/train_{morbidity}_tfidf.arff")
        train_data.class_is_last()

        # Initialize attribute selection
        eval = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
        search = AttributeSelection()
        search.evaluator = eval
        search.select_attributes(train_data)
        selected_attributes = search.selected_attributes
        filtered_attributes = np.delete(selected_attributes, [-1])
        # print("Selected attributes:", type(filtered_attributes), filtered_attributes.shape)

        # Apply selected attributes to the training and testing sets
        self.x_train = self.x_train[:, filtered_attributes]
        self.x_test = self.x_test[:, filtered_attributes]
        
    def train(self):
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        #print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")
        return f1_macro, f1_micro

In [5]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous-Insufficiency']

column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

#### All Features

In [4]:
with open("./results/tf-idf-features/performance_RF_AllFeatures.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled

        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 0)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
    
    else:
        f1_macro = 1
        f1_micro = 1

    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/tf-idf-features/performance_RF_AllFeatures.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_RF_AllFeatures.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Macro F1 score: 0.6093595135837382 and Micro F1 Score 0.8985783424077434
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
Macro F1 score: 0.8976928072512701 and Micro F1 Score 0.9051178451178451
CHF
(243, 600) (243,) Counter({1.0: 243})
Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(582, 600) (582,) Counter({0.0: 460, 1.0: 122})
Macro F1 score: 0.5212996220804968 and Micro F1 Score 0.8092343658679135
Diabetes
(567, 600) (567,) Counter({1.0: 396, 0.0: 171})
Macro F1 score: 0.8758962771927346 and Micro F1 Score 0.9014724310776941
Gallstones
(593, 600) (593,) Counter({0.0: 506, 1.0: 87})
Macro F1 score: 0.4599902140879871 and Micro F1 Score 0.8532485875706215
GERD
(487, 600) (487,) Counter({0.0: 372, 1.0: 115})
Macro F1 score: 0.43161818461332835 and Micro F1 Score 0.7617772108843537
Gout
(596, 600) (596,) Counter({0.0: 518, 1.0: 78})
Macro F1 score: 0.4647957966912661 and Micro F1 Score 0.8691242937853108
Hypercholesterolem

#### SelectKBest Feature Selection (K=100)

In [5]:
with open("./results/tf-idf-features/performance_RF_SelectKBest.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled

        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.feature_selection_SelectKBest()
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)

    else:
        f1_macro = 1
        f1_micro = 1

    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/tf-idf-features/performance_RF_SelectKBest.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_RF_SelectKBest.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Macro F1 score: 0.9180571023713254 and Micro F1 Score 0.9684513006654569
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
Macro F1 score: 0.9258831565551706 and Micro F1 Score 0.9305387205387206
CHF
(243, 600) (243,) Counter({1.0: 243})
Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(582, 600) (582,) Counter({0.0: 460, 1.0: 122})
Macro F1 score: 0.7507626943252863 and Micro F1 Score 0.8728521332554063
Diabetes
(567, 600) (567,) Counter({1.0: 396, 0.0: 171})
Macro F1 score: 0.9308732741707446 and Micro F1 Score 0.9436090225563909
Gallstones
(593, 600) (593,) Counter({0.0: 506, 1.0: 87})
Macro F1 score: 0.4594601152560015 and Micro F1 Score 0.8515536723163841
GERD
(487, 600) (487,) Counter({0.0: 372, 1.0: 115})
Macro F1 score: 0.4661286349771549 and Micro F1 Score 0.7537840136054421
Gout
(596, 600) (596,) Counter({0.0: 518, 1.0: 78})
Macro F1 score: 0.4633199964539981 and Micro F1 Score 0.8640677966101695
Hypercholesterolemi

#### ExtraTreesClassifier Feature Selection

In [8]:
with open("./results/tf-idf-features/performance_RF_ExtraTreesClassifier.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled

        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 100)
            rf_obj.feature_selection_ExtraTreesClassifier()
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)

    else:
        f1_macro = 1
        f1_micro = 1

    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/tf-idf-features/performance_RF_ExtraTreesClassifier.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_RF_ExtraTreesClassifier.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Macro F1 score: 0.950948446167831 and Micro F1 Score 0.9807320024198427
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
Macro F1 score: 0.9276951579831151 and Micro F1 Score 0.9323905723905723
CHF
(243, 600) (243,) Counter({1.0: 243})
Macro F1 score: 1.0 and Micro F1 Score 1.0
Depression
(582, 600) (582,) Counter({0.0: 460, 1.0: 122})
Macro F1 score: 0.6453118185483524 and Micro F1 Score 0.8385447106954997
Diabetes
(567, 600) (567,) Counter({1.0: 396, 0.0: 171})
Macro F1 score: 0.9198170340417706 and Micro F1 Score 0.9348057644110277
Gallstones
(593, 600) (593,) Counter({0.0: 506, 1.0: 87})
Macro F1 score: 0.4599902140879871 and Micro F1 Score 0.8532485875706215
GERD
(487, 600) (487,) Counter({0.0: 372, 1.0: 115})
Macro F1 score: 0.4801140389804693 and Micro F1 Score 0.7659438775510204
Gout
(596, 600) (596,) Counter({0.0: 518, 1.0: 78})
Macro F1 score: 0.48376542086497115 and Micro F1 Score 0.8657344632768362
Hypercholesterolemi

#### Info Gain

In [9]:
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

with open("./results/tf-idf-features/performance_RF_InfoGain.csv", "w", newline="") as file:
    writer = csv.writer(file)

    # write the RF heading and the subheadings for Micro F1 and Macro F1
    writer.writerow([column_headings[0], column_headings[1], column_headings[2]])
    
all_f1_macro_scores = []
all_f1_micro_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_data_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()
    
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro = 1
        f1_micro = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list = []
        f1_micro_list = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training RF using TF-IDF Representation
            rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5)
            rf_obj.feature_selection_InfoGainAttributeEval(morbidity)
            rf_obj.train()

            f1_macro, f1_micro = rf_obj.test_and_evaluate()

            f1_macro_list.append(f1_macro)
            f1_micro_list.append(f1_micro)

        f1_macro = np.mean(f1_macro_list)
        f1_micro = np.mean(f1_micro_list)
        
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro, f1_micro]
    all_f1_macro_scores.append(f1_macro)
    all_f1_micro_scores.append(f1_micro)

    with open("./results/tf-idf-features/performance_RF_InfoGain.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf-features/performance_RF_InfoGain.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Overall-Average"]
    row.extend([sum(all_f1_macro_scores)/len(all_f1_macro_scores),  sum(all_f1_micro_scores)/len(all_f1_micro_scores) ])
    writer.writerow(row)

Asthma
(572, 600) (572,) Counter({0.0: 502, 1.0: 70})
Macro F1 score: 0.9556492779857066 and Micro F1 Score 0.9824561403508772
CAD
(548, 600) (548,) Counter({1.0: 325, 0.0: 223})
Macro F1 score: 0.9125309344955019 and Micro F1 Score 0.9178451178451178
CHF
(243, 600) (243,) Counter({1.0: 243})
Macro F1 score: 1 and Micro F1 Score 1
Depression
(582, 600) (582,) Counter({0.0: 460, 1.0: 122})
Macro F1 score: 0.7647833713075274 and Micro F1 Score 0.8711864406779661
Diabetes
(567, 600) (567,) Counter({1.0: 396, 0.0: 171})
Macro F1 score: 0.9279935681614189 and Micro F1 Score 0.9418233082706765
Gallstones
(593, 600) (593,) Counter({0.0: 506, 1.0: 87})
Macro F1 score: 0.5097419964301964 and Micro F1 Score 0.8127683615819208
GERD
(487, 600) (487,) Counter({0.0: 372, 1.0: 115})
Macro F1 score: 0.5954178230300367 and Micro F1 Score 0.7494897959183673
Gout
(596, 600) (596,) Counter({0.0: 518, 1.0: 78})
Macro F1 score: 0.5018713921768855 and Micro F1 Score 0.8607344632768361
Hypercholesterolemia
(5