In [20]:
from sklearn.ensemble import RandomForestClassifier
import os
# The relative './dataset/...' paths used below are resolved against this
# working directory (and presumably the `dataset.*` imports are too).
# Set CS598_DLH_PROJECT_ROOT to point at your own checkout; the fallback is
# the original author's local path, so default behaviour is unchanged.
os.chdir(os.environ.get('CS598_DLH_PROJECT_ROOT', '/Users/renalkakhan/Documents/GitHub/CS598_DLH_Project/'))
from dataset.preprocessing.data_preprocessing import DataPreprocessing
from dataset.preprocessing.tf_idf_all_feature_matrix_gen import TFIDFFeatureGeneration
from dataset.preprocessing.word2vec_embeddings_gen import Word2VecFeatureGeneration
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import csv

In [15]:
class RandomForest:
    """Random-forest classifier bound to one fixed train/evaluation split.

    Wraps ``RandomForestClassifier`` together with the data it is trained and
    scored on, so a caller can run ``train()`` followed by
    ``test_and_evaluate()`` for each split.
    """

    def __init__(self, x_train, y_train, x_test, y_test, n_estimators=100, random_state=42):
        """Store the split and build an unfitted classifier.

        Args:
            x_train: Training features; must expose ``.shape``.
            y_train: Training labels; must expose ``.shape``.
            x_test: Evaluation features; must expose ``.shape``.
            y_test: Evaluation labels; must expose ``.shape``.
            n_estimators: Forwarded to ``RandomForestClassifier``
                (default 100, the previously hard-coded value).
            random_state: Forwarded to ``RandomForestClassifier``
                (default 42, the previously hard-coded value).
        """
        self.rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        # Echo the four shapes so a mismatched split is visible immediately.
        print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape)

    def train(self):
        """Fit the classifier on the stored training split."""
        self.rf.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        """Predict on the stored evaluation split and score the predictions.

        Returns:
            A ``(f1_macro, f1_micro)`` tuple computed with ``f1_score`` using
            ``average='macro'`` and ``average='micro'`` respectively.
        """
        y_pred = self.rf.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        return f1_macro, f1_micro

In [16]:
# Run the project's preprocessing for the 'Asthma' label on the train CSV
# first and then the test CSV, keeping one preprocessed frame per split.
_preprocessed_frames = []
for _csv_path in (
    './dataset/train/train_data_textual.csv',
    './dataset/test/test_data_textual.csv',
):
    dataPreprocessing = DataPreprocessing(_csv_path, 'Asthma')
    _preprocessed_frames.append(dataPreprocessing.preprocess_data())

train_preprocessed_df, test_preprocessed_df = _preprocessed_frames

The shape of df is (76, 3)
The shape of df is (70, 3)


In [18]:
# Generate the TF-IDF representation for the Asthma training frame.
# NOTE(review): X / Y are presumably the feature matrix and label vector and
# `words` the vocabulary - confirm against TFIDFFeatureGeneration.
X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, 'Asthma').tf_idf_matrix_gen()

# 10-fold cross-validation: plain (non-stratified) KFold, shuffled with a
# fixed seed so the folds are reproducible.
skf = KFold(n_splits=10, shuffle=True, random_state=42)

f1_macro_list, f1_micro_list = [], []
for train_idx, val_idx in skf.split(X, Y):
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    Y_train_fold, Y_val_fold = Y[train_idx], Y[val_idx]

    # Fit a fresh random forest on this fold's training part and score it on
    # the held-out part.
    rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
    rf_obj.train()
    f1_macro, f1_micro = rf_obj.test_and_evaluate()

    f1_macro_list += [f1_macro]
    f1_micro_list += [f1_micro]

# Report the mean of each score across the folds.
f1_macro, f1_micro = np.mean(f1_macro_list), np.mean(f1_micro_list)
print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")


['abd' 'abdoman' 'abdominal' 'about' 'above' 'acid' 'activity' 'acute'
 'add' 'addition' 'additional' 'admission' 'admit' 'advair' 'afebrile'
 'affect' 'after' 'air' 'albuterol' 'alcohol' 'alert' 'all' 'allergy'
 'also' 'although' 'am' 'an' 'and' 'anemia' 'antibiotic' 'anxiety' 'any'
 'apnea' 'appear' 'applicable' 'appointment' 'approximate' 'artery' 'as'
 'asa' 'aspiration' 'aspirin' 'associate' 'asthma' 'at' 'atenolol'
 'atorvastatin' 'atrial' 'atrovent' 'attend' 'avoid' 'aware' 'back'
 'baseline' 'batch' 'be' 'become' 'bed' 'before' 'bid' 'bilateral'
 'bilaterally' 'blocker' 'blood' 'both' 'bowel' 'bp' 'breath' 'brief' 'bs'
 'bun' 'but' 'by' 'cad' 'calcium' 'call' 'can' 'cancer' 'cardiac'
 'cardiology' 'cardiomyopathy' 'cardiovascular' 'care' 'cath'
 'catheterization' 'cause' 'cc' 'cell' 'center' 'change' 'check' 'chest'
 'chf' 'chill' 'chloride' 'cholesterol' 'chronic' 'clear' 'clinic' 'cm'
 'coat' 'code' 'colace' 'colitis' 'comment' 'complete' 'complication'
 'concern' 'condition'

In [19]:
# Morbidity class labels evaluated by this notebook, one per line.
morbidities = [
    'Asthma',
    'CAD',
    'CHF',
    'Depression',
    'Diabetes',
    'Gallstones',
    'GERD',
    'Gout',
    'Hypercholesterolemia',
    'Hypertension',
    'Hypertriglyceridemia',
    'OA',
    'Obesity',
    'OSA',
    'PVD',
    'Venous Insufficiency',
]

In [21]:
column_headings = ["Morbidity Class", "RF_Macro F1", "RF_Micro F1"]

# Start a fresh results file that contains only the header row; one row per
# morbidity is appended below as soon as its scores are available.
with open("performances.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)

# Evaluate every morbidity. A stray `break` used to end this loop after the
# first entry, so the CSV only ever received a single row.
for morbidity in morbidities:
    dataPreprocessing = DataPreprocessing('./dataset/train/train_data_textual.csv', morbidity)
    train_preprocessed_df = dataPreprocessing.preprocess_data()

    # NOTE(review): the test split is preprocessed here but not used by the
    # cross-validation below; kept in case a later cell reads it.
    dataPreprocessing = DataPreprocessing('./dataset/test/test_data_textual.csv', morbidity)
    test_preprocessed_df = dataPreprocessing.preprocess_data()

    # Build features for the *current* morbidity (this was hard-coded to
    # 'Asthma', which did not match the frame preprocessed above).
    X, Y, words = TFIDFFeatureGeneration(train_preprocessed_df, morbidity).tf_idf_matrix_gen()

    # 10-fold cross-validation: plain KFold, shuffled with a fixed seed.
    skf = KFold(n_splits=10, shuffle=True, random_state=42)

    f1_macro_list = []
    f1_micro_list = []
    for train_idx, val_idx in skf.split(X, Y):
        X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
        X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

        # Train a random forest on this fold and score it on the held-out part.
        rf_obj = RandomForest(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold)
        rf_obj.train()

        f1_macro, f1_micro = rf_obj.test_and_evaluate()

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)

    # Average each score across the folds.
    f1_macro = np.mean(f1_macro_list)
    f1_micro = np.mean(f1_micro_list)
    print(f"Macro F1 score: {f1_macro} and Micro F1 Score {f1_micro}")

    # Append this morbidity's row immediately so partial results survive an
    # interrupted run.
    with open("performances.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [morbidity, f1_macro, f1_micro]
        writer.writerow(row)

    

The shape of df is (76, 3)
The shape of df is (70, 3)
['abd' 'abdoman' 'abdominal' 'about' 'above' 'acid' 'activity' 'acute'
 'add' 'addition' 'additional' 'admission' 'admit' 'advair' 'afebrile'
 'affect' 'after' 'air' 'albuterol' 'alcohol' 'alert' 'all' 'allergy'
 'also' 'although' 'am' 'an' 'and' 'anemia' 'antibiotic' 'anxiety' 'any'
 'apnea' 'appear' 'applicable' 'appointment' 'approximate' 'artery' 'as'
 'asa' 'aspiration' 'aspirin' 'associate' 'asthma' 'at' 'atenolol'
 'atorvastatin' 'atrial' 'atrovent' 'attend' 'avoid' 'aware' 'back'
 'baseline' 'batch' 'be' 'become' 'bed' 'before' 'bid' 'bilateral'
 'bilaterally' 'blocker' 'blood' 'both' 'bowel' 'bp' 'breath' 'brief' 'bs'
 'bun' 'but' 'by' 'cad' 'calcium' 'call' 'can' 'cancer' 'cardiac'
 'cardiology' 'cardiomyopathy' 'cardiovascular' 'care' 'cath'
 'catheterization' 'cause' 'cc' 'cell' 'center' 'change' 'check' 'chest'
 'chf' 'chill' 'chloride' 'cholesterol' 'chronic' 'clear' 'clinic' 'cm'
 'coat' 'code' 'colace' 'colitis' 'com