In [2]:
import os

os.chdir('/Users/shreygupta/Documents/Classes/CS598DLH/')
os.getcwd()

'/Users/shreygupta/Documents/Classes/CS598DLH'

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from feature_generation import FeatureGeneration
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
import csv
import collections
import numpy as np
import pandas as pd
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter
from weka.attribute_selection import ASEvaluation, AttributeSelection
from weka.classifiers import Classifier, Evaluation
from imblearn.over_sampling import SMOTE

# jvm.start()

In [6]:
class KNN:
    def __init__(self, x_train, y_train, x_test, y_test, n, k):
        self.knn = KNeighborsClassifier(n_neighbors=n)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.k = k

    def feature_selection_SelectKBest(self):
        k_best = SelectKBest(chi2, k=self.k)
        k_best.fit(self.x_train, self.y_train)
        self.x_train = k_best.transform(self.x_train)
        self.x_test = k_best.transform(self.x_test)
        
    def feature_selection_ExtraTreesClassifier(self):
        clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
        clf.fit(self.x_train, self.y_train)
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        self.x_train = self.x_train[:, indices[:self.k]]
        self.x_test = self.x_test[:, indices[:self.k]]
    
    def feature_selection_InfoGainAttributeEval(self, morbidity):
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train_data = loader.load_file(f"./dataset/train/train_{morbidity}_tfidf.arff")
        train_data.class_is_last()

        # Initialize attribute selection
        eval = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
        search = AttributeSelection()
        search.evaluator = eval
        search.select_attributes(train_data)
        selected_attributes = search.selected_attributes
        filtered_attributes = np.delete(selected_attributes, [-1])
        # print("Selected attributes:", type(filtered_attributes), filtered_attributes.shape)

        # Apply selected attributes to the training and testing sets
        self.x_train = self.x_train[:, filtered_attributes]
        self.x_test = self.x_test[:, filtered_attributes]

    def train(self):
        self.knn.fit(self.x_train, self.y_train)

    def test_and_evaluate(self):
        y_pred = self.knn.predict(self.x_test)
        f1_macro = f1_score(self.y_test, y_pred, average='macro')
        f1_micro = f1_score(self.y_test, y_pred, average='micro')
        return f1_macro, f1_micro

In [7]:
morbidities = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'Obesity', 'OSA', 'PVD', 'Venous_Insufficiency']
column_headings = ["Morbidity Class", "KNN1_Macro F1", "KNN1_Micro F1", "KNN5_Macro F1", "KNN5_Micro F1"]

In [8]:
with open("./results/tf-idf/performance_KNN_AllFeatures.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).tf_idf()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
    
        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list1 = []
        f1_micro_list1 = []
        f1_macro_list5 = []
        f1_micro_list5 = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training KNN using TF-IDF Representation
            knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1, 100)
            knn1_obj.train()

            f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

            f1_macro_list1.append(f1_macro1)
            f1_micro_list1.append(f1_micro1)

            knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5, 100)
            knn5_obj.train()

            f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

            f1_macro_list5.append(f1_macro5)
            f1_micro_list5.append(f1_micro5)

        f1_macro1 = np.mean(f1_macro_list1)
        f1_micro1 = np.mean(f1_micro_list1)
        f1_macro5 = np.mean(f1_macro_list5)
        f1_micro5 = np.mean(f1_micro_list5)
    
    else:
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
        
    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/tf-idf/performance_KNN_AllFeatures.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf/performance_KNN_AllFeatures.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.666596314935899 and Micro F1 Score 0.7002574257425742
For n=5, Macro F1 score: 0.5658725563187745 and Micro F1 Score 0.6285643564356437
CAD
CAD
For n=1, Macro F1 score: 0.6783371879278528 and Micro F1 Score 0.6984615384615385
For n=5, Macro F1 score: 0.675525028273199 and Micro F1 Score 0.6907692307692308
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.6578570174318714 and Micro F1 Score 0.6891304347826088
For n=5, Macro F1 score: 0.5550205629698999 and Micro F1 Score 0.6173913043478261
Diabetes
Diabetes
For n=1, Macro F1 score: 0.610365511084751 and Micro F1 Score 0.6527689873417721
For n=5, Macro F1 score: 0.5111652043293369 and Micro F1 Score 0.5870727848101265
Gallstones
Gallstones
For n=1, Macro F1 score: 0.6212062738762162 and Micro F1 Score 0.6639487478159579
For n=5, Macro F1 score: 0.501312232317052 and Micro F1 Score 0.5868472141331781


In [9]:
with open("./results/tf-idf/performance_KNN_SelectKBest.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).tf_idf()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled

        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list1 = []
        f1_micro_list1 = []
        f1_macro_list5 = []
        f1_micro_list5 = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training KNN using TF-IDF Representation
            knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1, 100)
            knn1_obj.feature_selection_SelectKBest()
            knn1_obj.train()

            f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

            f1_macro_list1.append(f1_macro1)
            f1_micro_list1.append(f1_micro1)

            knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5, 100)
            knn5_obj.feature_selection_SelectKBest()
            knn5_obj.train()

            f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

            f1_macro_list5.append(f1_macro5)
            f1_micro_list5.append(f1_micro5)

        f1_macro1 = np.mean(f1_macro_list1)
        f1_micro1 = np.mean(f1_micro_list1)
        f1_macro5 = np.mean(f1_macro_list5)
        f1_micro5 = np.mean(f1_micro_list5)
    else:
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    
    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/tf-idf/performance_KNN_SelectKBest.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf/performance_KNN_SelectKBest.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.890493280004576 and Micro F1 Score 0.8924851485148514
For n=5, Macro F1 score: 0.796041498811422 and Micro F1 Score 0.8047821782178218
CAD
CAD
For n=1, Macro F1 score: 0.7403619083195547 and Micro F1 Score 0.7461538461538462
For n=5, Macro F1 score: 0.7127469581507913 and Micro F1 Score 0.7215384615384617
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.8272582613330766 and Micro F1 Score 0.8315217391304348
For n=5, Macro F1 score: 0.7322497195056643 and Micro F1 Score 0.7467391304347826
Diabetes
Diabetes
For n=1, Macro F1 score: 0.7646317042099939 and Micro F1 Score 0.7740981012658229
For n=5, Macro F1 score: 0.7023427726053404 and Micro F1 Score 0.7210759493670886
Gallstones
Gallstones
For n=1, Macro F1 score: 0.7991998740756393 and Micro F1 Score 0.8091729761211415
For n=5, Macro F1 score: 0.7116011539184095 and Micro F1 Score 0.734090467870316

In [10]:
with open("./results/tf-idf/performance_KNN_ExtraTreesClassifier.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).tf_idf()

    if len(collections.Counter(list(Y)).keys()) >=2:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled

        # add KFold cross validation
        skf = KFold(n_splits=10, shuffle=True, random_state=42)

        f1_macro_list1 = []
        f1_micro_list1 = []
        f1_macro_list5 = []
        f1_micro_list5 = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training KNN using TF-IDF Representation
            knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1, 100)
            knn1_obj.feature_selection_ExtraTreesClassifier()
            knn1_obj.train()

            f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

            f1_macro_list1.append(f1_macro1)
            f1_micro_list1.append(f1_micro1)

            knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5, 100)
            knn5_obj.feature_selection_ExtraTreesClassifier()
            knn5_obj.train()

            f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

            f1_macro_list5.append(f1_macro5)
            f1_micro_list5.append(f1_micro5)

        f1_macro1 = np.mean(f1_macro_list1)
        f1_micro1 = np.mean(f1_micro_list1)
        f1_macro5 = np.mean(f1_macro_list5)
        f1_micro5 = np.mean(f1_micro_list5)
    else:
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
        
    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/tf-idf/performance_KNN_ExtraTreesClassifier.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf/performance_KNN_ExtraTreesClassifier.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
Asthma
For n=1, Macro F1 score: 0.8988019940795289 and Micro F1 Score 0.9004455445544555
For n=5, Macro F1 score: 0.8412949340349766 and Micro F1 Score 0.8456831683168315
CAD
CAD
For n=1, Macro F1 score: 0.7523888550328193 and Micro F1 Score 0.76
For n=5, Macro F1 score: 0.7251629371948626 and Micro F1 Score 0.7353846153846153
CHF
For n=1, Macro F1 score: 1 and Micro F1 Score 1
For n=5, Macro F1 score: 1 and Micro F1 Score 1
Depression
Depression
For n=1, Macro F1 score: 0.8211046606291376 and Micro F1 Score 0.8271739130434783
For n=5, Macro F1 score: 0.730823992771722 and Micro F1 Score 0.7445652173913044
Diabetes
Diabetes
For n=1, Macro F1 score: 0.7841690085515248 and Micro F1 Score 0.7916930379746836
For n=5, Macro F1 score: 0.7283298308598628 and Micro F1 Score 0.7437816455696202
Gallstones
Gallstones
For n=1, Macro F1 score: 0.76207675286301 and Micro F1 Score 0.7776451174529218
For n=5, Macro F1 score: 0.6510881906267352 and Micro F1 Score 0.6876625897883906
GERD
GERD
For

In [11]:
with open("./results/tf-idf/performance_KNN_InfoGain.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([column_headings[0], column_headings[1], column_headings[2], column_headings[3], column_headings[4]])

all_f1_macro1_scores = []
all_f1_micro1_scores = []

all_f1_macro5_scores = []
all_f1_micro5_scores = []

for morbidity in morbidities:
    print(morbidity)
    train_preprocessed_df = pd.read_csv('./dataset/train/train_intuitive_preprocessed.csv')
    train_preprocessed_df = train_preprocessed_df[train_preprocessed_df[morbidity].isin([1.0, 0.0])]

    X, Y, words = FeatureGeneration(train_preprocessed_df, morbidity).tf_idf()

    
    # add KFold cross validation
    skf = KFold(n_splits=10, shuffle=True, random_state=42)
    if len(collections.Counter(list(Y)).keys()) < 2:
        f1_macro1 = 1
        f1_micro1 = 1
        f1_macro5 = 1
        f1_micro5 = 1
    else:
        print(morbidity)
        smote = SMOTE(random_state=42,k_neighbors=2)
        X_train_resampled, y_train_resampled = smote.fit_resample(X, Y)
        X, Y =  X_train_resampled, y_train_resampled
        f1_macro_list1 = []
        f1_micro_list1 = []
        f1_macro_list5 = []
        f1_micro_list5 = []
        for train_idx, val_idx in skf.split(X, Y):
            X_train_fold, Y_train_fold = X[train_idx], Y[train_idx]
            X_val_fold, Y_val_fold = X[val_idx], Y[val_idx]

            # Training KNN using TF-IDF Representation
            knn1_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 1, 100)
            knn1_obj.feature_selection_InfoGainAttributeEval(morbidity)
            knn1_obj.train()

            f1_macro1, f1_micro1 = knn1_obj.test_and_evaluate()

            f1_macro_list1.append(f1_macro1)
            f1_micro_list1.append(f1_micro1)

            knn5_obj = KNN(X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, 5, 100)
            knn5_obj.feature_selection_InfoGainAttributeEval(morbidity)
            knn5_obj.train()

            f1_macro5, f1_micro5 = knn5_obj.test_and_evaluate()

            f1_macro_list5.append(f1_macro5)
            f1_micro_list5.append(f1_micro5)

        f1_macro1 = np.mean(f1_macro_list1)
        f1_micro1 = np.mean(f1_micro_list1)
        f1_macro5 = np.mean(f1_macro_list5)
        f1_micro5 = np.mean(f1_micro_list5)

    print(f"For n=1, Macro F1 score: {f1_macro1} and Micro F1 Score {f1_micro1}")
    print(f"For n=5, Macro F1 score: {f1_macro5} and Micro F1 Score {f1_micro5}")

    row_heading = morbidity

    # data to be written to the CSV file
    data = [f1_macro1, f1_micro1, f1_macro5, f1_micro5]
    all_f1_macro1_scores.append(f1_macro1)
    all_f1_micro1_scores.append(f1_micro1)

    all_f1_macro5_scores.append(f1_macro5)
    all_f1_micro5_scores.append(f1_micro5)


    with open("./results/tf-idf/performance_KNN_InfoGain.csv", "a", newline="") as file:
        writer = csv.writer(file)
        row = [row_heading]
        row.extend(data)
        writer.writerow(row)

with open("./results/tf-idf/performance_KNN_InfoGain.csv", "a", newline="") as file:
    writer = csv.writer(file)
    row = ["Average"]
    row.extend([
        sum(all_f1_macro1_scores)/len(all_f1_macro1_scores),  sum(all_f1_micro1_scores)/len(all_f1_micro1_scores),
        sum(all_f1_macro5_scores)/len(all_f1_macro5_scores),  sum(all_f1_micro5_scores)/len(all_f1_micro5_scores) 
                ])
    writer.writerow(row)

Asthma
Asthma


AttributeError: 'NoneType' object has no attribute 'find_class'