# Evaluate different models for datasets

In [1]:
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt 
import seaborn as sns

# Configure seaborn's global plot theme once for the notebook:
# "whitegrid" style with fonts scaled by 1.2.
sns.set_theme(style="whitegrid", palette=None, font_scale=1.2)

# A single dataset

In [None]:
# ---------------------------------------------------------------------------
# Define paths
# ---------------------------------------------------------------------------
# NOTE: `vars` and `input` (below) shadow Python builtins. The names are kept
# because they are notebook-level globals that other cells may read.
dataset = "12_uponAdmission"
vars = "boruta"
varPath = f"../../results/featureSelection/{dataset}/{vars}.txt"  # not read in this cell
resultsPath = f"../../results/prediction/featureSelection/{dataset}/{vars}/modelComparison"

# ---------------------------------------------------------------------------
# Read in scores
# ---------------------------------------------------------------------------
# Load every "*score.csv" file in resultsPath and concatenate them once.
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the row order (and therefore the order in
# which models first appear in the plot) differ between runs/machines.
score_frames = [
    pd.read_csv(f"{resultsPath}/{file}")
    for file in sorted(os.listdir(resultsPath))
    if file.endswith("score.csv")
]
if not score_frames:
    # Fail here with a clear message instead of a KeyError further down.
    raise FileNotFoundError(f"No '*score.csv' files found in {resultsPath}")
input = pd.concat(score_frames, axis=0, ignore_index=True)

# ---------------------------------------------------------------------------
# Plot
# ---------------------------------------------------------------------------
# Print the per-column mean of the RandomForest rows. numeric_only=True is
# needed because the frame still contains the string "model" column, which
# cannot be averaged (pandas >= 2.0 raises TypeError without it).
print(input[input["model"] == "RandomForest"].mean(numeric_only=True))

# Long format: one row per (model, score column, value), timing columns removed.
df_plot = input.drop(["fit_time", "score_time"], axis=1).melt("model")

fig, ax = plt.subplots(figsize=(13, 6))
sns.boxplot(data=df_plot, x="variable", y="value", hue="model", ax=ax)
ax.tick_params(axis="x", rotation=90)
# Place the legend outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0)
ax.set_xlabel("")
ax.set_title(dataset)

# ---------------------------------------------------------------------------
# Additional plot: feature importance
# ---------------------------------------------------------------------------
# One bar per row of the RandomForest feature-importance file:
# column "var" on the x axis, column "fi" as the bar height.
df = pd.read_csv(f"{resultsPath}/RandomForest_featureImportance.csv")
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df["var"], df["fi"])
ax.tick_params(axis="x", rotation=90)
ax.set_title(dataset)

# Compare different datasets

In [None]:
# ---------------------------------------------------------------------------
# Define paths
# ---------------------------------------------------------------------------
# NOTE: `vars` and `input` (below) shadow Python builtins. The names are kept
# because they are notebook-level globals that other cells may read.
model = "RandomForest"
vars = "boruta"  # identical for every dataset, so set once outside the loop

# ---------------------------------------------------------------------------
# Read in scores
# ---------------------------------------------------------------------------
# Collect the "*score.csv" frames of every dataset, tag each with its dataset
# label, and concatenate once at the end.
score_frames = []
for dataset in ["12_uponAdmission", "12_hospitalization"]:  # 12_beforeAdmission
    varPath = f"../../results/featureSelection/{dataset}/{vars}.txt"  # not read in this cell
    resultsPath = f"../../results/prediction/featureSelection/{dataset}/{vars}/modelComparison"

    # Sorted so the row order does not depend on os.listdir's arbitrary order.
    for file in sorted(os.listdir(resultsPath)):
        if file.endswith("score.csv"):
            tmp = pd.read_csv(f"{resultsPath}/{file}")
            # Label = everything after the first underscore of the dataset
            # name (e.g. "12_uponAdmission" -> "uponAdmission").
            tmp["dataset"] = dataset.split("_", 1)[1]
            score_frames.append(tmp)

if not score_frames:
    # Fail here with a clear message instead of a KeyError further down.
    raise FileNotFoundError("No '*score.csv' files found for any dataset")
input = pd.concat(score_frames, axis=0, ignore_index=True)

# ---------------------------------------------------------------------------
# Plot
# ---------------------------------------------------------------------------
# Keep only the rows of the selected model, remove the timing and model
# columns, and reshape to long format: one row per (dataset, score column, value).
df_plot = (
    input[input["model"] == model]
    .drop(["fit_time", "score_time", "model"], axis=1)
    .melt("dataset")
)

fig, ax = plt.subplots(figsize=(13, 6))
sns.boxplot(data=df_plot, x="variable", y="value", hue="dataset", ax=ax)
ax.tick_params(axis="x", rotation=90)
# Place the legend outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0)
ax.set_xlabel("")
ax.set_title("all datasets")

In [None]:
# Bare expression: displays the current `df_plot` frame as the cell output
# (whatever the most recently executed plotting cell assigned to it).
df_plot