In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Length of Stay Regression Visualizations

In [None]:
# 3x3 grid of bar charts for the length-of-stay models trained on a transformed
# target: one row per metric (R2, MAE, MSE), one column per transformation.
model_scores_transformed_df = pd.read_csv('saved_scores_to_plot/los_transformed_scores.csv')

# One frame per target transformation; each one feeds a column of the grid.
log_transform_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Log Transform']
sqrt_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Square Root']
box_cox_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Box-Cox']

color_values = ['skyblue', '#5CED73']

# The x tick labels are hard-coded, so they assume every frame lists its rows
# in this model order -- TODO confirm against the CSV.
model_tick_labels = ['Ridge\nRegression', 'GradientBoosted\nRegression']

# Grid rows: (score column, y-axis label, y-axis limits).
metric_rows = [
    ('avg_r2_score', 'Average R2 Score', (0, 0.5)),
    ('avg_mae_score', 'Average MAE Score', (0, 5)),
    ('avg_mse_score', 'Average MSE Score', (0, 100)),
]

# Grid columns: (frame, panel titles in the same order as metric_rows).
transformation_columns = [
    (log_transform_df, ['Log Transform Average R2 Score',
                        'Log Transformation Average MAE Score',
                        'Log Transformation Average MSE Score']),
    (sqrt_df, ['Square Root Transform Average R2 Score',
               'Square Root Transformation Average MAE Score',
               'Square Root Transformation Average MSE Score']),
    (box_cox_df, ['Box Cox Transformation Average R2 Score',
                  'Box Cox Transformation Average MAE Score',
                  'Box Cox Transformation Average MSE Score']),
]

fig, axes = plt.subplots(3, 3, figsize=(20, 20), sharey=False)
fig.subplots_adjust(hspace=0.95, wspace=0.50)
fig.suptitle('Model Performance by Transformation Type\n(Length of Stay)', fontsize=17, fontweight='bold')

for col_idx, (transformed_scores, panel_titles) in enumerate(transformation_columns):
    for row_idx, (score_column, y_label, y_limits) in enumerate(metric_rows):
        ax = axes[row_idx, col_idx]
        # NOTE(review): absolute values are plotted, presumably because the
        # scores were saved negated -- confirm. For R2 this would also hide a
        # genuinely negative score.
        sns.barplot(ax=ax, x=transformed_scores['Model'],
                    y=np.absolute(transformed_scores[score_column]), palette=color_values)
        ax.set_title(panel_titles[row_idx], fontweight='bold', fontsize=16)
        ax.set_ylabel(y_label, fontweight='bold', fontsize=16)
        ax.set_xlabel('Model Type', fontweight='bold', fontsize=16)
        ax.set_xticklabels(labels=model_tick_labels, fontsize=16)
        ax.set_ylim(*y_limits)
        # Label every container: depending on the seaborn version the bars may
        # be split across several containers rather than grouped in one.
        for container in ax.containers:
            ax.bar_label(container, fmt='%.2f', padding=2, fontsize=16)
        # Resize the y tick labels without freezing their text, so they keep
        # following the tick locator.
        ax.tick_params(axis='y', labelsize=16)

plt.tight_layout(rect=[0, 0, 1, 0.988])

In [None]:
# Bar charts of average R2 / MAE / MSE for each length-of-stay regression model.
scores_df = pd.read_csv('saved_scores_to_plot/los_regression_scores_df_final.csv')

# Clamp scores for display: negative R2 becomes 0, and MAE/MSE values below
# -200 become 0 (presumably to suppress a diverged model -- TODO confirm).
# .loc is used instead of chained indexing (df[col][mask] = ...), which may
# write to a temporary copy and leave scores_df unchanged.
scores_df.loc[scores_df['avg_r2_score'] < 0, 'avg_r2_score'] = 0
scores_df.loc[scores_df['avg_mae_score'] < -200, 'avg_mae_score'] = 0
scores_df.loc[scores_df['avg_mse_score'] < -200, 'avg_mse_score'] = 0

# The x tick labels are hard-coded, so they assume the CSV lists the models in
# exactly this order -- TODO confirm.
model_tick_labels = ['Linear\nRegression', 'Ridge\nRegression', 'Lasso\nRegression',
                     'DecisionTree\nRegression', 'GradientBoosted\nRegression',
                     'RandomForest\nRegression']

# One panel per metric: (score column, title / y-axis label, bar colour,
# whether to plot the absolute value).
metric_panels = [
    ('avg_r2_score', 'Average R2 Score', '#5CED73', False),
    ('avg_mae_score', 'Average MAE Score', 'skyblue', True),
    ('avg_mse_score', 'Average MSE Score', 'orangered', True),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 8), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Evaluation Scores Length of Stay Prediction For Regression Models', fontsize=17, fontweight='bold')

for ax, (score_column, metric_label, bar_color, use_absolute) in zip(axes, metric_panels):
    # NOTE(review): MAE/MSE are plotted as absolute values, presumably because
    # they were saved negated -- confirm.
    heights = np.abs(scores_df[score_column]) if use_absolute else scores_df[score_column]
    # A single colour is passed as `color`; a one-element palette relies on
    # seaborn cycling it across the categories.
    sns.barplot(ax=ax, x=scores_df['Model'], y=heights, color=bar_color)
    ax.set_title(metric_label, fontweight='bold', fontsize=16)
    ax.set_ylabel(metric_label, fontweight='bold', fontsize=14)
    ax.set_xlabel('Model Type', fontweight='bold', fontsize=14)
    ax.set_xticklabels(labels=model_tick_labels, rotation=90, fontsize=14)
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
    # Resize the y tick labels without freezing their text.
    ax.tick_params(axis='y', labelsize=14)

Total Cost Regression

In [None]:
# Single bar chart of the average R2 score for each total-cost regression model.
scores_df = pd.read_csv('saved_scores_to_plot/cost_regression_scores_final.csv')

fig, axes = plt.subplots(1, 1, figsize=(18, 8), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Evaluation Scores Total Cost Prediction For Regression Models', fontsize=17, fontweight='bold')

# The x tick labels are hard-coded, so they assume the CSV lists the models in
# exactly this order -- TODO confirm.
model_tick_labels = ['Ridge\nRegression', 'Lasso\nRegression',
                     'GradientBoosted\nRegression', 'RandomForest\nRegression']

# Draw on the explicit Axes rather than relying on pyplot's "current axes".
barplot = sns.barplot(ax=axes, x=scores_df['Model'], y=scores_df['avg_r2_score'], color='#5CED73')
axes.set_ylabel('Average R2 Score', fontweight='bold', fontsize=14)
axes.set_xlabel('Model Type', fontweight='bold', fontsize=14)
axes.set_xticks([0, 1, 2, 3])
axes.set_xticklabels(model_tick_labels, size=14)
# Label every container: depending on the seaborn version the bars may be
# split across several containers rather than grouped in one.
for container in barplot.containers:
    axes.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
axes.tick_params(axis='y', labelsize=14)

In [None]:
# Row of three bar charts: average R2 of the total-cost models for each target
# transformation (log, square root, Box-Cox).
model_scores_transformed_df = pd.read_csv('saved_scores_to_plot/cost_transformed_scores.csv')

# One frame per target transformation; each one feeds a panel.
log_transform_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Log Transform']
sqrt_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Square Root']
box_cox_df = model_scores_transformed_df[model_scores_transformed_df['Transformation'] == 'Box-Cox']

color_values = ['skyblue', '#5CED73']

# The x tick labels are hard-coded, so they assume every frame lists its rows
# in this model order -- TODO confirm against the CSV.
model_tick_labels = ['Ridge\nRegression', 'GradientBoosted\nRegression']

# (frame, panel title) for each panel, left to right.
transformation_panels = [
    (log_transform_df, 'Log Transform Average R2 Score'),
    (sqrt_df, 'Square Root Transform Average R2 Score'),
    (box_cox_df, 'Box Cox Transformation Average R2 Score'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=False)
fig.subplots_adjust(hspace=0.95, wspace=0.50)
fig.suptitle('Model Performance by Transformation Type\n(Total Cost)', fontsize=17, fontweight='bold')

for ax, (transformed_scores, panel_title) in zip(axes, transformation_panels):
    # NOTE(review): the absolute value of R2 is plotted, which would show a
    # negative R2 as positive -- confirm this is intended.
    sns.barplot(ax=ax, x=transformed_scores['Model'],
                y=np.absolute(transformed_scores['avg_r2_score']), palette=color_values)
    ax.set_title(panel_title, fontweight='bold', fontsize=16)
    ax.set_ylabel('Average R2 Score', fontweight='bold', fontsize=16)
    ax.set_xlabel('Model Type', fontweight='bold', fontsize=16)
    ax.set_xticklabels(labels=model_tick_labels, fontsize=16)
    ax.set_ylim(0, 0.5)
    # Label every container: depending on the seaborn version the bars may be
    # split across several containers rather than grouped in one.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=16)
    # Resize the y tick labels without freezing their text.
    ax.tick_params(axis='y', labelsize=16)

plt.tight_layout(rect=[0, 0, 1, 0.988])



Length of Stay Classification

In [None]:
# 2x3 grid of bar charts: best search score of each classifier for length of
# stay, one panel for all patients and one per DRG subpopulation.
import seaborn as sns
# Maps subpopulation codes (stored as strings) to display names. It is not used
# in this cell, but a later cell reads this notebook-level name.
dict_to_replace = {'194.0': 'Heart Failure Patients', '140.0': 'COPD Patients', '750.0': 'Schizophrenia Patients', '463.0': 'Kidney/UTI Patients', '302.0': 'Knee Joint Replacement Patients'}

randomized_search_results = pd.read_csv('saved_scores_to_plot/randomized_search_results_final.csv')

fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('F1 - Macro Scores For RandomForestClassifier vs. LogisticRegression\n(Length of Stay)', fontsize=17, weight='bold')

# NOTE(review): 'subpopluation' is the column name this code has always read;
# presumably the CSV header is spelled that way -- confirm before renaming.
all_patients = randomized_search_results[randomized_search_results['subpopluation'] == 'all']
heart_patients = randomized_search_results[randomized_search_results['subpopluation'] == '194.0']
copd_patients = randomized_search_results[randomized_search_results['subpopluation'] == '140.0']
schizophrenia_patients = randomized_search_results[randomized_search_results['subpopluation'] == '750.0']
kidney_patients = randomized_search_results[randomized_search_results['subpopluation'] == '463.0']
knee_rep_patients = randomized_search_results[randomized_search_results['subpopluation'] == '302.0']

# The x tick labels are hard-coded, so they assume each subset lists its rows
# in this model order -- TODO confirm against the CSV.
model_tick_labels = ['Random Forest\nClassifier', 'Logistic\nRegression']

# (subset, panel title) in row-major grid order.
subpopulation_panels = [
    (all_patients, 'All Patients'),
    (heart_patients, 'Patients with DRG Code 194\n(Heart Failure)'),
    (copd_patients, 'Patients with DRG Code 140\n(COPD)'),
    (schizophrenia_patients, 'Patients with DRG Code 750\n(Schizophrenia)'),
    (kidney_patients, 'Patients with DRG Code 463\n(Kidney/UTI)'),
    (knee_rep_patients, 'Patients with DRG Code 302\n(Knee Joint Replacement)'),
]

for ax, (subpop_scores, panel_title) in zip(axes.flat, subpopulation_panels):
    sns.barplot(ax=ax, x=subpop_scores['model'], y=subpop_scores['best_scores'], palette=['#5CED73', 'skyblue'])
    ax.set_xlabel('Model Type', fontweight='bold', fontsize=14)
    ax.set_ylabel('F1 Score', fontweight='bold', fontsize=14)
    # Label every container: depending on the seaborn version the bars may be
    # split across several containers rather than grouped in one.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
    ax.set_xticklabels(labels=model_tick_labels, fontsize=14)
    ax.set_title(panel_title, fontweight='bold', fontsize=16)
    # Only resize the y tick labels. Freezing their text with set_yticklabels
    # would leave stale labels once the shared y-limits change below.
    ax.tick_params(axis='y', labelsize=14)

# The y axis is shared, so this sets the limits of every panel.
axes[1, 2].set_ylim(0, 0.5)

plt.tight_layout(rect=[0, 0, 1, 0.988])

In [None]:
# Three stacked bar charts comparing the random forest scores (F1 macro, F1
# weighted, accuracy) across patient populations for length of stay.
final_results_df = pd.read_csv('saved_scores_to_plot/final_rf_model_results.csv')
# dict_to_replace must already be defined by an earlier cell.
final_results_df = final_results_df.replace({'Population': dict_to_replace})

# Keep the first six rows only -- presumably the trained model's row for each
# of the six populations; TODO confirm the CSV row order.
final_results_df = final_results_df[0:6]

# Highlight the whole-population bar; subpopulations share one colour.
colors = ['orangered' if (x == 'All') else 'skyblue' for x in final_results_df['Population'].values]

# The x tick labels are hard-coded, so they assume the six rows appear in
# exactly this population order -- TODO confirm.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)',
                          'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']

# (score column, y-axis label, panel title) from top to bottom.
metric_panels = [
    ('F1_Macro', 'F1 Macro Score', 'F1 Macro Score Across Populations'),
    ('F1_Weighted', 'F1 Weighted Score', 'F1 Weighted Score Across Populations'),
    ('Accuracy', 'Accuracy Score', 'Accuracy Score Across Populations'),
]

fig, axes = plt.subplots(3, 1, figsize=(10, 12), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Comparison of Random Forest Model Performance Across Patient Populations\n(Length of Stay)', fontsize=17, fontweight='bold')

for ax, (score_column, y_label, panel_title) in zip(axes, metric_panels):
    sns.barplot(ax=ax, x=final_results_df['Population'], y=final_results_df[score_column], palette=colors)
    ax.set_xlabel('Subpopulation (DRG Code)', fontsize=12, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    # Label every container: depending on the seaborn version the bars may be
    # split across several containers rather than grouped in one.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=12)
    ax.set_xticklabels(labels=population_tick_labels, fontsize=12)
    ax.set_title(panel_title, fontweight='bold', fontsize=14)
    ax.set_ylim(0, 1)
    # Resize the y tick labels without freezing their text.
    ax.tick_params(axis='y', labelsize=12)

plt.tight_layout(rect=[0, 0, 1, 0.988])

In [None]:
# Three stacked grouped bar charts: trained random forest vs. dummy classifiers
# for each population (length of stay), one panel per metric.
final_results_df = pd.read_csv('saved_scores_to_plot/final_rf_model_results.csv')
# Not referenced in this cell; kept because it rebinds the notebook-level name
# (with float keys here).
dict_to_replace = {194.0: 'Heart Failure Patients', 140.0: 'COPD Patients', 750.0: 'Schizophrenia Patients', 463.0: 'Kidney/UTI Patients', 302.0: 'Knee Joint Replacement Patients'}

# The x tick labels are hard-coded, so they assume the 'Population' values
# first appear in exactly this order -- TODO confirm against the CSV.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)',
                          'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']
model_palette = ['orangered', 'skyblue', '#5CED73']

# (score column, y-axis label, panel title, y-limits or None to autoscale).
metric_panels = [
    ('F1_Macro', 'F1 Macro Score', 'F1 Macro Scores', (0, 0.5)),
    ('F1_Weighted', 'F1 Weighted Score', 'F1 Weighted Scores', None),
    ('Accuracy', 'Accuracy Score', 'Accuracy Scores', None),
]

fig, axes = plt.subplots(3, 1, figsize=(15, 18), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of Trained Random Forest Model vs. Dummy Models By Subpopulation\n(Length of Stay)', fontsize=17, fontweight='bold')

for ax, (score_column, y_label, panel_title, y_limits) in zip(axes, metric_panels):
    sns.barplot(ax=ax, data=final_results_df, x="Population", y=score_column, hue="Model", palette=model_palette)
    ax.set_xlabel('Subpopulation', fontsize=12, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    ax.set_xticklabels(labels=population_tick_labels, fontsize=14)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    if y_limits is not None:
        ax.set_ylim(*y_limits)
    # Resize the y tick labels without freezing their text, so they stay
    # correct on the autoscaled panels.
    ax.tick_params(axis='y', labelsize=14)
    ax.legend(fontsize=12)
    # One container per hue level (model); label the bars of each.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', fontsize=14)

plt.tight_layout(rect=[0, 0, 1, 0.988])

Total Cost Classification Models

In [None]:
# 2x3 grid of bar charts: best search score of each classifier for total cost,
# one panel for all patients and one per DRG subpopulation.
import seaborn as sns
# Maps subpopulation codes (stored as strings) to display names. It is not used
# in this cell, but a later cell reads this notebook-level name.
dict_to_replace = {'194.0': 'Heart Failure Patients', '140.0': 'COPD Patients', '750.0': 'Schizophrenia Patients', '463.0': 'Kidney/UTI Patients', '302.0': 'Knee Joint Replacement Patients'}

randomized_search_results = pd.read_csv('saved_scores_to_plot/Cost Best Params_various models.csv')

fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('F1 - Macro Scores For HistGradientBoostedClassifier vs. RandomForestClassifier vs. LogisticRegression\n(Total Cost)', fontsize=17, weight='bold')

all_patients = randomized_search_results[randomized_search_results['subpopulation'] == 'all']
heart_patients = randomized_search_results[randomized_search_results['subpopulation'] == '194.0']
copd_patients = randomized_search_results[randomized_search_results['subpopulation'] == '140.0']
schizophrenia_patients = randomized_search_results[randomized_search_results['subpopulation'] == '750.0']
kidney_patients = randomized_search_results[randomized_search_results['subpopulation'] == '463.0']
knee_rep_patients = randomized_search_results[randomized_search_results['subpopulation'] == '302.0']

# The x tick labels are hard-coded, so they assume each subset lists its rows
# in this model order -- TODO confirm against the CSV.
model_tick_labels = ['Hist Gradient\nBoosted\nClassifier', 'Logistic\nRegression', 'Random Forest\nClassifier']
model_palette = ['orangered', '#5CED73', 'skyblue']

# (subset, panel title) in row-major grid order.
subpopulation_panels = [
    (all_patients, 'All Patients'),
    (heart_patients, 'Patients with DRG Code 194\n(Heart Failure)'),
    (copd_patients, 'Patients with DRG Code 140\n(COPD)'),
    (schizophrenia_patients, 'Patients with DRG Code 750\n(Schizophrenia)'),
    (kidney_patients, 'Patients with DRG Code 463\n(Kidney/UTI)'),
    (knee_rep_patients, 'Patients with DRG Code 302\n(Knee Joint Replacement)'),
]

for ax, (subpop_scores, panel_title) in zip(axes.flat, subpopulation_panels):
    sns.barplot(ax=ax, x=subpop_scores['model'], y=subpop_scores['best_scores'], palette=model_palette)
    ax.set_xlabel('Model Type', fontweight='bold', fontsize=14)
    ax.set_ylabel('F1 Score', fontweight='bold', fontsize=14)
    # Label every container: depending on the seaborn version the bars may be
    # split across several containers rather than grouped in one.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
    ax.set_xticklabels(labels=model_tick_labels, fontsize=14)
    ax.set_title(panel_title, fontweight='bold', fontsize=16)
    # Only resize the y tick labels. Freezing their text with set_yticklabels
    # would leave stale labels once the shared y-limits change below.
    ax.tick_params(axis='y', labelsize=14)

# The y axis is shared, so this sets the limits of every panel.
axes[1, 2].set_ylim(0, 0.5)

plt.tight_layout(rect=[0, 0, 1, 0.988])

In [None]:
# Three stacked bar charts comparing one model's scores (F1 macro, F1 weighted,
# accuracy) across patient populations for total cost.
# NOTE(review): the figure title says "Random Forest" but the scores are read
# from HGB_Subpop_Scores.csv -- confirm which model this figure describes.
final_results_df = pd.read_csv('saved_scores_to_plot/HGB_Subpop_Scores.csv')
# dict_to_replace must already be defined by an earlier cell.
final_results_df = final_results_df.replace({'Population': dict_to_replace})

# Highlight the whole-population bar; subpopulations share one colour.
colors = ['orangered' if (x == 'all') else 'skyblue' for x in final_results_df['Population'].values]

# The x tick labels are hard-coded, so they assume the rows appear in exactly
# this population order -- TODO confirm against the CSV.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)',
                          'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']

# (score column, y-axis label, panel title) from top to bottom.
metric_panels = [
    ('F1_score_macro', 'F1 Macro Score', 'F1 Macro Score Across Populations'),
    ('F1_score_weighted', 'F1 Weighted Score', 'F1 Weighted Score Across Populations'),
    ('Accuracy Score', 'Accuracy Score', 'Accuracy Score Across Populations'),
]

fig, axes = plt.subplots(3, 1, figsize=(10, 12), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Comparison of Random Forest Model Performance Across Patient Populations\n(Total Cost)', fontsize=17, fontweight='bold')

for ax, (score_column, y_label, panel_title) in zip(axes, metric_panels):
    sns.barplot(ax=ax, x=final_results_df['Population'], y=final_results_df[score_column], palette=colors)
    ax.set_xlabel('Subpopulation (DRG Code)', fontsize=12, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    # Label every container: depending on the seaborn version the bars may be
    # split across several containers rather than grouped in one.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=12)
    ax.set_xticklabels(labels=population_tick_labels, fontsize=12)
    ax.set_title(panel_title, fontweight='bold', fontsize=14)
    ax.set_ylim(0, 1)

plt.tight_layout(rect=[0, 0, 1, 0.988])

In [None]:
# Plot the dummy classifiers vs. the trained model for each population (Total Cost target).
# NOTE(review): the figure title says "Random Forest" but the scores are read from
# HGB_vs_Dummy.csv -- confirm which model these scores belong to and align the title.
final_results_df = pd.read_csv('saved_scores_to_plot/HGB_vs_Dummy.csv')
# DRG code -> readable population name. Not used by this cell; kept in case other cells rely on it.
dict_to_replace = {194.0: 'Heart Failure Patients', 140.0: 'COPD Patients', 750.0: 'Schizophrenia Patients', 463.0: 'Kidney/UTI Patients', 302.0: 'Knee Joint Replacement Patients'}

# Tick labels for the population categories, shared by all three panels.
# NOTE(review): assumes the 'Population' column lists the populations in exactly this order -- confirm against the CSV.
# 'Kidney/UTI' previously read 'Kidney\\UTI', which rendered a literal backslash in the figure.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)', 'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']
# One colour per level of the 'Model' hue
model_palette = ['orangered', 'skyblue', '#5CED73']

# One panel per metric: (score column, y-axis label, panel title, explicit y-limits or None for autoscale)
score_panels = [
    ('F1_score_macro', 'F1 Macro Score', 'F1 Macro Scores', (0, 0.5)),
    ('F1_score_weighted', 'F1 Weighted Score', 'F1 Weighted Scores', None),
    ('Accuracy Score', 'Accuracy Score', 'Accuracy Scores', None),
]

fig, axes = plt.subplots(3, 1, figsize=(15, 15), sharey=False)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of Trained Random Forest Model vs. Dummy Models By Subpopulation\n(Total Cost)', fontsize=17, fontweight='bold')

for ax, (score_column, y_label, panel_title, y_limits) in zip(axes, score_panels):
    sns.barplot(ax=ax, data=final_results_df, x="Population", y=score_column, hue="Model", palette=model_palette)
    ax.set_xlabel('Subpopulation', fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=14, fontweight='bold')
    ax.set_xticklabels(labels=population_tick_labels, fontsize=14)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    if y_limits is not None:
        ax.set_ylim(*y_limits)

    # Pin the tick positions before relabelling them. Attaching fixed labels to an automatic
    # locator pairs labels with ticks by index only, so the labels go stale (and matplotlib
    # warns) if the ticks are recomputed later, e.g. by tight_layout() below.
    y_view = ax.get_ylim()
    current_values_y = ax.get_yticks()
    ax.set_yticks(current_values_y)
    ax.set_yticklabels([round(x, 2) for x in current_values_y], fontsize=14)
    ax.set_ylim(y_view)  # set_yticks may widen the view to show every tick; restore the original view

    ax.legend(fontsize=12)

    # One bar container per hue level: annotate all of them with their score
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', fontsize=14)

# Leave a thin strip at the top of the figure for the suptitle
plt.tight_layout(rect = [0, 0, 1, 0.988])

In [None]:
# Compare the classifier trained on the original data with the one trained on upsampled data
# (Length of Stay target), one panel per metric.
final_results_df = pd.read_csv('saved_scores_to_plot/los_oversample.csv')

fig, axes = plt.subplots(1, 3, figsize=(15, 8), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of RandomForestClassifier Trained on Upsampled vs Original Data\n(Length of Stay)', fontsize=16, fontweight='bold')

# Plot only the whole-population rows. This filter used to be computed and then ignored, so
# any subpopulation rows in the CSV were silently averaged into each bar.
final_results_upsampled = final_results_df[final_results_df['Population'] == 'all']
if final_results_upsampled.empty:
    # No row is tagged 'all': fall back to the full table, which is what was plotted before
    final_results_upsampled = final_results_df

# NOTE(review): assumes the 'Model' column lists the original model first and the upsampled one second -- confirm against the CSV.
model_tick_labels = ['Random Forest\nClassifier', 'Random Forest\nClassifier\nUpsampled']

# One panel per metric: (score column, y-axis label, panel title)
score_panels = [
    ('F1_score_macro', 'F1 Macro Score', 'F1 Macro Scores'),
    ('F1_score_weighted', 'F1 Weighted Score', 'F1 Weighted Scores'),
    ('Accuracy Score', 'Accuracy Score', 'Accuracy Scores'),
]

for ax, (score_column, y_label, panel_title) in zip(axes, score_panels):
    sns.barplot(ax=ax, data=final_results_upsampled, x="Model", y=score_column, palette= ['orangered', 'skyblue'], ci=None)
    ax.set_xlabel('Model Type', fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=14, fontweight='bold')
    # Label every container, not just the first: newer seaborn releases may create one
    # container per bar when `palette` is passed without `hue`.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xticklabels(labels=model_tick_labels, fontsize=14)


In [None]:
# Compare the classifier trained on the original data with the one trained on upsampled data
# (Total Cost target), one panel per metric.
final_results_df = pd.read_csv('saved_scores_to_plot/cost_oversample.csv')

fig, axes = plt.subplots(1, 3, figsize=(15, 8), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of Hist Gradient Boosted Classifier Trained on Upsampled vs Original Data\n(Total Cost)', fontsize=16, fontweight='bold')

# Plot only the whole-population rows. This filter used to be computed and then ignored, so
# any subpopulation rows in the CSV were silently averaged into each bar.
final_results_upsampled = final_results_df[final_results_df['Population'] == 'all']
if final_results_upsampled.empty:
    # No row is tagged 'all': fall back to the full table, which is what was plotted before
    final_results_upsampled = final_results_df

# NOTE(review): assumes the 'Model' column lists the original model first and the upsampled one second -- confirm against the CSV.
model_tick_labels = ['Hist Gradient\nBoosted\nClassifier', 'Hist Gradient\nBoosted\nClassifier\nUpsampled']

# One panel per metric: (score column, y-axis label, panel title)
score_panels = [
    ('F1_score_macro', 'F1 Macro Score', 'F1 Macro Scores'),
    ('F1_score_weighted', 'F1 Weighted Score', 'F1 Weighted Scores'),
    ('Accuracy Score', 'Accuracy Score', 'Accuracy Scores'),
]

for ax, (score_column, y_label, panel_title) in zip(axes, score_panels):
    sns.barplot(ax=ax, data=final_results_upsampled, x="Model", y=score_column, palette= ['orangered', 'skyblue'], ci=None)
    ax.set_xlabel('Model Type', fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=14, fontweight='bold')
    # Label every container, not just the first: newer seaborn releases may create one
    # container per bar when `palette` is passed without `hue`.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', padding=2, fontsize=14)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xticklabels(labels=model_tick_labels, fontsize=14)
