<h3>Length of Stay Model Building and Exploration</h3><br />
This notebook shows the process of building classification models for length of stay. It compares model performance across different bin definitions, model types, and upsampling techniques. This notebook also contains an evaluation of model performance across sub-populations.

In [None]:
#Import all needed modules
import os
import sys
import platform

# Kept so that any cell inspecting the platform still finds `plat`; the path
# setup below no longer branches on it.
plat = platform.system()

# Put the project's helper and model directories at the front of sys.path.
# os.path.join chooses the right separator on every OS, which removes the
# invalid '\s', '\h' and '\m' escape sequences of the old Windows literals and
# also covers platforms other than Windows/Linux/Darwin, which previously got
# no path entries at all.
for _subdir in ('helpers', 'models'):
    sys.path.insert(0, os.path.join('..', 'src', _subdir))

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.metrics import log_loss, f1_score, classification_report, make_scorer, precision_score, recall_score, accuracy_score, confusion_matrix, plot_confusion_matrix
import seaborn as sns
from imblearn.over_sampling import SMOTE
from data_cleaners import *
from model_building_helpers import *


In [None]:
#Load all patient csv (csv containing data for all the indications (COPD, Heart Failure, Schizophrenia, Knee Replacement, Kidney/UTI))
data_file_path = '../data'
# Use the lowercase 'all' key so this call matches every other load_data call
# in this notebook.
all_patient_df = load_data('all', data_file_path)

Use the following two cells to see how changing the bin definitions changes the accuracy and confusion matrix of the model.

In [None]:
#Bin the data by length of stay ranges
# pd.cut builds right-inclusive intervals, so with whole-day stays the edges
# below give (5, 15] -> 6..15, (60, 90] -> 61..90, and so on. include_lowest=True
# makes the first interval also contain a stay of exactly 1 day.
# Stays that fall outside the outer edges (below 1 or above 120) get a missing bin.
bins = [1, 5, 15, 30, 45, 60, 90, 120]
# The last two labels previously read '60 to 90' and '90 to 120', which overlapped
# the preceding bins; they now describe the actual interval contents.
labels = ['1 to 5', '6 to 15', '16 to 30', '31 to 45', '46 to 60', '61 to 90', '91 to 120']

all_patient_df_bins = all_patient_df.copy()
all_patient_df_bins['Length of Stay Bin'] = pd.cut(
    x=all_patient_df_bins['Length of Stay'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [None]:
# Split the binned data, then fit an untuned random forest as a baseline.
X, y, X_train, X_test, y_train, y_test = get_train_test_data(all_patient_df_bins)

model = RandomForestClassifier()
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)

# NOTE(review): confusion_matrix orders its rows/columns by the sorted unique
# target values; confirm that order matches `labels` before trusting the axis
# names on the heat map drawn from cf_matrix.
cf_matrix = confusion_matrix(y_test, y_pred)

# Scalar summary metrics for this bin definition.
bin1_accuracy = accuracy_score(y_test, y_pred)
bin1_f1_score_macro, bin1_f1_score_weighted = (
    f1_score(y_test, y_pred, average=averaging) for averaging in ('macro', 'weighted')
)
print("Accuracy = ", bin1_accuracy, "\nF1 Macro Score = ", bin1_f1_score_macro, "\nF1 Weighted Score = ", bin1_f1_score_weighted)

In [None]:
#Visualize the confusion matrix as a heat map
plt.figure(figsize=(12, 12))
plt.title('Confusion Matrix For Random Forest Model', fontsize=17, weight='bold', pad=30)

# Dividing by the grand total turns every cell into a share of all test samples
# (not a per-row rate).
# NOTE(review): rows/columns are named with `labels` in bin order; this assumes
# cf_matrix has one row per label in that same order - confirm.
df_cm = pd.DataFrame(cf_matrix / cf_matrix.sum(), index=list(labels), columns=list(labels))
sns.heatmap(df_cm, annot=True, fmt='.2%', cmap='Blues')

In [None]:
#Get RandomForest best parameters and best score by sub-population
#Experiment with model performance by changing the parameters tested and the iterations for the RandomizedSearch
rf_scores = []
iterations = 10 #We found the iterations did not affect F1 macro scores much, so we used the default of 10

for drg in ['all', 194.0, 140.0, 750.0, 463.0, 302.0]:
    subpop_df = load_data(drg, data_file_path)
    subpop_df['Length of Stay Bin'] = pd.cut(subpop_df['Length of Stay'], bins=bins, labels=labels, include_lowest=True)

    # Populations above 30000 rows are cut down to roughly 30000: each bin
    # contributes rows in proportion to its share of the population, then the
    # result is shuffled and re-indexed.
    if len(subpop_df) > 30000:
        n_rows = len(subpop_df)
        downsampled = subpop_df.groupby('Length of Stay Bin', group_keys=False).apply(
            lambda grp: grp.sample(int(np.rint(30000 * len(grp) / n_rows)))
        )
        subpop_df = downsampled.sample(frac=1).reset_index(drop=True)

    X, y, X_train, X_test, y_train, y_test = get_train_test_data(subpop_df)

    # Candidate hyper-parameters handed to the randomized search.
    random_grid = {
        'n_estimators': list(range(100, 801, 50)),   # 100, 150, ..., 800 (15 values)
        'max_depth': list(range(10, 111, 10)),       # 10, 20, ..., 110 (11 values)
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'class_weight': ['balanced', 'balanced_subsample'],
    }

    best_params, best_score = get_best_rf_params(X_train, y_train, random_grid, iterations)
    # One single-key dict per population: {drg: {'best_params': ..., 'best_score': ...}}
    rf_scores.append({drg: {'best_params': best_params, 'best_score': best_score}})

In [None]:
#Get LogisticRegression best parameters and best score by sub-population
#Experiment with model performance by changing the parameters tested and the iterations for the RandomizedSearch
lr_scores = []
iterations = 10 #We found the iterations did not affect F1 macro scores much, so we used the default of 10

for drg in ['all', 194.0, 140.0, 750.0, 463.0, 302.0]:
    subpop_df = load_data(drg, data_file_path)
    subpop_df['Length of Stay Bin'] = pd.cut(subpop_df['Length of Stay'], bins=bins, labels=labels, include_lowest=True)

    # Populations above 30000 rows are cut down to roughly 30000: each bin
    # contributes rows in proportion to its share of the population, then the
    # result is shuffled and re-indexed.
    if len(subpop_df) > 30000:
        n_rows = len(subpop_df)
        downsampled = subpop_df.groupby('Length of Stay Bin', group_keys=False).apply(
            lambda grp: grp.sample(int(np.rint(30000 * len(grp) / n_rows)))
        )
        subpop_df = downsampled.sample(frac=1).reset_index(drop=True)

    X, y, X_train, X_test, y_train, y_test = get_train_test_data(subpop_df)

    # Candidate hyper-parameters handed to the randomized search.
    random_grid = {
        'C': np.logspace(0, 4, num=10),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'class_weight': ['balanced', None],
    }

    best_params, best_score = get_best_lr_params(X_train, y_train, random_grid, iterations)
    # One single-key dict per population: {drg: {'best_params': ..., 'best_score': ...}}
    lr_scores.append({drg: {'best_params': best_params, 'best_score': best_score}})

In [None]:
# Flatten the per-population search results into one tidy frame per model.
# The frames are built from a list of records in a single step: DataFrame.append
# was removed in pandas 2.0, and appending row by row copies the frame each time.
score_columns = ['model', 'subpopulation', 'best_params', 'best_scores']


def _search_results_frame(model_name, search_scores):
    """Return one row per sub-population for the given model.

    `search_scores` is a list of single-key dicts shaped like
    {subpopulation: {'best_params': ..., 'best_score': ...}}, as built by the
    randomized-search cells. An empty list yields an empty frame that still has
    the expected columns.
    """
    records = [
        {
            'model': model_name,
            'subpopulation': subpopulation,
            'best_params': result['best_params'],
            'best_scores': result['best_score'],
        }
        for entry in search_scores
        for subpopulation, result in entry.items()
    ]
    return pd.DataFrame(records, columns=score_columns)


scores_rf_df = _search_results_frame('RandomForestClassifier', rf_scores)
scores_lr_df = _search_results_frame('LogisticRegression', lr_scores)

# Random forest rows first, then logistic regression rows.
randomized_search_results = pd.concat([scores_rf_df, scores_lr_df])

In [None]:
#Create visualization for each subpopulation for each model
fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('F1 - Macro Scores For RandomForestClassifier vs. LogisticRegression\n(Length of Stay)', fontsize=17, weight='bold')

# (subpopulation key, panel title) in the order the panels are filled, row by row.
subpopulation_panels = [
    ('all', 'All Patients'),
    (194.0, 'Patients with DRG Code 194\n(Heart Failure)'),
    (140.0, 'Patients with DRG Code 140\n(COPD)'),
    (750.0, 'Patients with DRG Code 750\n(Schizophrenia)'),
    (463.0, 'Patients with DRG Code 463\n(Kidney/UTI)'),
    (302.0, 'Patients with DRG Code 302\n(Knee Joint Replacement)'),
]

for ax, (subpopulation, panel_title) in zip(axes.flat, subpopulation_panels):
    subpopulation_results = randomized_search_results[randomized_search_results['subpopulation'] == subpopulation]
    sns.barplot(ax=ax, x=subpopulation_results['model'], y=subpopulation_results['best_scores'], palette=['#5CED73', 'skyblue'])
    ax.set_xlabel('Model Type', fontweight='bold', fontsize=14)
    ax.set_ylabel('F1 Score', fontweight='bold', fontsize=14)
    ax.bar_label(ax.containers[0], fmt='%.2f', padding=2, fontsize=14)
    # The tick text assumes the random forest row comes before the logistic
    # regression row, which is the order randomized_search_results is built in.
    ax.set_xticklabels(labels=['Random Forest\nClassifier', 'Logistic\nRegression'], fontsize=14)
    ax.set_title(panel_title, fontweight='bold', fontsize=16)
    # Only the font size is changed. Writing fixed tick-label text here would go
    # stale as soon as the shared y-limits change below.
    ax.tick_params(axis='y', labelsize=14)

# sharey=True, so this limit applies to every panel.
axes[1, 2].set_ylim(0, 0.5)

plt.tight_layout(rect = [0, 0, 1, 0.988])

It looks like for most subpopulations, the RandomForestClassifier performs as well as or better than the LogisticRegression model, with the exception of the models trained on Knee Replacement patients. Let's train a RandomForestClassifier for each subpopulation using that subpopulation's best parameters.

In [None]:
# Score a random forest per population using the best parameters found by the
# randomized search for that population.
final_model_scores = []

for drg in ['all', 194.0, 140.0, 750.0, 463.0, 302.0]:
    subpop_df = load_data(drg, data_file_path)
    subpop_df['Length of Stay Bin'] = pd.cut(subpop_df['Length of Stay'], bins=bins, labels=labels, include_lowest=True)

    X, y, X_train, X_test, y_train, y_test = get_train_test_data(subpop_df)

    # Pick this population's row out of the random forest search results.
    is_this_population = (scores_rf_df['model'] == 'RandomForestClassifier') & (scores_rf_df['subpopulation'] == drg)
    best_params = scores_rf_df.loc[is_this_population, 'best_params'].iloc[0]

    # model_scores is indexed as (accuracy, F1 macro, F1 weighted).
    model_scores = get_model_scores(best_params, X_train, X_test, y_train, y_test)
    final_model_scores.append(dict([
        ('Model', 'RandomForestClassifier'),
        ('Population', drg),
        ('Accuracy Score', model_scores[0]),
        ('F1_score_macro', model_scores[1]),
        ('F1_score_weighted', model_scores[2]),
    ]))

final_results_df = pd.DataFrame(final_model_scores)

In [None]:
#Visualize the results
#Build visualization comparing random forest model performance of all populations
# Highlight the model trained on all patients; every DRG-specific model is blue.
colors = ['orangered' if (x == 'all') else 'skyblue' for x in final_results_df['Population'].values]

# Tick text in the order the populations were scored. 'Kidney/UTI' previously
# rendered with a backslash because of an escaped '\\' in the label.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)', 'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']

# (score column, display name) - one panel per metric, top to bottom.
metric_panels = [
    ('F1_score_macro', 'F1 Macro Score'),
    ('F1_score_weighted', 'F1 Weighted Score'),
    ('Accuracy Score', 'Accuracy Score'),
]

fig, axes = plt.subplots(3, 1, figsize=(15, 16), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Comparison of Random Forest Model Performance Across Patient Populations\n(Length of Stay)', fontsize=17, fontweight='bold')

for ax, (metric_column, metric_name) in zip(axes, metric_panels):
    sns.barplot(ax=ax, x=final_results_df['Population'], y=final_results_df[metric_column], palette=colors)
    ax.set_xlabel('Subpopulation (DRG Code)', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_name, fontsize=12, fontweight='bold')
    ax.bar_label(ax.containers[0], fmt='%.2f', padding=2, fontsize=12)
    ax.set_xticklabels(labels=population_tick_labels, fontsize=12)
    ax.set_title(f'{metric_name} Across Populations', fontweight='bold', fontsize=14)
    ax.set_ylim(0, 1)
    # Resize the y tick labels without freezing their text, so they always
    # follow the real tick positions after layout changes.
    ax.tick_params(axis='y', labelsize=12)

plt.tight_layout(rect = [0, 0, 1, 0.988])

From the visualization above, it can be seen that for F1 macro scores the model built on all conditions of interest outperforms models trained only on patients with a specific DRG code with the exception of Knee replacement patients.
<br /><br />
Let's also compare performance with a dummy classifier which predicts the majority class every time and a dummy classifier which selects a class at random.

In [None]:
#Get dummy model performance on all the population datasets
final_model_scores = []

# (strategy passed to get_dummy_scores, model name stored in the results), in
# the order the rows are recorded for each population.
dummy_variants = (
    ("most_frequent", 'DummyClassifier - Most Frequent'),
    ("uniform", 'DummyClassifier - Random'),
)

for drg in ['all', 194.0, 140.0, 750.0, 463.0, 302.0]:
    print(drg)
    subpop_df = load_data(drg, data_file_path)
    subpop_df['Length of Stay Bin'] = pd.cut(subpop_df['Length of Stay'], bins=bins, labels=labels, include_lowest=True)

    X, y, X_train, X_test, y_train, y_test = get_train_test_data(subpop_df)

    for strategy, model_name in dummy_variants:
        # dummy_scores is indexed as (accuracy, F1 macro, F1 weighted).
        dummy_scores = get_dummy_scores(strategy, X_train, X_test, y_train, y_test)
        final_model_scores.append(dict([
            ('Population', drg),
            ('Model', model_name),
            ('Accuracy Score', dummy_scores[0]),
            ('F1_score_macro', dummy_scores[1]),
            ('F1_score_weighted', dummy_scores[2]),
        ]))

In [None]:
# Stack the dummy-classifier scores underneath the random forest scores so both
# can be plotted from a single frame with a fresh 0..n-1 index.
final_scores_df = pd.DataFrame(data=final_model_scores)
final_results_with_dummy_df = pd.concat(
    objs=[final_results_df, final_scores_df],
    ignore_index=True,
)

In [None]:
#Plot dummy classifiers vs rf model performance for each population
fig, axes = plt.subplots(3, 1, figsize=(15, 18), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of Trained Random Forest Model vs. Dummy Models By Subpopulation', fontsize=16)

# Tick text in the order the populations were scored. 'Kidney/UTI' previously
# rendered with a backslash because of an escaped '\\' in the label.
population_tick_labels = ['All', 'Heart Failure\n(DRG 194)', 'COPD\n(DRG 140)', 'Schizophrenia\n(DRG 750)', 'Kidney/UTI\n(DRG 463)', 'Knee Joint\nReplacement\n(DRG 302)']

# (score column, display name) - one panel per metric, top to bottom.
# The macro-F1 column is 'F1_score_macro'; the old 'F1_Macro' does not exist in
# final_results_with_dummy_df and made the first barplot call fail.
metric_panels = [
    ('F1_score_macro', 'F1 Macro Score'),
    ('F1_score_weighted', 'F1 Weighted Score'),
    ('Accuracy Score', 'Accuracy Score'),
]

for ax, (metric_column, metric_name) in zip(axes, metric_panels):
    sns.barplot(ax=ax, data=final_results_with_dummy_df, x="Population", y=metric_column, hue="Model", palette=['orangered', 'skyblue', '#5CED73'])
    ax.set_xlabel('Subpopulation', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_name, fontsize=12, fontweight='bold')
    ax.set_xticklabels(labels=population_tick_labels, fontsize=14)
    ax.set_title(f'{metric_name}s', fontsize=14, fontweight='bold')
    # Scores lie in [0, 1]. The previous 0.5 upper limit clipped any bar above
    # it, and the notebook text reports accuracy and weighted F1 around 0.65.
    ax.set_ylim(0, 1)
    # Resize the y tick labels without freezing their text.
    ax.tick_params(axis='y', labelsize=14)
    ax.legend(fontsize=12)
    # One container per hue level (model); label every bar.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', fontsize=14)

plt.tight_layout(rect = [0, 0, 1, 0.988])

The models do outperform the dummy models, especially when comparing the F1 Macro score. However, overall we can see that the F1 macro scores are still fairly low for the trained models. This is due to the class imbalance we see in the dataset. About 65% of patients stay in the hospital 1 - 5 days. This is about the same as our accuracy and weighted F1 scores.
<br /><br />
As a next step, let's see if upsampling our training data for the 'All patients' model makes a difference to the F1 macro score.

In [None]:
# Retrain the 'all' population random forest on SMOTE-upsampled training data
# and record its scores next to the scores of the models trained on original data.
oversample_scores = []

oversample = SMOTE()

# Best random forest parameters found for the 'all' population.
best_params = scores_rf_df.loc[(scores_rf_df['model']=='RandomForestClassifier') & (scores_rf_df['subpopulation']=='all')]['best_params'].values[0]

subpop_df = load_data('all', data_file_path)
subpop_df['Length of Stay Bin'] = pd.cut(x = subpop_df['Length of Stay'], bins = bins, labels = labels, include_lowest = True)

X, y, X_train, X_test, y_train, y_test = get_train_test_data(subpop_df)

# Only the training split is resampled; the test split is scored untouched.
X_train_resample, y_train_resample = oversample.fit_resample(X_train, y_train)

oversampled_model = train_model(best_params, X_train_resample, y_train_resample)

# model_scores is indexed as (accuracy, F1 macro, F1 weighted).
model_scores = calculate_scores(oversampled_model, X_test, y_test)

upsampled_model_name = 'RandomForestClassifier - Upsampled'
oversample_scores.append({
    'Population': 'all',
    'Model': upsampled_model_name,
    'Accuracy Score': model_scores[0],
    'F1_score_macro': model_scores[1],
    'F1_score_weighted': model_scores[2]
})

# DataFrame.append was removed in pandas 2.0, so the new row is added with
# pd.concat. Any upsampled row left over from an earlier run of this cell is
# dropped first, which keeps re-running the cell from piling up duplicates.
final_results_df = pd.concat([
    final_results_df[final_results_df['Model'] != upsampled_model_name],
    pd.DataFrame(oversample_scores),
])
final_results_df

In [None]:
# Write the score table (the random forest scores plus the upsampled model's row
# added in the previous cell) to CSV under the visualizations folder.
final_results_df.to_csv('../src/visualizations/saved_scores_to_plot/los_oversample.csv')

In [None]:
#Plot random forest performance on original vs. upsampled training data ('all' population only)
fig, axes = plt.subplots(1, 3, figsize=(15, 8), sharey=True)
fig.subplots_adjust(hspace=0.45, wspace=0.25)
fig.suptitle('Scores of RandomForestClassifier Trained on Upsampled vs Original Data\n(Length of Stay)', fontsize=16, fontweight='bold')

# Keep only the 'all' population rows: the model trained on the original data
# and the one trained on upsampled data. Plotting the unfiltered frame (as was
# done before) made the 'Random Forest Classifier' bar the mean over every
# population rather than the score of the 'all' model.
final_results_upsampled = final_results_df[final_results_df['Population'] == 'all']

# (score column, display name) - one panel per metric, left to right.
metric_panels = [
    ('F1_score_macro', 'F1 Macro Score'),
    ('F1_score_weighted', 'F1 Weighted Score'),
    ('Accuracy Score', 'Accuracy Score'),
]

for ax, (metric_column, metric_name) in zip(axes, metric_panels):
    sns.barplot(ax=ax, data=final_results_upsampled, x="Model", y=metric_column, palette=['orangered', 'skyblue'], ci=None)
    ax.set_xlabel('Model Type', fontsize=14, fontweight='bold')
    ax.set_ylabel(metric_name, fontsize=14, fontweight='bold')
    ax.bar_label(ax.containers[0], fmt='%.2f', padding=2, fontsize=14)
    ax.set_title(f'{metric_name}s', fontsize=14, fontweight='bold')
    # The tick text assumes the original-data row comes before the upsampled row.
    ax.set_xticklabels(labels=['Random Forest\nClassifier', 'Random Forest\nClassifier\nUpsampled'], fontsize=14)
#plt.savefig('../src/visualizations/upsampled_vs_original.png')

In [None]:
# Pair each input feature of the model trained on upsampled data with its
# importance, ordered from most to least important.
features_importance_df = pd.DataFrame({
    'feature_names': oversampled_model.feature_names_in_,
    'feature_importance': oversampled_model.feature_importances_,
}).sort_values(by='feature_importance', ascending=False)

# The ten highest-ranked features, used by the plot in the next cell.
feature_importance_to_10_df = features_importance_df.iloc[:10]

In [None]:
# Horizontal bar chart of the ten most important features of the model trained
# on upsampled data.
plt.figure(figsize=(15,15))

# A single colour is passed with `color` rather than a one-entry palette.
sns_barplot = sns.barplot(x=feature_importance_to_10_df['feature_importance'], y=feature_importance_to_10_df['feature_names'], color='skyblue')
plt.xlabel('Feature Importance', fontsize=14, fontweight='bold')
plt.ylabel('Feature Names', fontsize=14, fontweight='bold')
# The importances come from the random forest classifier trained above; the
# title previously named a gradient boosted regressor, which this notebook never fits.
plt.title('Top 10 Feature Importances For Random Forest Classifier', fontsize=17, fontweight='bold')
plt.tick_params(axis='both', which='major', labelsize=12)
plt.bar_label(sns_barplot.containers[0], fmt='%.3f', padding=2, fontsize=12)