In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read the two cleaned training CSVs (paths are relative to the working
# directory) into DataFrames: the inpatient table and the table that is
# merged onto it by Provider in the next step.
inpatient, claims = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset_Cleaned/Cleaned_Train_Inpatient.csv',
        'Dataset_Cleaned/Cleaned_Train_Claims.csv',
    )
)

In [3]:
# Attach the columns of `claims` to every inpatient row, matching on Provider.
# A left join keeps all inpatient rows; providers with no match in `claims`
# get NaN in the joined columns.
# NOTE(review): assumes `claims` holds one row per Provider -- duplicate
# providers there would multiply the inpatient rows. Confirm against the data.
data = pd.merge(inpatient, claims, how='left', on='Provider')

In [5]:
# Parse both claim dates (values that cannot be parsed become NaT) and derive
# the stay length as the whole-day difference between them.
# NOTE(review): the stay is computed from the claim start/end dates -- confirm
# these are the intended columns for measuring an inpatient stay.
data = data.assign(
    ClaimStartDt=pd.to_datetime(data['ClaimStartDt'], errors='coerce'),
    ClaimEndDt=pd.to_datetime(data['ClaimEndDt'], errors='coerce'),
).assign(
    LengthOfStay=lambda frame: (frame['ClaimEndDt'] - frame['ClaimStartDt']).dt.days,
)

# Keep only rows with a non-negative stay. Rows whose stay is missing (one of
# the dates failed to parse) fail the comparison and are dropped as well.
data = data.loc[data['LengthOfStay'].ge(0)]

In [6]:
# Create the folder the figures are saved into; exist_ok=True makes this a
# no-op when the folder is already there, so the cell can be re-run safely.
os.makedirs(name='Visualization_Images', exist_ok=True)

In [7]:
# -------------------------------
# Visualization 1: Length of Stay vs Reimbursement (Scatter plot)
# -------------------------------
# One point per row of `data`: stay length on the x-axis, reimbursed amount on
# the y-axis, coloured by the PotentialFraud column.
# An explicit Figure/Axes pair is used so that saving and closing act on this
# figure specifically rather than on whatever pyplot considers "current".
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='LengthOfStay',
    y='InscClaimAmtReimbursed',
    hue='PotentialFraud',
    alpha=0.6,  # semi-transparent so overlapping points remain distinguishable
    ax=ax,
)
ax.set_title('Length of Stay vs Reimbursement Amount')
ax.set_xlabel('Length of Stay (Days)')
ax.set_ylabel('Reimbursement ($)')
# Rebuild the legend with a shorter title than the raw column name.
ax.legend(title='Fraud')
fig.tight_layout()
fig.savefig('Visualization_Images/length_of_stay_vs_reimbursement.png')
# The figure is only written to disk; close it to release it once saved.
plt.close(fig)

In [9]:
# -------------------------------
# Visualization 2: Top 10 Inpatient Reimbursed Providers
# -------------------------------
# Sum the reimbursed amount per provider and keep the ten largest totals,
# ordered from highest to lowest.
top_providers = (
    data.groupby('Provider')['InscClaimAmtReimbursed']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bars: totals along the x-axis, one provider per row on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_providers.values, y=top_providers.index, ax=ax)
ax.set_title('Top 10 Providers by Inpatient Reimbursement')
ax.set_xlabel('Total Reimbursement ($)')
ax.set_ylabel('Provider')
fig.tight_layout()
fig.savefig('Visualization_Images/top_inpatient_providers.png')
# The figure is only written to disk; close it to release it once saved.
plt.close(fig)