In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Input files, relative to the notebook's working directory.
OUTPATIENT_CSV = 'Dataset_Cleaned/Cleaned_Train_Outpatient.csv'
CLAIMS_CSV = 'Dataset_Cleaned/Cleaned_Train_Claims.csv'

# Read each cleaned CSV into its own DataFrame.
outpatient = pd.read_csv(OUTPATIENT_CSV)
claims = pd.read_csv(CLAIMS_CSV)

In [3]:
# Attach the columns of `claims` (the fraud labels) to every outpatient row,
# matching on Provider. The left join keeps outpatient rows whose Provider has
# no match in `claims`; their added columns are NaN.
#
# validate='many_to_one' makes pandas raise MergeError when `claims` holds
# more than one row for a Provider. Without the check such duplicates would
# silently multiply outpatient rows and inflate every count plotted later.
# NOTE(review): assumes `claims` has exactly one row per Provider -- confirm.
data = outpatient.merge(
    claims,
    on='Provider',
    how='left',
    validate='many_to_one',
)

In [4]:
# Parse ClaimStartDt into datetimes (values that cannot be parsed become NaT
# because of errors='coerce'), then derive the calendar month number from the
# parsed column. assign() evaluates its keyword arguments in order, so the
# Month lambda already sees the converted ClaimStartDt.
data = data.assign(
    ClaimStartDt=lambda frame: pd.to_datetime(frame['ClaimStartDt'], errors='coerce'),
    Month=lambda frame: frame['ClaimStartDt'].dt.month,
)

In [5]:
# Create the folder the figure-saving cells write into. exist_ok=True turns
# the call into a no-op when the folder is already present.
os.makedirs(name='Visualization_Images', exist_ok=True)

In [6]:
# -------------------------------
# Visualization 1: Monthly Outpatient Claims Volume
# -------------------------------
# Bar chart of the number of claim rows per calendar month of ClaimStartDt,
# saved to Visualization_Images/monthly_outpatient_claims.png.
#
# 'Month' becomes float64 as soon as a single date failed to parse (NaT),
# which would label the bars 1.0, 2.0, ... Rows without a month have no bar
# to belong to, so drop them and cast back to int so the ticks read 1, 2, ...
claim_months = data['Month'].dropna().astype(int)

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=claim_months, color='skyblue', ax=ax)
ax.set_title('Monthly Outpatient Claim Volume')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Claims')
fig.tight_layout()
fig.savefig('Visualization_Images/monthly_outpatient_claims.png')
plt.close(fig)  # only the saved PNG is needed; release the figure's memory

In [12]:
# -------------------------------
# Visualization 2: Reimbursement per Claim (Fraud vs. Non-Fraud)
# -------------------------------
# Box plot of InscClaimAmtReimbursed for each PotentialFraud value, drawn on
# a logarithmic y-axis and written to
# Visualization_Images/reimbursement_per_claim_by_fraud.png.
# NOTE(review): a log axis cannot place zero amounts -- confirm whether the
# data contains zero reimbursements and how they should be shown.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    x='PotentialFraud',
    y='InscClaimAmtReimbursed',
    hue='PotentialFraud',  # colour each box by its own category
    data=data,
    palette='Set3',
    legend=False,  # hue repeats the x variable, so a legend adds nothing
    ax=ax,
)
ax.set_yscale('log')
ax.set(
    title='Reimbursement per Claim by Fraud Status',
    xlabel='Potential Fraud',
    ylabel='Reimbursement Amount ($)',
)
fig.tight_layout()
fig.savefig('Visualization_Images/reimbursement_per_claim_by_fraud.png')
plt.close(fig)