In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

In [9]:
# Read the cleaned claims table from disk into the `claims` DataFrame;
# the visualization cells further down read it and add columns to it.
claims_csv_path = 'Dataset_Cleaned/Cleaned_Train_Claims.csv'
claims = pd.read_csv(claims_csv_path)

In [10]:
# Create the folder the figures are written to. exist_ok=True turns this into
# a no-op when the folder is already present, so the cell is safe to re-run.
figure_dir = 'Visualization_Images'
os.makedirs(figure_dir, exist_ok=True)

In [11]:
# -------------------------------
# Visualization 1: Claim Volume vs Fraud Status (Simulated)
# -------------------------------
# NOTE: `claim_volume` is NOT real data. Every row of `claims` gets a random
# integer in [1, 499] (the upper bound passed to `integers` is exclusive),
# drawn independently of `PotentialFraud`, so this plot is a placeholder
# rather than evidence of any relationship.
#
# A dedicated, seeded generator keeps the simulated column -- and every figure
# derived from it -- identical across kernel restarts and re-runs of this cell,
# without touching NumPy's global random state.
volume_rng = np.random.default_rng(42)
claims['claim_volume'] = volume_rng.integers(1, 500, size=len(claims))  # simulate volume

# Explicit Figure/Axes objects instead of pyplot's implicit "current figure",
# so every call below targets this figure and nothing else.
fig, ax = plt.subplots(figsize=(8, 5))
# `hue` duplicates `x` so each box takes its own palette colour;
# legend=False drops the legend that would only repeat the x tick labels.
sns.boxplot(
    data=claims,
    x='PotentialFraud',
    y='claim_volume',
    hue='PotentialFraud',
    palette='coolwarm',
    legend=False,
    ax=ax,
)
ax.set_title('Claim Volume Distribution by Fraud Status (Simulated)')
ax.set_xlabel('Potential Fraud')
ax.set_ylabel('Simulated Claim Volume')
fig.tight_layout()
fig.savefig('Visualization_Images/claim_volume_vs_fraud_status.png')
plt.close(fig)  # the figure is only written to disk; release it explicitly

In [7]:
# -------------------------------
# Visualization 2: Fraud Count by Provider Claim Volume Group (Simulated Countplot)
# -------------------------------
# Buckets the simulated `claim_volume` column into three categories and counts
# rows per category, split by `PotentialFraud`.
#
# Depends on `claims['claim_volume']`, which is created by the Visualization 1
# cell: run that cell first (on a fresh kernel this cell raises KeyError
# otherwise).
#
# pd.cut uses right-closed intervals by default, so the groups are
# (0, 100] -> Low, (100, 250] -> Medium, (250, 500] -> High.
# Values outside (0, 500] would become NaN and be left out of the count.
volume_bins = [0, 100, 250, 500]
volume_labels = ['Low', 'Medium', 'High']
claims['volume_group'] = pd.cut(claims['claim_volume'], bins=volume_bins, labels=volume_labels)

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=claims, x='volume_group', hue='PotentialFraud', palette='pastel', ax=ax)
# The underlying volumes are random, so the saved image must say so on its own,
# matching the "(Simulated)" marker used by Visualization 1.
ax.set_title('Fraud Count by Provider Claim Volume Group (Simulated)')
ax.set_xlabel('Claim Volume Group')
# NOTE(review): countplot counts rows of `claims`; confirm that one row is one
# provider (and not one claim) before relying on this axis label.
ax.set_ylabel('Number of Providers')
ax.legend(title='Fraud')
fig.tight_layout()
fig.savefig('Visualization_Images/fraud_by_claim_volume_group.png')
plt.close(fig)  # the figure is only written to disk; release it explicitly