In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
# Read the four cleaned training tables from Dataset_Cleaned/ in one statement.
# The generator yields the frames in the order the paths are listed, so each
# name on the left is bound to the CSV at the matching position.
beneficiary, claims, inpatient, outpatient = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset_Cleaned/Cleaned_Train_Beneficiary.csv',
        'Dataset_Cleaned/Cleaned_Train_Claims.csv',
        'Dataset_Cleaned/Cleaned_Train_Inpatient.csv',
        'Dataset_Cleaned/Cleaned_Train_Outpatient.csv',
    )
)

In [6]:
# Build the single analysis table used by the plots below: every inpatient and
# outpatient row, enriched with the columns of `claims` (matched on Provider)
# and of `beneficiary` (matched on BeneID).
#
# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating the row labels of the two inputs, so label-based lookups on
# `claims_combined` stay unambiguous.
claims_combined = pd.concat([inpatient, outpatient], ignore_index=True)

# Both joins are lookups: each left row is expected to match at most one row on
# the right (presumably `claims` holds one row per Provider and `beneficiary`
# one row per BeneID -- TODO confirm against the cleaning step).
# validate='many_to_one' makes pandas raise MergeError when the right-hand key
# is duplicated, instead of silently multiplying rows and inflating every
# count derived from `data`.
claims_with_labels = claims_combined.merge(
    claims, on='Provider', how='left', validate='many_to_one'
)
data = claims_with_labels.merge(
    beneficiary, on='BeneID', how='left', validate='many_to_one'
)

In [8]:
# Create the folder the figure cells save into. exist_ok=True turns this into
# a no-op when the folder is already present, so the cell can be re-run safely.
os.makedirs(name='Visualization_Images', exist_ok=True)

In [9]:
# -------------------------------
# Visualization 1: Gender vs Fraud Status
# -------------------------------
# `data` is built from the stacked inpatient and outpatient tables, so the same
# BeneID can appear on many rows. Counting rows directly therefore does not
# give the "Number of Beneficiaries" that the y-axis promises. Keep one row per
# (BeneID, PotentialFraud) pair so each beneficiary is counted once under each
# fraud label they appear with.
beneficiaries_per_label = data.drop_duplicates(subset=['BeneID', 'PotentialFraud'])

# Explicit figure/axes handles keep every call tied to this figure rather than
# to whatever pyplot currently considers "active".
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    data=beneficiaries_per_label,
    x='Gender',
    hue='PotentialFraud',
    palette='Set1',
    ax=ax,
)
ax.set_title('Gender Distribution by Provider Fraud Label')
ax.set_xlabel('Gender (1 = Male, 2 = Female)')
ax.set_ylabel('Number of Beneficiaries')
ax.legend(title='Fraud Label')
fig.tight_layout()
fig.savefig('Visualization_Images/gender_vs_fraud.png')
# The figure is written to disk only; close it so it is neither rendered
# inline nor kept in memory.
plt.close(fig)

In [10]:
# -------------------------------
# Visualization 2: Chronic Condition Correlation Heatmap
# -------------------------------
# Select every beneficiary column whose name begins with 'ChronicCond_'.
chronic_cols = list(
    filter(lambda column: column.startswith('ChronicCond_'), beneficiary.columns)
)
# Recode the value 2 as 0 in those columns (per the original author's note,
# 2 stands for "no" in the source data).
chronic_data = beneficiary[chronic_cols].replace(to_replace=2, value=0)

# Pairwise correlation of the recoded columns, drawn as an annotated heatmap
# and written straight to disk.
chronic_corr = chronic_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(chronic_corr, annot=True, cmap='Blues', fmt='.2f', ax=ax)
ax.set_title('Correlation Between Chronic Conditions')
fig.tight_layout()
fig.savefig('Visualization_Images/chronic_conditions_correlation.png')
plt.close(fig)