In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
# Read the four cleaned training tables from the Dataset_Cleaned folder.
# The paths are listed in the same order as the names they are bound to.
claims, beneficiary, inpatient, outpatient = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset_Cleaned/Cleaned_Train_Claims.csv',
        'Dataset_Cleaned/Cleaned_Train_Beneficiary.csv',
        'Dataset_Cleaned/Cleaned_Train_Inpatient.csv',
        'Dataset_Cleaned/Cleaned_Train_Outpatient.csv',
    )
)

In [6]:
# Directory that receives the figures saved by this notebook.
# It is created if missing and silently reused if it already exists.
output_folder = 'EDA_Images'
os.makedirs(name=output_folder, exist_ok=True)

In [8]:
# 1. Fraud Label Distribution (Claims Data)
# Counts the rows of `claims` per PotentialFraud value and writes the bar
# chart to disk; the figure is closed afterwards, so nothing renders inline.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=claims,
    x='PotentialFraud',
    hue='PotentialFraud',  # same column as x, so each bar gets its own palette colour
    palette='Set2',
    legend=False,          # the x tick labels already identify the bars
    ax=ax,
)
ax.set(
    title='Fraud Label Distribution (Claims Data)',
    xlabel='Potential Fraud',
    ylabel='Number of Providers',
)
fig.tight_layout()
fig.savefig(os.path.join(output_folder, 'fraud_label_distribution.png'))
plt.close(fig)

In [9]:
# 2. Chronic Conditions Among Patients (Beneficiary Data)
# Every beneficiary column whose name begins with 'ChronicCond_' is treated
# as a condition flag.
chronic_cols = list(
    filter(lambda name: name.startswith('ChronicCond_'), beneficiary.columns)
)

# Values of 2 are recoded to 0 before summing, so each column total counts
# only the remaining non-zero flags (presumably 2 encodes "no" -- confirm
# against the cleaning step). Totals are ordered largest first.
chronic_sums = (
    beneficiary[chronic_cols]
    .replace({2: 0})
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
chronic_sums.plot.bar(color='skyblue', ax=ax)
ax.set_title('Prevalence of Chronic Conditions')
ax.set_ylabel('Number of Beneficiaries')
# Slant the column-name tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(output_folder, 'chronic_conditions_distribution.png'))
plt.close(fig)

In [10]:
# 3. Reimbursement Amount Distribution (Inpatient Data)
# Histogram of InscClaimAmtReimbursed restricted to the window
# 0..reimbursement_cap; claims above the cap are not drawn.
reimbursement_cap = 50000  # upper edge of the plotted window; adjust as needed

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    inpatient['InscClaimAmtReimbursed'],
    bins=50,
    # Spread the 50 bins over the visible window. Without binrange the bins
    # span the full data range, and cropping the x-axis afterwards leaves
    # only the bins that happen to fall inside the window.
    binrange=(0, reimbursement_cap),
    kde=False,
    color='salmon',
    ax=ax,
)
ax.set_title('Distribution of Inpatient Reimbursement Amounts')
ax.set_xlabel('Reimbursement Amount')
ax.set_ylabel('Number of Claims')
ax.set_xlim(0, reimbursement_cap)
fig.tight_layout()
fig.savefig(os.path.join(output_folder, 'inpatient_reimbursement_distribution.png'))
plt.close(fig)

In [11]:
# 4. Claim Volume per Provider (Outpatient Data)
# value_counts() orders providers by descending row count, so the first 20
# entries are the providers with the most outpatient claims.
claim_counts = outpatient['Provider'].value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(10, 6))
claim_counts.plot.bar(color='teal', ax=ax)
ax.set(
    title='Top 20 Providers by Outpatient Claim Volume',
    ylabel='Number of Claims',
    xlabel='Provider',
)
# Slant the provider tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(output_folder, 'outpatient_claims_per_provider.png'))
plt.close(fig)