# In [None]:
# Exploratory Data Analysis (EDA) for Hospital Readmission Dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from its CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="../data/diabetes_data.csv")

# -------------------------------
# 1. Basic Info
# -------------------------------
# Each entry pairs a printed heading with the summary it introduces:
# dimensions, column names, per-column dtypes, and the first five rows.
overview = (
    ("Shape of dataset:", df.shape),
    ("\nColumns:\n", df.columns.tolist()),
    ("\nData types:\n", df.dtypes),
    ("\nFirst 5 rows:\n", df.head()),
)
for heading, content in overview:
    print(heading, content)

# -------------------------------
# 2. Target Variable Distribution
# -------------------------------
# Bar chart with one bar per distinct value of the "readmitted" column.
plt.figure(figsize=(6, 4))
target_ax = sns.countplot(data=df, x="readmitted")
target_ax.set_title("Distribution of Readmissions")
target_ax.set_xlabel("Readmitted (Yes/No)")
target_ax.set_ylabel("Count")
plt.show()

# The same distribution expressed as a fraction of all rows rather than
# raw counts (normalize=True).
target_share = df["readmitted"].value_counts(normalize=True)
print(target_share)

# -------------------------------
# 3. Missing Values
# -------------------------------
# Count null entries per column, keep only the columns that have at least
# one, and order them from most to fewest missing.
# NOTE(review): only values pandas parsed as NaN are counted here; placeholder
# strings (e.g. "?") would not be - confirm how this CSV encodes missing data.
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print("\nMissing Values:\n", missing)

if missing.empty:
    # With nothing to chart, the bar plot would be blank (or may error,
    # depending on the seaborn version), so say so explicitly instead.
    print("No missing values found; skipping plot.")
else:
    plt.figure(figsize=(8, 4))
    sns.barplot(x=missing.index, y=missing.values)
    # Column names are long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.title("Missing Values per Column")
    plt.show()

# -------------------------------
# 4. Correlation Heatmap
# -------------------------------
# Restrict to numeric (and boolean) columns before computing correlations.
# Since pandas 2.0, DataFrame.corr() defaults to numeric_only=False and raises
# ValueError when the frame contains string/object columns. Selecting the
# columns explicitly reproduces the pre-2.0 behaviour on every pandas version.
numeric_df = df.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(12, 8))
# center=0 puts the midpoint of the diverging colormap at zero correlation.
sns.heatmap(numeric_df.corr(), cmap="coolwarm", center=0)
plt.title("Correlation Heatmap (Numerical Features)")
plt.show()

# -------------------------------
# 5. Feature Distributions
# -------------------------------
# Select the int64/float64 columns once; `num_cols` keeps their names.
numeric_features = df.select_dtypes(include=["int64", "float64"])
num_cols = numeric_features.columns

# One 30-bin histogram per selected column, laid out on a single figure.
numeric_features.hist(bins=30, figsize=(15, 12))
plt.gcf().suptitle("Feature Distributions", fontsize=16)
plt.show()
