In [None]:
# scripts/eda_quicklook.py
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Location of the raw CSV, relative to the current working directory.
RAW = Path("data/raw/diabetic_data.csv")

# The raw file marks missing entries with a literal "?" (UCI/Kaggle missing
# token). Declaring it through `na_values` turns it into NaN while parsing, so
# a column that mixes "?" with numbers is still inferred as numeric, and no
# second pass/copy over the whole frame is needed afterwards.
# low_memory=False parses each column in one piece to avoid mixed-type inference.
df = pd.read_csv(RAW, low_memory=False, na_values=["?"])

# --- quick overview of the loaded frame -------------------------------------
# Dimensions and the full list of column names.
n_rows_cols = df.shape
print("shape:", n_rows_cols)

column_names = df.columns.tolist()
print("\ncolumns:\n", column_names)

# Share of missing cells per column, worst first, shown as a percentage
# rounded to one decimal; only the 15 most incomplete columns are printed.
missing_share = df.isna().mean().sort_values(ascending=False)
missing_pct_top = missing_share.mul(100).round(1).head(15)
print("\nmissing % (top 15):\n", missing_pct_top)

# Distribution of the raw target labels, keeping NaN as its own bucket.
readmitted_counts = df["readmitted"].value_counts(dropna=False)
print("\nreadmitted value counts:\n", readmitted_counts)

# Three rows drawn with a fixed seed so the preview is reproducible.
preview_rows = df.sample(n=3, random_state=42)
print("\nexample rows:\n", preview_rows)

# Binary target: 1 where `readmitted` equals "<30", 0 for every other value.
# The mean of this 0/1 series is the share of positive rows.
is_early_readmit = df["readmitted"].eq("<30")
y = is_early_readmit.astype(int)
positive_rate = y.mean().round(4)
print("\npositive rate (<30):", positive_rate)

# Columns plotted as histograms (30 bins each) on one shared figure.
num_like = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
]
df[num_like].hist(figsize=(12, 8), bins=30)
# Tighten subplot spacing before the figure is displayed.
plt.tight_layout()
plt.show()

# For each listed column, print the number of distinct non-missing values and
# its five most frequent values (NaN counted as a value) as a dict.
for col in ("medical_specialty", "diag_1", "diag_2", "diag_3"):
    series = df[col]
    distinct_count = series.nunique(dropna=True)
    top_five = series.value_counts(dropna=False).head(5).to_dict()
    print(col, "nunique:", distinct_count, "top:", top_five)
