# EDA (Exploratory Data Analysis) with graphics

## Imports & settings

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Figure resolution: 120 dpi for on-screen rendering, 300 dpi for saved files.
plt.rcParams.update({"figure.dpi": 120, "savefig.dpi": 300})


# Locate the project root: the nearest directory named "5CCSAMLF-CW1",
# checking the current working directory first and then each of its ancestors.
_start = Path.cwd()
root = next(
    (folder for folder in (_start, *_start.parents) if folder.name == "5CCSAMLF-CW1"),
    None,
)
if root is None:
    raise RuntimeError("5CCSAMLF-CW1 folder not found. Make sure you are inside the 5CCSAMLF-CW1 folder.")


# Folder that receives every saved figure; created (with parents) if missing.
FIG_DIR = root.joinpath("report", "figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Input CSV files under <root>/data.
TRAIN_PATH = root.joinpath("data", "CW1_train.csv")
TEST_PATH = root.joinpath("data", "CW1_test.csv")


Train shape: (10000, 31)
Test shape: (1000, 30)


## Load data

In [4]:
# Read both splits, report their (rows, columns) shapes and preview the first rows.
trn, tst = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print("Train shape:", trn.shape)
print("Test shape: ", tst.shape)

for frame in (trn, tst):
    display(frame.head())


FileNotFoundError: [Errno 2] No such file or directory: 'data/CW1_train.csv'

## Basic schema checks

In [None]:
# Column roles used by the rest of the notebook.
target = "outcome"
categorical_cols = ["cut", "color", "clarity"]

# The target must be present in train and absent from test.
assert target in trn.columns, "Target 'outcome' not found in train!"
assert target not in tst.columns, "Test should NOT contain 'outcome'!"

print("Train columns:", trn.shape[1])
print("Test columns: ", tst.shape[1])

# Dtype overview: how many columns per dtype, then the dtype of each column.
print("\nTrain dtypes:")
train_dtypes = trn.dtypes
display(train_dtypes.value_counts())
display(train_dtypes)

# Per-column missing-value counts, largest first, limited to 20 columns per split.
print("\nMissing values (train):")
train_missing = trn.isna().sum()
display(train_missing.sort_values(ascending=False).head(20))

print("\nMissing values (test):")
test_missing = tst.isna().sum()
display(test_missing.sort_values(ascending=False).head(20))


## Quick descriptive stats

In [None]:
# Summary statistics for all columns, transposed to one row per column;
# only the first 30 rows are shown.
summary_stats = trn.describe(include="all")
display(summary_stats.transpose().head(30))


## Identify numeric columns

In [None]:
# Numeric features are chosen by exclusion: every train column that is neither
# a listed categorical column nor the target.
_non_numeric = {*categorical_cols, target}
numeric_cols = [col for col in trn.columns if col not in _non_numeric]
num_train, num_test = trn[numeric_cols], tst[numeric_cols]

print("Numeric feature count:", len(numeric_cols))
print("Categorical features:", categorical_cols)


## Duplicate rows check

In [None]:
# Flag training rows that duplicate another row, then count the flags.
dup_mask = trn.duplicated()
dup_count = dup_mask.sum()
print("Duplicate rows in train:", dup_count)


## Target (outcome) distribution

In [None]:
# Histogram of the target in train (saved under FIG_DIR), followed by its
# summary statistics and skewness.
y = trn[target]

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(y, bins=40)
ax.set_title("Outcome distribution (train)")
ax.set_xlabel("outcome")
ax.set_ylabel("count")
fig.tight_layout()
fig.savefig(FIG_DIR / "outcome_hist.png")
plt.show()

print("Outcome summary:")
display(y.describe())
print("Skew:", y.skew())


## Categorical distributions + check levels in train vs test

In [None]:
for c in categorical_cols:
    # Bar chart of how often each level occurs in train.
    fig, ax = plt.subplots(figsize=(7, 3))
    trn[c].value_counts().plot(kind="bar", ax=ax)
    ax.set_title(f"{c} frequency (train)")
    ax.set_xlabel(c)
    ax.set_ylabel("count")
    fig.tight_layout()
    fig.savefig(FIG_DIR / f"{c}_freq_train.png")
    plt.show()

    # Compare the sets of non-missing levels seen in each split.
    train_levels = set(trn[c].dropna())
    test_levels = set(tst[c].dropna())
    only_in_train = sorted(train_levels - test_levels)
    only_in_test = sorted(test_levels - train_levels)

    print(f"\n[{c}] levels: train={len(train_levels)} test={len(test_levels)}")
    if only_in_train:
        print("  Levels only in TRAIN:", only_in_train)
    if only_in_test:
        print("  Levels only in TEST :", only_in_test)


## Outcome by category (boxplots)

In [None]:
for c in categorical_cols:
    # Levels in order of first appearance; computed once so the boxes and
    # their tick labels are guaranteed to line up.
    labels = list(trn[c].dropna().unique())
    # One array of outcomes per level. Missing outcomes are dropped because a
    # NaN inside a group would turn that group's quartiles into NaN.
    groups = [trn.loc[trn[c] == level, target].dropna().values for level in labels]

    plt.figure(figsize=(8,4))
    plt.boxplot(groups, showfliers=False)
    plt.title(f"Outcome by {c} (train)")
    plt.xlabel(c)
    plt.ylabel("outcome")
    # Label the ticks here instead of via boxplot(labels=...), which Matplotlib
    # 3.9 deprecated in favour of tick_labels; boxes sit at positions 1..N by
    # default, so this works on both older and newer versions.
    plt.xticks(range(1, len(labels) + 1), labels, rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"outcome_by_{c}_box.png"))
    plt.show()


## Numeric feature distributions (hist grid)

In [None]:
# There can be many numeric columns, so they are plotted in chunks to avoid huge figures.
def hist_grid(df, cols, ncols=4, bins=30, title_prefix="", save_name=None):
    """Draw one histogram per column on a grid of subplots and show the figure.

    Args:
        df: Table holding the data; each entry of ``cols`` is looked up as
            ``df[col]`` and its missing values are dropped before plotting.
        cols: Column names to plot, one subplot each, filled row by row.
        ncols: Number of subplots per grid row.
        bins: Forwarded to ``hist`` for every subplot.
        title_prefix: Text used as the figure-level title.
        save_name: File name under ``FIG_DIR`` to save the figure to; the
            figure is not saved when this is falsy.

    Returns:
        None. When ``cols`` is empty nothing is drawn, shown or saved.
    """
    n = len(cols)
    if n == 0:
        # No columns means zero grid rows and therefore a zero-height figure;
        # skip instead of producing a degenerate plot.
        return
    nrows = int(np.ceil(n / ncols))
    fig = plt.figure(figsize=(ncols * 3.3, nrows * 2.6))
    for i, col in enumerate(cols, 1):
        ax = fig.add_subplot(nrows, ncols, i)
        ax.hist(df[col].dropna(), bins=bins)
        ax.set_title(col, fontsize=9)
        ax.tick_params(labelsize=8)
    # y slightly above 1 keeps the figure title clear of the top row's titles.
    fig.suptitle(title_prefix, y=1.02)
    fig.tight_layout()
    if save_name:
        # bbox_inches="tight" so the raised suptitle is not cropped from the file.
        fig.savefig(FIG_DIR / save_name, bbox_inches="tight")
    plt.show()

# Plot every numeric column, 24 per figure, so no column is silently skipped
# however many there are. Files are numbered from 1:
# numeric_hist_subset1.png, numeric_hist_subset2.png, ...
for chunk_no, start in enumerate(range(0, len(numeric_cols), 24), 1):
    hist_grid(
        num_train,
        numeric_cols[start:start + 24],
        title_prefix="Numeric feature distributions (subset)",
        save_name=f"numeric_hist_subset{chunk_no}.png",
    )


## Correlation with target (numeric only)

In [None]:
# Correlation of each numeric feature with the target, strongest (by absolute
# value) first.
corr_matrix = trn[numeric_cols + [target]].corr(numeric_only=True)
corr = corr_matrix[target].drop(target)
corr = corr.sort_values(key=abs, ascending=False)

top20 = corr.head(20)
display(top20)

# Horizontal bars for those 20, re-sorted by signed value for the plot.
fig, ax = plt.subplots(figsize=(7, 6))
top20.sort_values().plot(kind="barh", ax=ax)
ax.set_title("Top 20 numeric correlations with outcome (train)")
ax.set_xlabel("Pearson correlation")
fig.tight_layout()
fig.savefig(FIG_DIR / "top_corr_with_outcome.png")
plt.show()


## Correlation heatmap (top correlated numeric features)

In [None]:
# Pairwise correlations among the topN features most correlated with the
# target (by absolute value), plus the target itself.
topN = 18
top_feats = list(corr.index[:topN])
C = trn[top_feats + [target]].corr(numeric_only=True)

plt.figure(figsize=(8,6))
# Correlations lie in [-1, 1]. Pin the colour scale to that range and use a
# diverging map so that 0 is the neutral colour and equal positive and
# negative correlations appear equally strong; an auto-ranged sequential map
# does neither.
plt.imshow(C.values, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(range(len(C.columns)), C.columns, rotation=90, fontsize=8)
plt.yticks(range(len(C.index)), C.index, fontsize=8)
plt.colorbar(label="Pearson correlation")
plt.title(f"Correlation heatmap (top {topN} + outcome)")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "corr_heatmap_top.png"))
plt.show()


## Scatter plots vs outcome for top correlated numeric features

In [None]:
# One scatter plot of the target against each of the first six top features.
top_scatter = top_feats[:6]

for col in top_scatter:
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(trn[col], trn[target], s=8, alpha=0.35)
    ax.set_title(f"Outcome vs {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("outcome")
    fig.tight_layout()
    fig.savefig(FIG_DIR / f"scatter_outcome_{col}.png")
    plt.show()


## Outlier check (IQR) for key numeric variables

In [None]:
# Candidate columns for the outlier check, keeping only those that are
# present among the numeric features.
key_cols = [
    name
    for name in ("carat", "price", "x", "y", "z", "depth", "table")
    if name in numeric_cols
]

def iqr_outlier_rate(s: pd.Series) -> float:
    """Return the fraction of values in *s* outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of *s*. A value lying exactly on a fence
    is not counted as an outlier.
    """
    lower_q, upper_q = s.quantile(0.25), s.quantile(0.75)
    reach = 1.5 * (upper_q - lower_q)
    too_low = s < lower_q - reach
    too_high = s > upper_q + reach
    return (too_low | too_high).mean()

# Share of IQR outliers per key column (missing values excluded), highest first.
outlier_rates = {c: iqr_outlier_rate(trn[c].dropna()) for c in key_cols}
display(pd.Series(outlier_rates).sort_values(ascending=False))

# Boxplots (hide fliers for readability).
# DataFrame.plot opens a figure of its own unless it is handed an Axes, so the
# axes are created first and passed in. A bare plt.figure(...) before the call
# would stay blank and its figsize would not apply to the boxplot.
fig, ax = plt.subplots(figsize=(9, 4))
trn[key_cols].plot(kind="box", showfliers=False, ax=ax)
ax.set_title("Key numeric features (boxplot, fliers hidden)")
fig.tight_layout()
fig.savefig(FIG_DIR / "key_numeric_boxplots.png")
plt.show()


## Train vs Test distribution drift check (numeric)

In [None]:
# Side-by-side summary statistics of the numeric features in each split;
# large shifts between train and test might affect generalization.
stat_names = ["mean", "std", "min", "max"]
train_stats = num_train.describe().loc[stat_names].T
test_stats = num_test.describe().loc[stat_names].T

drift = train_stats.add_suffix("_train").join(test_stats.add_suffix("_test"))
drift["mean_diff"] = drift["mean_test"] - drift["mean_train"]
drift["std_diff"] = drift["std_test"] - drift["std_train"]

# Show the 20 features whose mean moved the most in absolute terms.
by_mean_shift = drift.sort_values("mean_diff", key=abs, ascending=False)
display(by_mean_shift.head(20))


## Train vs Test: quick overlay hist for a few important features

In [None]:
# Features to overlay, restricted to those present among the numeric columns.
compare_cols = ["carat", "price", "x", "y", "z"]
compare_cols = [c for c in compare_cols if c in numeric_cols]

for c in compare_cols:
    train_vals = num_train[c].dropna()
    test_vals = num_test[c].dropna()
    # Shared bin edges over both splits, so the two histograms use identical
    # bins and their bars can be compared one to one.
    edges = np.histogram_bin_edges(pd.concat([train_vals, test_vals]), bins=40)

    plt.figure(figsize=(6,4))
    # density=True normalises each histogram to unit area. With raw counts the
    # overlay would mostly show the difference in row counts between the
    # splits rather than a difference in distribution shape.
    plt.hist(train_vals, bins=edges, alpha=0.5, density=True, label="train")
    plt.hist(test_vals,  bins=edges, alpha=0.5, density=True, label="test")
    plt.title(f"Train vs Test distribution: {c}")
    plt.xlabel(c)
    plt.ylabel("density")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, f"train_vs_test_{c}.png"))
    plt.show()


## Report Helper

In [None]:
# Headline numbers for the report, gathered into one dict and printed,
# followed by the ten strongest numeric correlations with the target.
n_train_rows, n_train_cols = trn.shape
n_test_rows, n_test_cols = tst.shape
outcome = trn[target]

summary = {
    "train_rows": n_train_rows,
    "train_cols": n_train_cols,
    "test_rows": n_test_rows,
    "test_cols": n_test_cols,
    "missing_train_total": int(trn.isna().sum().sum()),
    "missing_test_total": int(tst.isna().sum().sum()),
    "dup_rows_train": int(trn.duplicated().sum()),
    "outcome_mean": float(outcome.mean()),
    "outcome_std": float(outcome.std()),
    "outcome_skew": float(outcome.skew()),
}

print(summary)

print("\nTop 10 numeric correlations with outcome:")
display(corr.head(10))
