In [2]:
# ================================
# Notebook 3: Data Preprocessing (encoding, no scaling)
# ================================

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

# ---- Config ----
# Location of the raw CSV. This is an absolute, machine-specific path; adjust
# it before running on another machine.
DATA_PATH = "/Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/DATA/credit_risk_dataset.csv"
TARGET = "loan_status"   # 1 = default, 0 = repaid
TEST_SIZE = 0.20          # fraction of ALL rows held out as the test set
# Fraction of ALL rows used for validation. The split step rescales it to
# VAL_SIZE / (1 - TEST_SIZE) before applying it to the post-test remainder,
# so 0.20 here yields a 60/20/20 train/val/test partition.
VAL_SIZE  = 0.20
RANDOM_STATE = 42         # seed shared by both train_test_split calls

# Every artifact (CSVs, feature list, report) is written below this folder,
# which is resolved relative to the current working directory.
OUT_DIR = Path("artifacts_preprocessing")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- 1) Load ----------
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Loaded:", DATA_PATH, "| shape:", df.shape)
# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, which would let a missing target slip through to a later KeyError.
if TARGET not in df.columns:
    raise ValueError(f"{TARGET} not found!")

# Keep original order (just in case)
orig_cols = df.columns.tolist()

# ---------- 2) Clean column names (good feature names) ----------
def clean_name(s: str) -> str:
    """Return *s* normalised into a lowercase ASCII, underscore-separated name.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. spell out ``%`` as ``pct`` and turn ``/``, ``-`` and spaces into ``_``;
      3. replace every remaining character outside ``a-z``, ``0-9`` and ``_``
         with ``_``.

    Step 3 checks against an explicit ASCII whitelist. ``str.isalnum()`` is
    also true for non-ASCII letters and digits (for example ``é`` or ``²``),
    which would otherwise leak into the resulting feature names.

    Consecutive underscores are not collapsed, so distinct inputs such as
    ``"a-b"`` and ``"a b"`` map to the same output.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789_"
    s = s.strip().lower()
    s = s.replace("%", "pct").replace("/", "_").replace("-", "_").replace(" ", "_")
    return "".join(ch if ch in allowed else "_" for ch in s)

df.columns = [clean_name(c) for c in df.columns]

# Two different raw labels can clean to the same name (e.g. "a-b" and "a b").
# Duplicate labels make df[name] return a DataFrame instead of a Series, which
# breaks the per-column logic further down, so stop here with a clear message.
_duplicate_names = df.columns[df.columns.duplicated()].unique().tolist()
if _duplicate_names:
    raise ValueError(f"Column names collide after cleaning: {_duplicate_names}")

if TARGET not in df.columns:
    # The target label itself may have been altered by the cleaning above;
    # follow it to its cleaned spelling, or fail loudly if it is truly gone.
    _cleaned_target = clean_name(TARGET)
    if _cleaned_target not in df.columns:
        raise KeyError(
            f"Target column {TARGET!r} (cleaned: {_cleaned_target!r}) "
            "not found after cleaning column names"
        )
    TARGET = _cleaned_target

# ---------- 3) Basic target sanity ----------
# astype(int) raises if the target contains NaN/inf, so a dirty target column
# stops the run here rather than producing a corrupted label.
df[TARGET] = df[TARGET].astype(int)

# ---------- 4) Outlier handling (clip, but KEEP rows) ----------
def iqr_clip(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Clip *series* to the Tukey fences ``[Q1 - k*IQR, Q3 + k*IQR]``.

    Values outside the fences are replaced by the nearest fence; no rows are
    removed and the input series is not modified.

    Args:
        series: Numeric series to clip.
        k: Whisker multiplier applied to the inter-quartile range. The default
            of 1.5 reproduces the conventional box-plot fences; larger values
            clip less aggressively. Expected to be non-negative.

    Returns:
        A new series with the same index as *series*.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

# Hard ceilings for values that cannot be real. Only the offending cells are
# overwritten with the ceiling; rows are kept. Columns that are not present
# in the data are skipped.
for capped_col, ceiling in (("person_age", 100), ("person_emp_length", 80)):
    if capped_col not in df.columns:
        continue
    df.loc[df[capped_col] > ceiling, capped_col] = ceiling

# Pull in the tails of the skewed numeric columns using the IQR fences
# (extend the tuple below to cover more columns).
# NOTE(review): the fences are computed over all rows, before the
# train/val/test split in section 7, so held-out rows influence them.
for col in ("person_income", "loan_amnt"):
    if col not in df.columns:
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = iqr_clip(df[col])

# ---------- 5) Impute missing values ----------
# Strategy: numeric columns get their median, every other column gets its
# most frequent value. The target is left out of both groups.
# NOTE(review): medians/modes are computed over all rows, before the
# train/val/test split in section 7, so held-out rows influence the fills.
num_cols = [
    c for c in df.columns
    if c != TARGET and pd.api.types.is_numeric_dtype(df[c])
]
_not_categorical = set(num_cols) | {TARGET}
cat_cols = [c for c in df.columns if c not in _not_categorical]

# Numeric gaps -> column median
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

# Categorical gaps -> column mode; a column with no observed value at all
# falls back to the literal "missing". The result is always stored as str.
for c in cat_cols:
    observed_modes = df[c].mode(dropna=True)
    if observed_modes.empty:
        fill_value = "missing"
    else:
        fill_value = observed_modes.iloc[0]
    df[c] = df[c].fillna(fill_value).astype(str)

# ---------- 6) One-Hot Encode categoricals (no drop_first) ----------
# Every category level gets its own indicator column (nothing is dropped),
# which keeps the features easy to read; no scaling is applied because the
# downstream tree models do not need it.
df_encoded = (
    pd.get_dummies(df, columns=cat_cols, prefix=cat_cols, drop_first=False)
    if cat_cols
    else df.copy()
)

# Move the target to the right-most position; purely cosmetic.
cols = [name for name in df_encoded.columns if name != TARGET]
cols.append(TARGET)
df_encoded = df_encoded[cols]

print("After encoding shape:", df_encoded.shape)

# Save the full processed (encoded) dataset, target included.
processed_all_path = OUT_DIR / "processed_all.csv"
df_encoded.to_csv(processed_all_path, index=False)
print("Saved:", processed_all_path.resolve())

# Save the feature list: one column name per line, target excluded.
feature_cols = [c for c in df_encoded.columns if c != TARGET]
# One-hot column names embed raw category values, so pin the encoding instead
# of relying on the platform default (to_csv above already defaults to UTF-8).
with open(OUT_DIR / "feature_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{c}\n" for c in feature_cols)

# ---------- 7) Stratified Train/Val/Test splits (on encoded data) ----------
def _stratified_holdout(features, labels, holdout_fraction):
    """Split off `holdout_fraction` of the rows, stratifying on `labels`.

    Returns what train_test_split returns:
    (features_kept, features_held_out, labels_kept, labels_held_out).
    """
    return train_test_split(
        features,
        labels,
        test_size=holdout_fraction,
        stratify=labels,
        random_state=RANDOM_STATE,
    )


y = df_encoded[TARGET]
X = df_encoded.drop(columns=[TARGET])

# First cut: set the test rows aside.
X_train_val, X_test, y_train_val, y_test = _stratified_holdout(X, y, TEST_SIZE)

# Second cut: VAL_SIZE is expressed against the full dataset, so rescale it
# to the share of rows that remain once the test set has been removed.
val_ratio_of_trainval = VAL_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val = _stratified_holdout(
    X_train_val, y_train_val, val_ratio_of_trainval
)

# Glue each feature frame back together with its labels so every split is a
# single CSV with the same column layout (target last).
def _with_target(features, labels):
    """Return a copy of `features` with the label values added as TARGET."""
    combined = features.copy()
    combined[TARGET] = labels.values
    return combined


train_df = _with_target(X_train, y_train)
val_df = _with_target(X_val, y_val)
test_df = _with_target(X_test, y_test)

for split_file, split_frame in (
    ("train.csv", train_df),
    ("val.csv", val_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(OUT_DIR / split_file, index=False)

print("Train/Val/Test saved in:", OUT_DIR.resolve())
print("Train:", train_df.shape, "| Val:", val_df.shape, "| Test:", test_df.shape)

# ---------- 8) Quick report ----------
# Plain-text summary of what this notebook did, written next to the CSVs.
report_lines = [
    "Preprocessing report",
    # `df` has only been renamed, clipped and imputed in place (no rows or
    # columns added or removed), so its shape is still the shape of the raw
    # file. Re-reading the CSV with nrows=1 would always report a single row.
    f"Original shape: {df.shape}",
    f"Processed (encoded) shape: {df_encoded.shape}",
    f"Numeric cols (pre-encode): {len(num_cols)} | Categorical cols: {len(cat_cols)}",
    "Outlier handling: capped person_age > 100, person_emp_length > 80; IQR clip on person_income & loan_amnt",
    "Missing values: numeric->median, categorical->mode",
    "Encoding: One-Hot (no drop_first)",
    f"Files: {processed_all_path.name}, train.csv, val.csv, test.csv, feature_list.txt",
]

# Explicit encoding so the report is written identically on every platform.
with open(OUT_DIR / "report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines))

print("\n✅ Preprocessing complete. Use:")
print("  - artifacts_preprocessing/processed_all.csv (full encoded data)")
print("  - artifacts_preprocessing/train.csv, val.csv, test.csv (for modeling)")
print("Next: Modeling notebook will read these CSVs directly.")


Loaded: /Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/DATA/credit_risk_dataset.csv | shape: (32581, 12)
After encoding shape: (32581, 27)
Saved: /Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/NOTEBOOK/artifacts_preprocessing/processed_all.csv
Train/Val/Test saved in: /Users/mr.engineer/Desktop/Code/DSPROJECT/CREDITRISK/NOTEBOOK/artifacts_preprocessing
Train: (19548, 27) | Val: (6516, 27) | Test: (6517, 27)

✅ Preprocessing complete. Use:
  - artifacts_preprocessing/processed_all.csv (full encoded data)
  - artifacts_preprocessing/train.csv, val.csv, test.csv (for modeling)
Next: Modeling notebook will read these CSVs directly.
