In [None]:
# ============================================================
# Home Credit Default Risk — Baseline Model (LogReg + OneHot)
# Robust, reproducible, VS Code/Jupyter friendly
# ============================================================

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Seed passed to both the logistic regression and the CV splitter below.
RANDOM_STATE = 42
# Name of the label column; it is split off the training frame as y.
TARGET_COL = "TARGET"
# Name of the row-identifier column; it is dropped from the features and
# written back out as the key of the submission file.
ID_COL = "SK_ID_CURR"

# ----------------------------
# 1) Load data (expects ./data/)
# ----------------------------
# Both input CSVs are expected directly inside ./data/.
DATA_DIR = Path("./data")
train_path, test_path = (
    DATA_DIR / csv_name
    for csv_name in ("application_train.csv", "application_test.csv")
)

# Stop early, showing the resolved locations, if either file is absent.
if not (train_path.exists() and test_path.exists()):
    expected_paths = "".join(
        f"  {csv_path.resolve()}\n" for csv_path in (train_path, test_path)
    )
    raise FileNotFoundError(
        "Missing CSVs.\n"
        "Expected:\n"
        + expected_paths
        + "Fix: create ./data/ and put application_train.csv + application_test.csv inside."
    )

train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# Schema sanity: the label must be present in train, the ID in both frames.
if TARGET_COL not in train_df:
    raise ValueError("TARGET column not found in training set.")
if not all(ID_COL in frame for frame in (train_df, test_df)):
    raise ValueError("SK_ID_CURR missing from train/test.")

for label, frame in (("Train shape:", train_df), ("Test shape :", test_df)):
    print(label, frame.shape)

# Work on copies so the raw frames stay untouched; popping the label column
# off the training copy yields y and leaves the feature matrix behind.
X = train_df.copy()
y = X.pop(TARGET_COL).astype(int)
X_test = test_df.copy()

# ----------------------------
# 2) Minimal, defensible cleaning
#    - Handle DAYS_EMPLOYED sentinel anomaly if present
# ----------------------------
# DAYS_EMPLOYED carries the sentinel value 365243 on some rows. Record those
# rows in a 0/1 indicator column, then blank the sentinel so the numeric
# imputer downstream treats it as missing.
if "DAYS_EMPLOYED" in X.columns:
    sentinel = 365243
    for df in (X, X_test):
        # Compute the row mask once and reuse it for both steps.
        is_anom = df["DAYS_EMPLOYED"] == sentinel
        df["DAYS_EMPLOYED_ANOM"] = is_anom.astype(int)
        # Replace the whole column with a masked copy (NaN on flagged rows)
        # instead of writing NaN through .loc: when the column was parsed as
        # integers, the .loc write is an incompatible-dtype assignment that
        # pandas >= 2.1 warns about and newer releases may reject.
        df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].mask(is_anom)

# Keep the row identifier out of both feature matrices so the model cannot
# fit on it. errors="ignore" turns the drop into a no-op when the column is
# already absent.
X = X.drop(columns=[ID_COL], errors="ignore")
X_test = X_test.drop(columns=[ID_COL], errors="ignore")

# ----------------------------
# 3) Column typing
# ----------------------------
# Pick the numeric features explicitly (number and bool dtypes) and treat
# every remaining column as categorical. Keying on the `object` dtype alone
# would send `category` or pandas string dtype columns to the numeric branch,
# where median imputation cannot handle them.
numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
numeric_lookup = set(numeric_cols)  # constant-time membership test
categorical_cols = [c for c in X.columns if c not in numeric_lookup]

print(f"# numeric cols: {len(numeric_cols)} | # categorical cols: {len(categorical_cols)}")

# ----------------------------
# 4) Preprocessing pipeline (robust)
#    - Numeric: median impute + scale
#    - Categorical: most-frequent impute + one-hot
# ----------------------------
def make_ohe():
    """Build a sparse, unknown-tolerant OneHotEncoder on any sklearn version.

    The keyword that requests sparse output is tried as ``sparse_output``
    first; if the constructor rejects it with ``TypeError``, the ``sparse``
    spelling is used instead.
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=True, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=True, **shared_kwargs)
    return encoder

# Numeric branch: fill gaps with the column median, then scale.
# with_mean=False means the columns are scaled but not centred.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
]
numeric_pipe = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", make_ohe()),
]
categorical_pipe = Pipeline(categorical_steps)

# Route each column list to its branch; columns in neither list are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols),
    ],
    remainder="drop",
)

# ----------------------------
# 5) Baseline model
#    LogisticRegression with saga handles sparse one-hot well
# ----------------------------
# L2-regularised logistic regression fitted with the saga solver.
logreg_params = dict(
    solver="saga",
    penalty="l2",
    C=1.0,
    max_iter=3000,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
baseline_model = LogisticRegression(**logreg_params)

# End-to-end estimator: preprocessing followed by the classifier, so the
# transformers are re-fitted on whatever data the pipeline is fitted on.
clf = Pipeline([
    ("preprocess", preprocess),
    ("model", baseline_model),
])

# ----------------------------
# 6) 5-fold Stratified CV ROC-AUC
# ----------------------------
# Shuffled, seeded 5-fold stratified splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# error_score="raise" surfaces a failing fold immediately instead of
# recording a placeholder score for it.
scores = cross_val_score(
    clf, X, y, scoring="roc_auc", cv=cv, n_jobs=-1, error_score="raise"
)

# Summary statistics across the folds, as plain Python floats.
mean_auc, std_auc = float(scores.mean()), float(scores.std())

print("\n5-fold Stratified CV ROC-AUC:", scores.round(6))
print("Mean ROC-AUC:", mean_auc)
print("Std  ROC-AUC:", std_auc)

# ----------------------------
# 7) Fit full model + create submission
# ----------------------------
# Refit the full pipeline on all training rows before scoring the test set.
clf.fit(X, y)

# Column 1 of predict_proba is taken as the probability of the positive class.
test_proba = clf.predict_proba(X_test)
test_pred = test_proba[:, 1]

# Submission layout: the ID column from the raw test frame plus the
# predicted probability under the target column name.
submission = test_df[[ID_COL]].assign(**{TARGET_COL: test_pred})

out_path = Path("submission_baseline_logreg.csv")
submission.to_csv(out_path, index=False)

print(f"\nSaved submission file: {out_path.resolve()}")
submission.head()


Train shape: (307511, 122)
Test shape : (48744, 121)
# numeric cols: 105 | # categorical cols: 16

5-fold Stratified CV ROC-AUC: [0.74178  0.752155 0.746526 0.750948 0.740897]
Mean ROC-AUC: 0.746461077383196
Std  ROC-AUC: 0.004591884832519444
