In [21]:


# --- Imports -----------------------------------------------------------------
# Standard library
import sys
from pathlib import Path

# Third-party: data handling and plotting.
# The bare `matplotlib` and `sklearn` packages (and `sys` above) are imported
# because the version report at the bottom of this cell reads their attributes;
# without them the cell raises NameError on a fresh kernel.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

# Third-party: scikit-learn components.
# NOTE(review): none of these names are referenced in the cells visible here;
# presumably they are used by later modelling cells -- confirm before pruning.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- Environment report --------------------------------------------------------
# Print interpreter and library versions so the run can be reproduced.
print("Python:", sys.version)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("sklearn:", sklearn.__version__)
print("matplotlib:", matplotlib.__version__)


Python: 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]
pandas: 1.5.3
numpy: 1.24.4
sklearn: 1.4.0
matplotlib: 3.8.3


In [22]:
from pathlib import Path

# Input locations: both application CSVs are looked up inside a "data" folder
# relative to the notebook's working directory.
DATA_DIR = Path("./data")
train_path, test_path = (
    DATA_DIR.joinpath(csv_name)
    for csv_name in ("application_train.csv", "application_test.csv")
)

In [None]:
# Notebook-wide constants.
RANDOM_STATE = 42           # seed value; not consumed by any cell visible in this section
TARGET_COL = "TARGET"       # label column looked up in the training frame
ID_COL = "SK_ID_CURR"       # identifier column expected in both train and test frames

import sys
import sklearn
import matplotlib

# Environment report: interpreter version number only (first token of
# sys.version), followed by the version of each library.
python_version = sys.version.split()[0]
print("Python:", python_version)
for version_label, library in (
    ("pandas:", pd),
    ("numpy:", np),
    ("sklearn:", sklearn),
    ("matplotlib:", matplotlib),
):
    print(version_label, library.__version__)

Python: 3.11.7
pandas: 1.5.3
numpy: 1.24.4
sklearn: 1.4.0
matplotlib: 3.8.3


In [24]:
# Fail fast, with instructions, unless both input CSVs are present on disk.
inputs_present = train_path.exists() and test_path.exists()
if not inputs_present:
    raise FileNotFoundError(
        "Could not find CSV files.\n"
        "Fix: place them here:\n"
        f"  {train_path.resolve()}\n"
        f"  {test_path.resolve()}\n"
        "Folder structure should be:\n"
        "  <project>/data/application_train.csv\n"
        "  <project>/data/application_test.csv"
    )

# Read train first, then test (same order as the paths are listed).
app_train, app_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

print("\nTrain shape:", app_train.shape)
print("Test shape :", app_test.shape)

# Schema sanity checks: the label must exist in train, the ID in both frames.
assert TARGET_COL in app_train.columns, "TARGET column missing in train file."
assert all(ID_COL in frame.columns for frame in (app_train, app_test)), "SK_ID_CURR missing."



Train shape: (307511, 122)
Test shape : (48744, 121)


In [25]:
# Label vector as integers. Its mean is reported as the default rate, which
# assumes TARGET is coded 0/1 (as the printed "TARGET=1" label suggests).
y = app_train[TARGET_COL].astype(int)
default_rate = float(y.mean())
default_rate_pct = default_rate * 100

print(f"\nDefault rate (TARGET=1): {default_rate:.6f} ({default_rate_pct:.3f}%)")

# Fraction of missing values per column, most incomplete columns first.
missing_pct = app_train.isna().mean().sort_values(ascending=False)
print("\nTop 15 columns by % missing:")
print(missing_pct.head(15).mul(100).round(2))

# Persist the full per-column profile (as percentages) next to the notebook.
missing_out = Path("missingness_profile.csv")
missing_pct.mul(100).to_csv(missing_out, header=["pct_missing"])
print(f"\nSaved missingness profile: {missing_out.resolve()}")


Default rate (TARGET=1): 0.080729 (8.073%)

Top 15 columns by % missing:
COMMONAREA_MEDI             69.87
COMMONAREA_AVG              69.87
COMMONAREA_MODE             69.87
NONLIVINGAPARTMENTS_MODE    69.43
NONLIVINGAPARTMENTS_AVG     69.43
NONLIVINGAPARTMENTS_MEDI    69.43
FONDKAPREMONT_MODE          68.39
LIVINGAPARTMENTS_MODE       68.35
LIVINGAPARTMENTS_AVG        68.35
LIVINGAPARTMENTS_MEDI       68.35
FLOORSMIN_AVG               67.85
FLOORSMIN_MODE              67.85
FLOORSMIN_MEDI              67.85
YEARS_BUILD_MEDI            66.50
YEARS_BUILD_MODE            66.50
dtype: float64

Saved missingness profile: C:\Users\snehs\Desktop\MSE-546\missingness_profile.csv


In [26]:

# Figure 1: TARGET class counts, each bar annotated with its share of all rows
fig1_path = Path("fig1_target_distribution.png")
counts = y.value_counts().sort_index()      # index order: class 0, then class 1
total = counts.sum()
pct = (counts / total * 100).round(2)

fig, ax = plt.subplots()
bars = ax.bar(["0 (repaid)", "1 (difficulty)"], counts.values)
ax.set_title("TARGET distribution (class imbalance)")
ax.set_ylabel("Count")

# Place each percentage label centred just above the top of its bar.
for bar, share in zip(bars, pct.values):
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar.get_height(), f"{share:.2f}%", ha="center", va="bottom")

fig.tight_layout()
fig.savefig(fig1_path, dpi=200)
plt.close(fig)  # the figure is written to disk only, not shown inline
print(f"Saved: {fig1_path.resolve()}")


# Missingness summary: how many columns have any gaps, and how many are more
# than half empty. The per-column fractions are recomputed from app_train here.
missing_pct = app_train.isna().mean().sort_values(ascending=False)
n_cols = len(app_train.columns)
cols_with_missing = int(missing_pct.gt(0).sum())
cols_missing_over_50 = int(missing_pct.gt(0.50).sum())

share_with_missing = cols_with_missing / n_cols * 100
share_over_50 = cols_missing_over_50 / n_cols * 100

print("\nMissingness summary:")
print(f"- Columns with any missing values: {cols_with_missing}/{n_cols} ({share_with_missing:.1f}%)")
print(f"- Columns with >50% missing values: {cols_missing_over_50}/{n_cols} ({share_over_50:.1f}%)")


# Figure 2: the 20 most incomplete columns as a horizontal bar chart
fig2_path = Path("fig2_missingness_top20.png")
# Take the 20 highest fractions, convert to percent, then sort ascending so the
# largest value is the last bar drawn by barh.
top20 = missing_pct.head(20).mul(100).sort_values()

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top20.index, top20.values)
ax.set_title("Top 20 features by % missing (application_train)")
ax.set_xlabel("% missing")
fig.tight_layout()
fig.savefig(fig2_path, dpi=200)
plt.close(fig)  # the figure is written to disk only, not shown inline
print(f"Saved: {fig2_path.resolve()}")


# Figure 3: DAYS_EMPLOYED sentinel anomaly
#
# Rows whose DAYS_EMPLOYED equals the sentinel are flagged in a new 0/1 column
# (DAYS_EMPLOYED_ANOM) and the sentinel itself is replaced with NaN, in both the
# train and test frames. A histogram of the remaining train values is saved.
#
# This section is safe to re-run: once the sentinel has been replaced by NaN a
# fresh equality test finds nothing, so the existing flag is carried forward
# instead of being overwritten with zeros.
fig3_path = Path("fig3_days_employed_hist.png")

if "DAYS_EMPLOYED" in app_train.columns:
    # Treated here as a placeholder value, not a genuine day count.
    sentinel = 365243

    # Add anomaly flag + replace sentinel with NaN.
    # NOTE(review): only app_train is checked for the column above; app_test is
    # assumed to contain DAYS_EMPLOYED as well (a KeyError is raised otherwise).
    for df in (app_train, app_test):
        is_sentinel = df["DAYS_EMPLOYED"] == sentinel
        if "DAYS_EMPLOYED_ANOM" in df.columns:
            # Re-run: keep rows flagged by a previous execution of this section.
            is_sentinel = is_sentinel | df["DAYS_EMPLOYED_ANOM"].astype(bool)
        df["DAYS_EMPLOYED_ANOM"] = is_sentinel.astype(int)
        # mask() returns a copy with flagged rows set to NaN, which avoids
        # assigning NaN into an integer-typed column through .loc.
        df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].mask(is_sentinel)

    # Report from the flag column so the numbers stay correct on re-runs.
    anom_mask = app_train["DAYS_EMPLOYED_ANOM"].astype(bool)
    anom_count = int(anom_mask.sum())
    anom_rate = float(anom_mask.mean())
    print(f"\nDAYS_EMPLOYED sentinel count (train): {anom_count} ({anom_rate*100:.3f}%)")

    # Histogram after cleaning (sentinel rows are NaN and therefore dropped).
    cleaned = app_train["DAYS_EMPLOYED"].dropna()

    plt.figure(figsize=(8, 4))
    plt.hist(cleaned, bins=60)
    plt.title("DAYS_EMPLOYED distribution (sentinel removed)")
    plt.xlabel("DAYS_EMPLOYED")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(fig3_path, dpi=200)
    plt.close()

    print(f"Saved: {fig3_path.resolve()}")
else:
    print("\nDAYS_EMPLOYED not found; skipping Figure 3.")


# Figure 4: Univariate signal -- overlaid per-class histograms of one EXT_SOURCE column
fig4_path = Path("fig4_extsource_signal.png")

# Use the first of these columns that exists in the training frame
# (checked in this order); None when none of them is present.
candidate = next(
    (
        name
        for name in ("EXT_SOURCE_2", "EXT_SOURCE_3", "EXT_SOURCE_1")
        if name in app_train.columns
    ),
    None,
)

if candidate is None:
    print("No EXT_SOURCE features found; skipping optional Figure 4.")
else:
    # Non-missing values of the chosen column, split by class label.
    s0 = app_train.loc[y == 0, candidate].dropna()
    s1 = app_train.loc[y == 1, candidate].dropna()

    fig, ax = plt.subplots(figsize=(8, 4))
    # Raw counts (not densities) are drawn, class 0 first and class 1 on top.
    for values, legend_label in ((s0, "TARGET=0"), (s1, "TARGET=1")):
        ax.hist(values, bins=50, alpha=0.7, label=legend_label)
    ax.set_title(f"{candidate} distribution by TARGET (univariate signal)")
    ax.set_xlabel(candidate)
    ax.set_ylabel("Count")
    ax.legend()
    fig.tight_layout()
    fig.savefig(fig4_path, dpi=200)
    plt.close(fig)  # the figure is written to disk only, not shown inline

    print(f"Saved: {fig4_path.resolve()}")

Saved: C:\Users\snehs\Desktop\MSE-546\fig1_target_distribution.png

Missingness summary:
- Columns with any missing values: 67/122 (54.9%)
- Columns with >50% missing values: 41/122 (33.6%)
Saved: C:\Users\snehs\Desktop\MSE-546\fig2_missingness_top20.png

DAYS_EMPLOYED sentinel count (train): 55374 (18.007%)
Saved: C:\Users\snehs\Desktop\MSE-546\fig3_days_employed_hist.png
Saved: C:\Users\snehs\Desktop\MSE-546\fig4_extsource_signal.png
