Step 1: Load Data

We load the feature-engineered dataset using paths from config.yaml. This ensures our pipeline is path-agnostic and reproducible

In [3]:
# Imports
import pandas as pd
import numpy as np
import yaml
from pathlib import Path

# Sklearn & pipeline bits
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Save artifacts
import joblib

# Load config & dataset
with open('../config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

featured_path = Path(config['featured_data_path'])
df = pd.read_csv(featured_path)

print("Loaded feature-engineered dataset:", df.shape)
df.head()

Loaded feature-engineered dataset: (1344079, 22)


Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,purpose,...,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,fico_range_low,fico_range_high,loan_status,credit_history_years
0,3600.0,36,13.99,123.03,C,10.0,MORTGAGE,55000.0,Not Verified,debt_consolidation,...,1.0,7.0,0.0,2765.0,29.7,13.0,675.0,679.0,0,22
1,24700.0,36,11.99,820.28,C,10.0,MORTGAGE,65000.0,Not Verified,small_business,...,4.0,22.0,0.0,21470.0,19.2,38.0,715.0,719.0,0,26
2,20000.0,60,10.78,432.66,B,10.0,MORTGAGE,63000.0,Not Verified,home_improvement,...,0.0,6.0,0.0,7869.0,56.2,18.0,695.0,699.0,0,25
3,10400.0,60,22.45,289.91,F,3.0,MORTGAGE,104433.0,Source Verified,major_purchase,...,3.0,12.0,0.0,21929.0,64.5,35.0,695.0,699.0,0,27
4,11950.0,36,13.44,405.18,C,4.0,RENT,34000.0,Source Verified,debt_consolidation,...,0.0,5.0,0.0,8822.0,68.4,6.0,690.0,694.0,0,38


Step 2: Target & Feature Groups

We split features into:

Numeric: impute by median, scale

Ordinal categorical (grade): encoded with ordered mapping A→G

Nominal categorical (home_ownership, verification_status, purpose): one-hot encoded

In [4]:
# Target
TARGET = 'loan_status'
y = df[TARGET].astype(int)

# Feature lists
# Numeric (already numeric in FE): keep all numeric except the target
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
if TARGET in numeric_features:
    numeric_features.remove(TARGET)

# Categorical — split ordinal vs nominal
# Ordinal: 'grade' (A < B < ... < G)
ordinal_features = ['grade']

# Nominal categoricals: no natural order
nominal_features = ['home_ownership', 'verification_status', 'purpose']

# Sanity check: make sure they exist
print("Numeric features:", numeric_features[:10], '... total:', len(numeric_features))
print("Ordinal features:", ordinal_features)
print("Nominal features:", nominal_features)

# X matrix
X = df.drop(columns=[TARGET])

# Step: Save feature info for later steps (e.g., Feature Importance in 06)
import json
feature_info = {
    "numeric_features": numeric_features,
    "ordinal_features": ordinal_features,
    "nominal_features": nominal_features
}
with open('../config/feature_info.json', 'w') as f:
    json.dump(feature_info, f)
print("Feature info saved to ../config/feature_info.json")

Numeric features: ['loan_amnt', 'term', 'int_rate', 'installment', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc'] ... total: 17
Ordinal features: ['grade']
Nominal features: ['home_ownership', 'verification_status', 'purpose']
Feature info saved to ../config/feature_info.json


Step 3: Train/Test Split

We split with stratification to preserve the default/non-default ratio in both sets. This matters because the dataset is imbalanced.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Class balance (train):")
print(y_train.value_counts(normalize=True).round(3))


Train shape: (1075263, 21)  Test shape: (268816, 21)
Class balance (train):
loan_status
0    0.8
1    0.2
Name: proportion, dtype: float64


Step 4: Column Pipelines

Numerics → impute median (robust to outliers), then scale (standardize).

Ordinal (grade) → impute then OrdinalEncoder with explicit order.

Nominal → impute then One-Hot Encoder with handle_unknown='ignore'.
We wrap all into a ColumnTransformer so the same preprocessing can be fitted once on train and reused everywhere.

In [6]:
# Ordinal order for grade (A best … G worst).
# You can decide whether to encode A=1..G=7 or the reverse; here we encode A..G ascending.
grade_order = [['A','B','C','D','E','F','G']]

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # turn off later if you only use tree models
])

ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordenc', OrdinalEncoder(categories=grade_order))
])

nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop=None))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('ord', ordinal_pipeline, ordinal_features),
        ('nom', nominal_pipeline, nominal_features),
    ],
    remainder='drop'  # drop unexpected columns
)

preprocessor


0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['A', 'B', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


Step 5: Fit & Transform

We fit the preprocessing only on the training set to avoid leakage, then transform both train and test sets.

In [7]:
# Fit preprocessor on train only (prevents leakage)
preprocessor.fit(X_train)

# Transform
X_train_proc = preprocessor.transform(X_train)
X_test_proc  = preprocessor.transform(X_test)

print("Processed shapes - X_train:", X_train_proc.shape, "  X_test:", X_test_proc.shape)


Processed shapes - X_train: (1075263, 41)   X_test: (268816, 41)


We save:

Processed matrices (X_train, X_test, y_train, y_test)
The fitted preprocessor (preprocessor.joblib)
This keeps `06_modeling.ipynb` notebook clean and reproducible — just load, fit a model, done.

In [8]:
# Create folders if they don't exist
Path("../data/processed").mkdir(parents=True, exist_ok=True)
Path("../models").mkdir(parents=True, exist_ok=True)

# Save arrays
np.save("../data/processed/X_train.npy", X_train_proc)
np.save("../data/processed/X_test.npy", X_test_proc)
np.save("../data/processed/y_train.npy", y_train.values)
np.save("../data/processed/y_test.npy", y_test.values)

# Save the fitted preprocessor
joblib.dump(preprocessor, "../models/preprocessor.joblib")

print("Saved processed matrices and preprocessor.")


Saved processed matrices and preprocessor.


Last check to verify: 
- No NaNs remain in the processed matrices
- Class ratios are similar in train vs test (stratification worked)

In [9]:
# Check imputation actually removed NaNs
import scipy  # only if you installed it; otherwise skip
print("Any NaNs left in X_train_proc?", np.isnan(X_train_proc).any())
print("Any NaNs left in X_test_proc?", np.isnan(X_test_proc).any())

# Verify stratification roughly preserved class ratio
print("Train class ratio:", y_train.mean().round(3))
print("Test  class ratio:", y_test.mean().round(3))


Any NaNs left in X_train_proc? False
Any NaNs left in X_test_proc? False
Train class ratio: 0.2
Test  class ratio: 0.2
