# Feature Engineering

Transform cleaned data into model-ready features: one-hot encoding, standardization, train/val/test split, and SMOTE for class balance.

In [1]:
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Configuration
# Column roles: the target to predict and the identifier column that is
# removed from the feature matrix when present.
TARGET_COL = "status"
ID_COL = "id"

# Input and output locations, all resolved relative to DATA_DIR.
DATA_DIR = "../data"
INPUT_FILE = os.path.join(DATA_DIR, "cleaned_loan_data.csv")
TRAIN_FILE = os.path.join(DATA_DIR, "train.csv")
VAL_FILE = os.path.join(DATA_DIR, "val.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")

# Columns routed to the StandardScaler step of the preprocessor.
NUMERIC_COLS = [
    "term",
    "credit_score",
    "ltv",
    "dtir1",
    "loan_amount",
    "income",
    "property_value",
    "year",
]

# Columns routed to the OneHotEncoder step of the preprocessor.
CATEGORICAL_COLS = [
    "loan_limit",
    "gender",
    "approv_in_adv",
    "loan_type",
    "loan_purpose",
    "credit_worthiness",
    "open_credit",
    "business_or_commercial",
    "neg_ammortization",
    "interest_only",
    "lump_sum_payment",
    "construction_type",
    "occupancy_type",
    "secured_by",
    "total_units",
    "credit_type",
    "co-applicant_credit_type",
    "age",
    "submission_of_application",
    "region",
    "security_type",
]

In [2]:
def load_clean_data(path):
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The parsed file as a ``pandas.DataFrame``, using pandas' default
        parsing options.
    """
    loaded_frame = pd.read_csv(path)
    return loaded_frame

In [3]:
def build_preprocessor():
    """Create the unfitted ColumnTransformer used to featurize the data.

    Columns named in NUMERIC_COLS are passed through a ``StandardScaler``.
    Columns named in CATEGORICAL_COLS are one-hot encoded into a dense array
    (``sparse_output=False``); ``handle_unknown="ignore"`` is set so that
    categories not seen while fitting do not raise at transform time.

    Returns:
        A new ``ColumnTransformer`` with a "num" step followed by a "cat" step.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), NUMERIC_COLS),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                CATEGORICAL_COLS,
            ),
        ]
    )

In [4]:
def train_val_test_split(X, y, random_state=42, val_size=0.10, test_size=0.10):
    """Split features and target into stratified train/validation/test sets.

    The split is done in two stratified passes: first the combined
    validation+test portion is held out from the training data, then that
    held-out portion is divided into validation and test. With the default
    sizes this is an 80/10/10 split (hold out 0.20, then halve it).

    Args:
        X: Feature table to split.
        y: Target values aligned with ``X``; used for stratification.
        random_state: Seed passed to both ``train_test_split`` calls.
        val_size: Fraction of the full dataset assigned to validation.
        test_size: Fraction of the full dataset assigned to test.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``val_size`` or ``test_size`` is not positive, or if
            together they leave nothing for training.
    """
    if val_size <= 0 or test_size <= 0 or val_size + test_size >= 1:
        raise ValueError(
            "val_size and test_size must be positive and sum to less than 1; "
            f"got val_size={val_size}, test_size={test_size}"
        )

    # Fraction of the full dataset that is NOT used for training.
    holdout_size = val_size + test_size

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, stratify=y, random_state=random_state
    )

    # Re-express test_size as a fraction of the held-out portion.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=test_size / holdout_size,
        stratify=y_temp,
        random_state=random_state,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
def apply_smote(X_train, y_train, random_state=42):
    """Oversample the training data with SMOTE to address class imbalance.

    Intended for the training split only; validation and test data should
    not be passed through this function.

    NOTE(review): in this notebook the input is already one-hot encoded, and
    SMOTE generates synthetic rows by interpolation, so indicator columns in
    synthetic rows may presumably take non-binary values. Confirm this is
    acceptable downstream, or evaluate SMOTENC on the pre-encoded data.

    Args:
        X_train: Training features.
        y_train: Training target values.
        random_state: Seed passed to the ``SMOTE`` sampler.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by ``SMOTE.fit_resample``.
    """
    sampler = SMOTE(random_state=random_state)
    features_balanced, target_balanced = sampler.fit_resample(X_train, y_train)
    return features_balanced, target_balanced

In [6]:
# Load data and separate the target from the feature columns
df = load_clean_data(INPUT_FILE)
y = df[TARGET_COL]

# Features are everything except the target; the identifier column is also
# removed when the file contains one.
features_without_target = df.drop(columns=[TARGET_COL])
X = (
    features_without_target.drop(columns=[ID_COL])
    if ID_COL in features_without_target.columns
    else features_without_target
)

target_counts = y.value_counts().to_dict()
print(f"Data loaded: {X.shape}")
print(f"Target distribution: {target_counts}")

Data loaded: (148670, 29)
Target distribution: {0: 112031, 1: 36639}


In [7]:
# Validate feature columns against the declared NUMERIC/CATEGORICAL lists
expected_cols = set([*NUMERIC_COLS, *CATEGORICAL_COLS])
actual_cols = set(X.columns)

missing = expected_cols - actual_cols
extra = actual_cols - expected_cols

# Declared-but-absent columns are fatal; undeclared columns are only reported.
if missing:
    raise ValueError(f"Columns listed in NUMERIC/CATEGORICAL_COLS not in data: {missing}")
if extra:
    print(f"Warning: extra columns not in NUMERIC/CATEGORICAL_COLS: {extra}")

In [8]:
# Split into train/val/test (features first, then the matching targets)
split_parts = train_val_test_split(X, y)
X_train_raw, X_val_raw, X_test_raw = split_parts[:3]
y_train, y_val, y_test = split_parts[3:]

# Report the number of rows that ended up in each split.
n_train_rows = X_train_raw.shape[0]
n_val_rows = X_val_raw.shape[0]
n_test_rows = X_test_raw.shape[0]
print(f"Train: {n_train_rows} samples")
print(f"Val: {n_val_rows} samples")
print(f"Test: {n_test_rows} samples")

Train: 118936 samples
Val: 14867 samples
Test: 14867 samples


In [9]:
# Build the preprocessor and fit it on the training split only;
# validation and test are transformed with the already-fitted preprocessor.
preprocessor = build_preprocessor()
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_val_processed, X_test_processed = (
    preprocessor.transform(raw_split) for raw_split in (X_val_raw, X_test_raw)
)

n_processed_features = X_train_processed.shape[1]
print(f"Features after preprocessing: {n_processed_features}")

Features after preprocessing: 67


In [10]:
# Recover feature names after one-hot encoding.
# Names follow the order of the preprocessor's steps: the numeric columns
# ("num") first, then the expanded one-hot columns ("cat").
ohe = preprocessor.named_transformers_["cat"]
num_feature_names = NUMERIC_COLS
cat_feature_names = list(ohe.get_feature_names_out(CATEGORICAL_COLS))
all_feature_names = num_feature_names + cat_feature_names


def _as_feature_frame(values, like):
    """Wrap processed values in a DataFrame that keeps the row index of ``like``."""
    return pd.DataFrame(values, columns=all_feature_names, index=like.index)


# Convert to DataFrames
X_train_df = _as_feature_frame(X_train_processed, X_train_raw)
X_val_df = _as_feature_frame(X_val_processed, X_val_raw)
X_test_df = _as_feature_frame(X_test_processed, X_test_raw)

In [11]:
# Apply SMOTE to training set only; validation and test are left untouched
X_train_balanced, y_train_balanced = apply_smote(X_train_df, y_train)
X_train_balanced_df = pd.DataFrame(X_train_balanced, columns=all_feature_names)

# Summarize the resampled training set.
n_balanced_rows = X_train_balanced_df.shape[0]
balanced_class_counts = pd.Series(y_train_balanced).value_counts().to_dict()
print(f"Training set after SMOTE: {n_balanced_rows} samples")
print(f"Class distribution: {balanced_class_counts}")

Training set after SMOTE: 179250 samples
Class distribution: {0: 89625, 1: 89625}


In [12]:
# Save train/val/test splits, each with the target added as a column
os.makedirs(DATA_DIR, exist_ok=True)

# assign() returns a new frame, so the feature DataFrames stay unchanged.
train_out = X_train_balanced_df.assign(**{TARGET_COL: y_train_balanced})
val_out = X_val_df.assign(**{TARGET_COL: y_val})
test_out = X_test_df.assign(**{TARGET_COL: y_test})

# Write in train, val, test order; the row index is not written.
for split_frame, split_path in (
    (train_out, TRAIN_FILE),
    (val_out, VAL_FILE),
    (test_out, TEST_FILE),
):
    split_frame.to_csv(split_path, index=False)

print(f"Saved train set to {TRAIN_FILE} (shape: {train_out.shape})")
print(f"Saved validation set to {VAL_FILE} (shape: {val_out.shape})")
print(f"Saved test set to {TEST_FILE} (shape: {test_out.shape})")

Saved train set to ../data/train.csv (shape: (179250, 68))
Saved validation set to ../data/val.csv (shape: (14867, 68))
Saved test set to ../data/test.csv (shape: (14867, 68))


In [13]:
# Save preprocessor for future use (DiCE counterfactuals)
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Persist the fitted preprocessor so later steps can apply the same transform.
preprocessor_path = os.path.join(MODELS_DIR, "preprocessor.pkl")
joblib.dump(preprocessor, preprocessor_path)
print(f"Saved preprocessor to {preprocessor_path}")

# Save feature names for reference, one name per line.
# The encoding is explicit so the file's bytes do not depend on the platform's
# default locale (one-hot names embed raw category values from the data).
feature_names_path = os.path.join(MODELS_DIR, "feature_names.txt")
with open(feature_names_path, "w", encoding="utf-8") as f:
    f.writelines(f"{fname}\n" for fname in all_feature_names)
print(f"Saved feature names to {feature_names_path}")

Saved preprocessor to ../models/preprocessor.pkl
Saved feature names to ../models/feature_names.txt
