# Cell A → Imports.

In [1]:
# Cell A — imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


# Cell B → Load cleaned dataset (loan_clean.csv).

In [2]:
# Cell B — read the cleaned loan table from disk, report its size, preview the first rows
df = pd.read_csv(filepath_or_buffer="../data/data_processed/loan_clean.csv")
print("Shape:", df.shape)
df.head(n=5)


Shape: (255347, 17)


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,target
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,1,Yes,Other,1,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,0,No,Other,1,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,1,Yes,Auto,0,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,0,No,Business,0,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,0,Yes,Auto,0,0


# Cell C → Separate features (X) and target (y).

In [3]:
# Cell C — pull the label column out as y; every remaining column becomes a feature in X
y = df["target"]
X = df.drop("target", axis="columns")

# Report dimensions and the class balance (as proportions, 3 decimals).
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Target distribution:\n", y.value_counts(normalize=True).round(3))


X shape: (255347, 16)
y shape: (255347,)
Target distribution:
 target
0    0.884
1    0.116
Name: proportion, dtype: float64


# Cell D → Stratified train/test split.

In [4]:
# Cell D — hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions the same in both splits;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Confirm split sizes and that both splits carry the same target proportions.
print("Train size:", X_train.shape, "Test size:", X_test.shape)
print("Train target distribution:\n", y_train.value_counts(normalize=True))
print("Test target distribution:\n", y_test.value_counts(normalize=True))



Train size: (204277, 16) Test size: (51070, 16)
Train target distribution:
 target
0    0.883873
1    0.116127
Name: proportion, dtype: float64
Test target distribution:
 target
0    0.883865
1    0.116135
Name: proportion, dtype: float64


# Cell E → Identify numeric vs categorical columns.

In [5]:
# Cell E — identify column types
#
# Columns are split by dtype into a numeric group and a categorical
# (object/category) group. Both lists are derived from the training split only.

numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Guard against silent feature loss: a column whose dtype matches neither
# selector (e.g. bool, datetime, or a pandas string dtype) ends up in neither
# list, and a ColumnTransformer fed only these two lists discards such columns
# under its default remainder="drop". Fail loudly here instead.
unassigned_cols = [
    col for col in X_train.columns
    if col not in numeric_cols and col not in categorical_cols
]
assert not unassigned_cols, (
    "Columns with a dtype that is neither numeric nor object/category: "
    f"{X_train[unassigned_cols].dtypes.astype(str).to_dict()}"
)

print("Numeric columns:", numeric_cols[:10], "... total:", len(numeric_cols))
print("Categorical columns:", categorical_cols[:10], "... total:", len(categorical_cols))


Numeric columns: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'HasMortgage'] ... total: 11
Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasDependents', 'LoanPurpose'] ... total: 5


# Cell F → Build preprocessing pipelines

In [6]:
# Cell F — preprocessing pipelines

# Categorical branch: fill missing values with the most frequent level,
# then one-hot encode. handle_unknown="ignore" means a level not seen
# during fit does not raise at transform time.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group to its branch ("num" first, then "cat").
preprocessor = ColumnTransformer([
    ("num", num_pipeline, numeric_cols),
    ("cat", cat_pipeline, categorical_cols),
])

print("✅ Preprocessor ready")


✅ Preprocessor ready


# Cell G — Fit the preprocessor on the training data, then transform both the train and test sets

In [7]:
# Cell G — sanity check: learn the preprocessing from the training split only
X_train_transformed = preprocessor.fit_transform(X_train)
print("Transformed train shape:", X_train_transformed.shape)

# The test split is transformed with the already-fitted preprocessor (no refit),
# so both outputs should have the same number of columns.
X_test_transformed = preprocessor.transform(X_test)
print("Transformed test shape:", X_test_transformed.shape)


Transformed train shape: (204277, 29)
Transformed test shape: (51070, 29)
