In [2]:
import joblib
import pandas as pd
# import optuna
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
from sklearn.metrics import f1_score, recall_score, precision_score
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.over_sampling import ADASYN
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = joblib.load("../src/cleaned_loan_df.pkl")

In [5]:
df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [6]:
# Define Feature Groups

binary_features = ['HasCoSigner', 'HasMortgage', 'HasDependents']
binary_order = [["No", "Yes"]] * len(binary_features)

numeric_features = ['Age', 'Income', 'LoanAmount', 'CreditScore',
                    'MonthsEmployed', 'NumCreditLines', 'InterestRate',
                    'LoanTerm', 'DTIRatio']

categorical_features = ['EmploymentType', 'MaritalStatus', 'LoanPurpose']
ordinal_feature = ['Education']
education_order = [["High School", "Bachelor's", "Master's", "PhD"]]



In [7]:
features_dict = {
    "binary_features": binary_features,
    "binary_order": binary_order,
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
    "ordinal_feature": ordinal_feature,
    "education_order": education_order
}

joblib.dump(features_dict, "../src/features_config.pkl")


['../src/features_config.pkl']

In [9]:

# Encoding (leave numeric features untouched for now)
encoding = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features),
    ("bin", OrdinalEncoder(categories=binary_order), binary_features),
    ("ord", OrdinalEncoder(categories=education_order), ordinal_feature),
])

# Scaling: will apply AFTER SMOTE
scaling = ColumnTransformer([
    ("scale", StandardScaler(), slice(0, len(numeric_features)))
], remainder="passthrough")  # All others stay untouched

# Save the transformers
joblib.dump(encoding, "../src/encoder.pkl")
joblib.dump(scaling, "../src/scaler.pkl")



['../src/scaler.pkl']

In [11]:
X = df.drop("Default", axis=1)
y = df["Default"]
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=42)

In [12]:
encoding.fit_transform(X_train)

array([[ 0.  ,  1.  ,  0.  , ..., 24.99, 24.  ,  0.88],
       [ 0.  ,  0.  ,  1.  , ..., 23.17, 60.  ,  0.87],
       [ 0.  ,  0.  ,  1.  , ...,  5.03, 36.  ,  0.44],
       ...,
       [ 0.  ,  0.  ,  0.  , ...,  9.72, 60.  ,  0.24],
       [ 0.  ,  1.  ,  0.  , ...,  4.31, 48.  ,  0.3 ],
       [ 1.  ,  0.  ,  0.  , ..., 23.14, 24.  ,  0.83]])