In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
# from imblearn.over_sampling import SMOTE  # Commented out due to version conflict
import joblib
import os

In [2]:
# Load Data
# Read the raw CSV (the path is relative to the current working directory)
# and report its dimensions before any cleaning is applied.
data_path = '../data/raw/credit_risk_dataset.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Original shape: {df.shape}")

Original shape: (32581, 12)


## Handle Duplicates

In [3]:
# Remove duplicates
# Keep the first occurrence of every fully identical row. The surviving rows
# keep their original index labels (the index is not reset here).
df = df.drop_duplicates(keep='first')
print(f"Shape after removing duplicates: {df.shape}")

Shape after removing duplicates: (32416, 12)


## Handle Missing Values

In [4]:
# Check missing values
# Per-column null counts; only the columns that actually contain nulls are printed.
missing = df.isnull().sum()
print("Missing values:")
print(missing[missing > 0])

# Fill values: median for employment length, mean for interest rate.
# NOTE(review): both statistics are computed on the full frame, i.e. before
# the train/test split done later in this notebook, and they are not saved
# together with the scaler/encoders -- confirm this is intended.
emp_length_fill = df['person_emp_length'].median()
int_rate_fill = df['loan_int_rate'].mean()

df['person_emp_length'] = df['person_emp_length'].fillna(emp_length_fill)
df['loan_int_rate'] = df['loan_int_rate'].fillna(int_rate_fill)

# Sanity check: every column should now report zero nulls.
print("Missing values after imputation:")
print(df.isnull().sum())

Missing values:
person_emp_length     887
loan_int_rate        3095
dtype: int64
Missing values after imputation:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


## Outlier Treatment

In [5]:
# Outlier treatment: values above a fixed ceiling are replaced by the ceiling;
# values at or below it are left untouched.
upper_caps = {
    'person_age': 100,           # reasonable max 100
    'person_emp_length': 50,     # reasonable max 50 years
    'loan_percent_income': 1.0,  # max 1.0
}
for column, ceiling in upper_caps.items():
    df[column] = df[column].clip(upper=ceiling)

print("Outliers capped.")

Outliers capped.


## Feature Engineering

In [6]:
# Ratio of the requested loan amount to the applicant's income
df['debt_to_income'] = df['loan_amnt'].div(df['person_income'])

# Age buckets. pd.cut uses right-closed intervals by default, so the bins are
# (0, 25], (25, 35], (35, 45], (45, 55], (55, 100].
age_edges = [0, 25, 35, 45, 55, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56+']
df['age_group'] = pd.cut(df['person_age'], bins=age_edges, labels=age_labels)

# Income buckets; the last bin is open-ended on the right (np.inf).
income_edges = [0, 30000, 60000, 100000, 200000, np.inf]
income_labels = ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']
df['income_group'] = pd.cut(df['person_income'], bins=income_edges, labels=income_labels)

print("New features created.")

New features created.


## Encode Categorical Features

In [7]:
# Integer-code the two string columns that the notebook treats as ordinal.
# One encoder per column, so each can be saved and reused independently.
le_loan_grade = LabelEncoder()
le_default = LabelEncoder()

df['loan_grade_encoded'] = le_loan_grade.fit(df['loan_grade']).transform(df['loan_grade'])
df['cb_person_default_on_file_encoded'] = le_default.fit(
    df['cb_person_default_on_file']
).transform(df['cb_person_default_on_file'])

# Nominal columns are expanded into indicator columns; the first level of each
# is dropped (drop_first=True). The raw string versions of the label-encoded
# columns are removed afterwards so only their integer codes remain.
categorical_cols = ['person_home_ownership', 'loan_intent', 'age_group', 'income_group']
df_encoded = (
    pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    .drop(columns=['loan_grade', 'cb_person_default_on_file'])
)

print("Categorical features encoded.")
print(f"Shape after encoding: {df_encoded.shape}")

Categorical features encoded.
Shape after encoding: (32416, 27)


## Feature Scaling

In [8]:
# Select numerical features to scale
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'debt_to_income']

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the transformation. train_test_split chooses rows from the number
# of samples, the stratify labels, test_size and random_state alone, so
# splitting row positions with the same arguments as the "Data Splitting" cell
# below yields exactly the partition that cell produces.
# Keep test_size / random_state / stratify in sync with that cell.
_row_positions = np.arange(len(df_encoded))
_train_positions, _ = train_test_split(
    _row_positions,
    test_size=0.2,
    random_state=42,
    stratify=df_encoded['loan_status'],
)

scaler = StandardScaler()
scaler.fit(df_encoded.iloc[_train_positions][numerical_cols])

# Apply the train-fitted transformation to every row (train and test alike).
df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])

print("Numerical features scaled.")

Numerical features scaled.


## Data Splitting

In [9]:
# Target is the loan_status column; every other column is a feature.
y = df_encoded['loan_status']
X = df_encoded.drop(columns=['loan_status'])

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the split sizes and the class balance in each part.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Train target distribution: {y_train.value_counts(normalize=True)}")
print(f"Test target distribution: {y_test.value_counts(normalize=True)}")

Train shape: (25932, 26), Test shape: (6484, 26)
Train target distribution: loan_status
0    0.781313
1    0.218687
Name: proportion, dtype: float64
Test target distribution: loan_status
0    0.781308
1    0.218692
Name: proportion, dtype: float64


## Handle Imbalanced Data

In [10]:
# Handle Imbalanced Data
# SMOTE oversampling is currently disabled (its import is commented out in the
# imports cell). The original call is kept here for reference:
# smote = SMOTE(random_state=42)
# X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# print(f"After SMOTE - Train shape: {X_train_sm.shape}")
# print(f"Train target distribution after SMOTE: {y_train_sm.value_counts(normalize=True)}")

# Until SMOTE is re-enabled, the "_sm" names are plain aliases of the
# unresampled training split (no copy is made).
X_train_sm, y_train_sm = X_train, y_train
print("Using original training data (imbalance handled with class weights in modeling).")

Using original training data (imbalance handled with class weights in modeling).


## Save Processed Data

In [11]:
# Save processed data
# Feature/target splits are written as CSV without the index column.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
X_train_sm.to_csv(f'{processed_dir}/X_train.csv', index=False)
X_test.to_csv(f'{processed_dir}/X_test.csv', index=False)
y_train_sm.to_csv(f'{processed_dir}/y_train.csv', index=False)
y_test.to_csv(f'{processed_dir}/y_test.csv', index=False)

# Save preprocessing objects
# The models directory must exist before dumping: joblib.dump opens the target
# path for writing and fails with FileNotFoundError if the parent directory
# is missing (e.g. on a fresh checkout).
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)
joblib.dump(scaler, f'{models_dir}/scaler.pkl')
joblib.dump(le_loan_grade, f'{models_dir}/le_loan_grade.pkl')
joblib.dump(le_default, f'{models_dir}/le_default.pkl')

print("Processed data and preprocessing objects saved.")

Processed data and preprocessing objects saved.
