In [29]:
# Read the raw credit-risk CSV into `df` and report its (rows, columns) shape.
data_path = '../data/raw/credit_risk_dataset.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Original shape: {df.shape}")

Original shape: (32581, 12)


## Handle Duplicates

In [30]:
# Drop rows that are exact duplicates of an earlier row (first occurrence is kept),
# then report the resulting shape.
df = df.drop_duplicates(keep='first')
print(f"Shape after removing duplicates: {df.shape}")

Shape after removing duplicates: (32416, 12)


## Handle Missing Values

In [22]:
# Count nulls per column and display only the columns that actually have some.
missing = df.isna().sum()
print("Missing values:")
print(missing.loc[missing > 0])

# NOTE(review): both fill values below are computed from the whole frame, i.e.
# before the train/test split performed later in this notebook.

# person_emp_length: fill gaps with the column median.
emp_length_median = df['person_emp_length'].median()
df['person_emp_length'] = df['person_emp_length'].fillna(emp_length_median)

# loan_int_rate: fill gaps with the column mean.
int_rate_mean = df['loan_int_rate'].mean()
df['loan_int_rate'] = df['loan_int_rate'].fillna(int_rate_mean)

# Re-count nulls for every column to confirm nothing is left.
print("Missing values after imputation:")
print(df.isna().sum())

Missing values:
person_emp_length     887
loan_int_rate        3095
dtype: int64
Missing values after imputation:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


## Outlier Treatment

In [32]:
# Comprehensive Outlier Treatment
#
# Stage 1 drops rows with implausible values, stage 2 adds a log-transformed
# income column. Each filter is applied to the frame left by the previous one,
# so the reported counts are sequential, not independent.
print("=== Before Outlier Treatment ===")
print(f"Original shape: {df.shape}")

# 1. Remove invalid/impossible values
print("\n1. Removing invalid values...")
# Remove impossible ages (> 100)
age_outliers = int((df['person_age'] > 100).sum())
df = df[df['person_age'] <= 100]
print(f"   Removed {age_outliers} records with age > 100")

# Remove impossible employment length (can't exceed working years).
# Working years are counted from age 18.
# NOTE(review): this drops every row whose employment started before 18 - confirm the cutoff.
max_working_years = df['person_age'] - 18
emp_outliers = int((df['person_emp_length'] > max_working_years).sum())
df = df[df['person_emp_length'] <= max_working_years]
print(f"   Removed {emp_outliers} records with employment > working years")

# Remove extreme income values (> $500K)
income_outliers = int((df['person_income'] > 500000).sum())
# .copy() detaches the result from the frame it was sliced from, so the column
# assignment below writes to an independent DataFrame instead of a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['person_income'] <= 500000].copy()
print(f"   Removed {income_outliers} records with income > $500K")

# 2. Log transformation for skewed income data (log1p = log(1 + x))
print("\n2. Applying log transformation to income...")
df['person_income_log'] = np.log1p(df['person_income'])

# 3. Winsorization for remaining outliers
print("\n3. Applying capping...")
def winsorize_column(df, column, lower=0.01, upper=0.99):
    """Return a copy of ``df`` with ``column`` capped at two of its quantiles.

    Values below the ``lower`` quantile are raised to it and values above the
    ``upper`` quantile are lowered to it; all other columns are carried over
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to cap.
    lower, upper : float
        Quantile levels passed to ``Series.quantile`` to obtain the lower and
        upper caps.

    Returns
    -------
    pandas.DataFrame
        New frame with the capped column.
    """
    values = df[column]
    lower_bound = values.quantile(lower)
    upper_bound = values.quantile(upper)
    # assign() builds a new frame, so the caller's object is never mutated and
    # no write happens on a possibly-sliced frame.
    return df.assign(**{column: values.clip(lower_bound, upper_bound)})

# Cap each loan-related numeric column with winsorize_column's default bounds.
cols_to_winsorize = ['loan_amnt', 'loan_int_rate', 'loan_percent_income']
for col in cols_to_winsorize:
    df = winsorize_column(df, col)
    print(f"   Winsorized {col}")

# 4. Create robust features
print("\n4. Creating robust features...")
# Income divided by age.
income = df['person_income']
age = df['person_age']
df['income_per_age'] = income / age

# (A loan-to-income ratio already exists as loan_percent_income.)
# Binary flag: 1 when employment length is at least 5, otherwise 0.
has_long_tenure = df['person_emp_length'] >= 5
df['emp_stability'] = np.where(has_long_tenure, 1, 0)

print(f"\n=== After Outlier Treatment ===")
print(f"Final shape: {df.shape}")
# NOTE(review): 32416 is a hard-coded baseline row count; if this cell is run
# on a frame of a different size the reported removal figures will be wrong.
print(f"Records removed: {32416 - df.shape[0]} ({((32416 - df.shape[0])/32416)*100:.1f}%)")

# Descriptive statistics for a few core columns after treatment.
print("\n=== Summary Statistics After Treatment ===")
summary_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt']
print(df[summary_columns].describe())

=== Before Outlier Treatment ===
Original shape: (23695, 15)

1. Removing invalid values...
   Removed 0 records with age > 100
   Removed 0 records with employment > working years
   Removed 0 records with income > $500K

2. Applying log transformation to income...

3. Applying capping...
   Winsorized loan_amnt
   Winsorized loan_int_rate
   Winsorized loan_percent_income

4. Creating robust features...

=== After Outlier Treatment ===
Final shape: (23695, 15)
Records removed: 8721 (26.9%)

=== Summary Statistics After Treatment ===
         person_age  person_income  person_emp_length     loan_amnt
count  23695.000000   23695.000000       23695.000000  23695.000000
mean      28.675839   63781.511247           3.468495   9422.978476
std        6.604860   41331.271286           3.295614   6053.441181
min       20.000000    4000.000000           0.000000   1200.000000
25%       24.000000   37200.000000           1.000000   5000.000000
50%       27.000000   54000.000000           3.0000

## Feature Engineering

In [25]:
# debt_to_income: loan amount divided by income.
loan_amount = df['loan_amnt']
income = df['person_income']
df['debt_to_income'] = loan_amount / income

# age_group: bucket person_age into five labelled, right-closed intervals.
age_bins = [0, 25, 35, 45, 55, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56+']
df['age_group'] = pd.cut(df['person_age'], bins=age_bins, labels=age_labels)

# income_group: bucket person_income into five labelled bands; the top band is open-ended.
income_bins = [0, 30000, 60000, 100000, 200000, np.inf]
income_labels = ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']
df['income_group'] = pd.cut(df['person_income'], bins=income_bins, labels=income_labels)

print("New features created.")

New features created.


## Encode Categorical Features

In [26]:
# Integer-encode loan_grade; the fitted encoder stays available as le_loan_grade.
le_loan_grade = LabelEncoder()
le_loan_grade.fit(df['loan_grade'])
df['loan_grade_encoded'] = le_loan_grade.transform(df['loan_grade'])

# Integer-encode cb_person_default_on_file; fitted encoder kept as le_default.
le_default = LabelEncoder()
le_default.fit(df['cb_person_default_on_file'])
df['cb_person_default_on_file_encoded'] = le_default.transform(df['cb_person_default_on_file'])

# One-hot encode the nominal columns (first level of each is dropped), then
# remove the two original string columns that now have *_encoded counterparts.
categorical_cols = ['person_home_ownership', 'loan_intent', 'age_group', 'income_group']
df_encoded = (
    pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    .drop(columns=['loan_grade', 'cb_person_default_on_file'])
)

print("Categorical features encoded.")
print(f"Shape after encoding: {df_encoded.shape}")

Categorical features encoded.
Shape after encoding: (24530, 30)


## Feature Scaling

In [27]:
# Robust Feature Scaling
print("=== Feature Scaling ===")

# Continuous columns that will be rescaled in place inside df_encoded.
numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'debt_to_income',
    'person_income_log',
    'income_per_age',
]

print(f"Features to scale: {len(numerical_cols)}")
print(f"Features: {numerical_cols}")

# RobustScaler is chosen for its outlier handling (see the messages printed below).
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

# NOTE(review): the scaler is fitted on the complete frame, before the
# train/test split that happens in a later cell.
scaled_values = scaler.fit_transform(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaled_values

print("✓ Numerical features scaled using RobustScaler")
print("✓ RobustScaler uses median and IQR (less sensitive to outliers)")

# Report mean / std / min / max of the scaled columns.
print("\n=== Scaling Statistics ===")
scaled_stats = df_encoded[numerical_cols].describe()
print(scaled_stats.loc[['mean', 'std', 'min', 'max']])

=== Feature Scaling ===
Features to scale: 10
Features: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'debt_to_income', 'person_income_log', 'income_per_age']
✓ Numerical features scaled using RobustScaler
✓ RobustScaler uses median and IQR (less sensitive to outliers)

=== Scaling Statistics ===
      person_age  person_income  person_emp_length  loan_amnt  loan_int_rate  \
mean    0.204612       0.224700           0.121698   0.191680       0.012899   
std     0.820104       1.014580           0.810105   0.863171       0.655118   
min    -0.875000      -1.231436          -0.750000  -1.000000      -1.208913   
max     6.625000      10.984410           9.500000   2.857143       1.592383   

      loan_percent_income  cb_person_cred_hist_length  debt_to_income  \
mean             0.152638                    0.222381        0.155892   
std              0.749403                    0.716012        0.76

## Data Splitting

In [15]:
# Target is loan_status; every other column of df_encoded is a feature.
y = df_encoded['loan_status']
X = df_encoded.drop(columns='loan_status')

# 80/20 split with a fixed seed, stratified on the target so both parts keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Train target distribution: {y_train.value_counts(normalize=True)}")
print(f"Test target distribution: {y_test.value_counts(normalize=True)}")

Train shape: (19624, 29), Test shape: (4906, 29)
Train target distribution: loan_status
0    0.770791
1    0.229209
Name: proportion, dtype: float64
Test target distribution: loan_status
0    0.770689
1    0.229311
Name: proportion, dtype: float64


In [35]:
# Sanity check: show the distinct values present in the target series y.
y.unique()

array([1, 0])

In [29]:
## Data Summary
#
# Prints a recap of the preprocessing run: row counts, the outlier rules that
# were applied, class balance of both splits, and basic data-quality facts.

# Baseline row count used for the "records removed" figures.
# NOTE(review): hard-coded; it must match the frame size before outlier
# treatment or the removal figures below will be wrong.
rows_before_outliers = 32416
rows_removed = rows_before_outliers - df.shape[0]

print("=== Dataset Summary ===")
print(f"Original data: {rows_before_outliers:,} records")
print(f"After outlier removal: {df.shape[0]:,} records")
print(f"After preprocessing: {df_encoded.shape}")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

print("\n=== Outlier Treatment Summary ===")
print(f"Records removed: {rows_removed:,} ({(rows_removed / rows_before_outliers) * 100:.1f}%)")
# The age filter in the outlier-treatment cell uses 100, not 80.
print("   - Age > 100 years")
print("   - Employment > working years")
print("   - Income > $500K")
print("   - Applied log transformation to income")
print("   - Applied winsorization to loan features")

print("\n=== Class Distribution ===")
# Same report for both splits; value_counts is computed once per split.
for split_label, target in (("Training set", y_train), ("Test set", y_test)):
    counts = target.value_counts()
    shares = target.value_counts(normalize=True)
    print(f"{split_label}:")
    print(f"  Class 0 (No Default): {counts[0]:,} ({shares[0]:.1%})")
    print(f"  Class 1 (Default): {counts[1]:,} ({shares[1]:.1%})")

print("\n=== Data Quality ===")
print(f"Missing values: {X_train.isnull().sum().sum()}")
print(f"Features: {X_train.shape[1]}")
print("Scaling method: RobustScaler (median & IQR)")
print("✓ Data ready for modeling")

=== Dataset Summary ===
Original data: 32,416 records
After outlier removal: 24,530 records
After preprocessing: (24530, 30)
Training set: (19624, 29)
Test set: (4906, 29)

=== Outlier Treatment Summary ===
Records removed: 7,886 (24.3%)
   - Age > 80 years
   - Employment > working years
   - Income > $500K
   - Applied log transformation to income
   - Applied winsorization to loan features

=== Class Distribution ===
Training set:
  Class 0 (No Default): 15,126 (77.1%)
  Class 1 (Default): 4,498 (22.9%)
Test set:
  Class 0 (No Default): 3,781 (77.1%)
  Class 1 (Default): 1,125 (22.9%)

=== Data Quality ===


Missing values: 0
Features: 29
Scaling method: RobustScaler (median & IQR)
✓ Data ready for modeling


In [34]:
## Save Processed Data
#
# Writes the train/test splits to CSV and pickles the fitted preprocessing
# objects plus the outlier-treatment parameters.

import os
import joblib

# Create output directories if they do not exist yet
os.makedirs('../data/processed', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# Save training and test data. All four files are written without the pandas
# index so that features and targets have the same layout on reload
# (y_test previously carried an extra index column).
X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

# Save preprocessing objects
joblib.dump(scaler, '../models/robust_scaler.pkl')
joblib.dump(le_loan_grade, '../models/le_loan_grade.pkl')
joblib.dump(le_default, '../models/le_default.pkl')

# Save outlier treatment parameters. These must mirror the thresholds actually
# applied in the outlier-treatment cell: the age filter there is 100.
outlier_params = {
    'max_age': 100,
    'max_income': 500000,
    'winsorize_bounds': (0.01, 0.99),
    'log_transform_income': True
}
joblib.dump(outlier_params, '../models/outlier_params.pkl')

print("=== Data Saved Successfully ===")
print(f"✓ Training data: {X_train.shape}")
print(f"✓ Test data: {X_test.shape}")
print("✓ RobustScaler saved")
print("✓ Label encoders saved")
print("✓ Outlier treatment parameters saved")
print("✓ Ready for model training")

=== Data Saved Successfully ===
✓ Training data: (19624, 29)
✓ Test data: (4906, 29)
✓ RobustScaler saved
✓ Label encoders saved
✓ Outlier treatment parameters saved
✓ Ready for model training
