# Data Cleaning

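Clean the raw Loan Default dataset: standardize column names, drop columns with high missingness or potential data leakage, impute missing values (median for numeric columns, mode for categorical columns), and log-transform the skewed monetary columns.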

In [1]:
import pandas as pd
import numpy as np

# Configuration: column groups consumed by the cleaning functions below.

# Numeric features; missing values are filled with the column median.
NUMERIC_COLS = [
    "term",
    "credit_score",
    "ltv",
    "dtir1",
    "loan_amount",
    "income",
    "property_value",
]

# Categorical features; missing values are filled with the column mode.
CATEGORICAL_COLS = [
    "loan_limit",
    "gender",
    "approv_in_adv",
    "loan_type",
    "loan_purpose",
    "credit_worthiness",
    "open_credit",
    "business_or_commercial",
    "neg_ammortization",
    "interest_only",
    "lump_sum_payment",
    "construction_type",
    "occupancy_type",
    "secured_by",
    "total_units",
    "credit_type",
    "co-applicant_credit_type",
    "age",
    "submission_of_application",
    "region",
    "security_type",
]

# Columns that receive a log1p transform to reduce skewness.
LOG_TRANSFORM_COLS = ["loan_amount", "income", "property_value"]

# Columns removed before imputation (high missingness or potential leakage).
DROP_COLS = ["rate_of_interest", "interest_rate_spread", "upfront_charges"]

In [2]:
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame using pandas' default parsing.

    ``path`` is forwarded unchanged to ``pd.read_csv``.
    """
    raw_df = pd.read_csv(path)
    return raw_df

In [3]:
def impute_missing_values(df, numeric_cols=None, categorical_cols=None):
    """Fill missing values: median for numeric columns, mode for categorical ones.

    The frame is modified in place and also returned, so the call works both
    for its side effect and in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values are filled.
    numeric_cols : iterable of str, optional
        Columns filled with their median. Defaults to ``NUMERIC_COLS``.
    categorical_cols : iterable of str, optional
        Columns filled with their most frequent value. Defaults to
        ``CATEGORICAL_COLS``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Columns that are not present in ``df`` are skipped. A column with no
    observed values has no median/mode to impute with and is left unchanged.
    """
    if numeric_cols is None:
        numeric_cols = NUMERIC_COLS
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS

    # Numeric columns: replace missing values with the column median
    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # Categorical columns: replace missing values with the column mode
    for col in categorical_cols:
        if col not in df.columns:
            continue
        modes = df[col].mode()
        # mode() is empty when every value is missing; nothing to fill with
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df

In [4]:
def log_transform(df, cols=None):
    """Apply ``log(1 + x)`` to the given columns to reduce skewness.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to transform.
    cols : iterable of str, optional
        Columns to transform. Defaults to ``LOG_TRANSFORM_COLS``. Columns
        that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The transform is not idempotent: calling it twice on the same frame
    applies ``log1p`` twice. Columns are expected to have a numeric dtype.
    """
    if cols is None:
        cols = LOG_TRANSFORM_COLS

    for col in cols:
        if col in df.columns:
            # Vectorized ufunc call instead of a per-element Python lambda
            df[col] = np.log1p(df[col])
    return df

In [5]:
def clean_data(path):
    """Load the raw CSV at ``path`` and run the full cleaning pipeline.

    Steps, in order: rename selected mixed-case headers to lower-case names,
    drop every ``DROP_COLS`` column that is present, impute missing values,
    then log-transform the configured columns. Returns the cleaned DataFrame.
    """
    # Raw header -> consistent lower-case name
    column_renames = {
        'ID': 'id',
        'Gender': 'gender',
        'Credit_Worthiness': 'credit_worthiness',
        'Interest_rate_spread': 'interest_rate_spread',
        'Upfront_charges': 'upfront_charges',
        'Neg_ammortization': 'neg_ammortization',
        'Secured_by': 'secured_by',
        'Credit_Score': 'credit_score',
        'LTV': 'ltv',
        'Region': 'region',
        'Security_Type': 'security_type',
        'Status': 'status',
    }

    raw = load_data(path)
    renamed = raw.rename(columns=column_renames)

    # Remove features with high missingness or potential data leakage;
    # errors="ignore" skips any listed column that is not in the frame.
    trimmed = renamed.drop(columns=DROP_COLS, errors="ignore")

    imputed = impute_missing_values(trimmed)
    return log_transform(imputed)

In [6]:
# Run the cleaning pipeline end to end and persist the result
input_path = "../data/Loan_Default.csv"
output_path = "../data/cleaned_loan_data.csv"

cleaned_df = clean_data(input_path)
cleaned_df.to_csv(output_path, index=False)

# Summary report: destination, resulting shape, and remaining null count
print(
    f"Cleaned data saved to {output_path}",
    f"Shape: {cleaned_df.shape}",
    f"Missing values: {cleaned_df.isnull().sum().sum()}",
    sep="\n",
)

Cleaned data saved to ../data/cleaned_loan_data.csv
Shape: (148670, 31)
Missing values: 0
