In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw credit-risk CSV, rename its columns, label-encode the
# categorical housing column, index the frame by customer id and coerce every
# remaining column to numeric. Produces `df` (cleaned frame) and `le` (the
# fitted encoder, whose `classes_` maps the integer codes back to labels).
try:
    df = pd.read_csv('../data/raw/credit_risk_dataset_v1.csv')

    # The names are applied by POSITION, so the order below must match the
    # order of the columns in the file. Per the original author's note, the
    # fourth column (housing_type) is the one holding values like 'own'/'free'.
    column_names = [
        'customer_id', 'banked_flag', 'job_category', 'housing_type', 'dependents',
        'employment_years', 'residence_years', 'loan_amount', 'duration_months',
        'interest_rate', 'existing_monthly_obligation', 'average_monthly_inflow',
        'average_monthly_outflow', 'emi', 'net_monthly_surplus', 'emi_coverage_ratio',
        'loan_to_income_ratio', 'existing_obligation_ratio', 'dependent_ratio',
        'employment_stability_score', 'income_variance_index', 'residence_stability_score',
        'digital_behavior_score', 'credit_risk'
    ]

    # Fail with a readable message if the file's width changed, rather than
    # relying on pandas' generic length-mismatch error.
    if len(df.columns) != len(column_names):
        raise ValueError(
            f"expected {len(column_names)} columns in the raw file, "
            f"found {len(df.columns)}"
        )
    df.columns = column_names

    # Encode housing_type as integer codes; values are cast to str first so the
    # encoder always sees a single, sortable type.
    le = LabelEncoder()
    df['housing_type'] = le.fit_transform(df['housing_type'].astype(str))

    # Index by customer id (assignment instead of inplace=True keeps the step
    # explicit), then force every remaining column to a numeric dtype.
    df = df.set_index('customer_id')
    numeric = df.apply(pd.to_numeric, errors='coerce')

    # Anything missing or non-numeric is NaN at this point. The values are
    # still replaced with 0 as before, but the replacement is reported so a
    # mis-parsed column cannot silently turn into a column of zeros.
    filled_per_column = numeric.isna().sum()
    filled_per_column = filled_per_column[filled_per_column > 0]
    if not filled_per_column.empty:
        print("⚠️ Missing/non-numeric values replaced with 0 (count per column):")
        print(filled_per_column.to_string())
    df = numeric.fillna(0)

    print("✅ ALIGNMENT FIXED")
    display(df.head())

except Exception as e:
    # Keep the friendly message, but re-raise: swallowing the error would let
    # later cells run against a missing, stale or half-processed `df`.
    print(f"❌ Error: {e}")
    raise

✅ ALIGNMENT FIXED


Unnamed: 0_level_0,banked_flag,job_category,housing_type,dependents,employment_years,residence_years,loan_amount,duration_months,interest_rate,existing_monthly_obligation,...,net_monthly_surplus,emi_coverage_ratio,loan_to_income_ratio,existing_obligation_ratio,dependent_ratio,employment_stability_score,income_variance_index,residence_stability_score,digital_behavior_score,credit_risk
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CUST001,1,2,1,2,6,12,1169,6,0.1,300,...,1600,8.0,0.04,0.06,0.5,0.6,0.12,1.0,0.78,0
CUST002,1,2,1,0,3,4,5951,48,0.12,500,...,1100,7.05,0.12,0.12,0.0,0.3,0.18,0.4,0.65,0
CUST003,1,1,1,3,2,10,2096,12,0.11,250,...,700,3.78,0.06,0.09,0.6,0.2,0.22,1.0,0.52,1
CUST004,1,2,0,2,5,3,7882,42,0.14,800,...,1200,4.61,0.14,0.15,0.5,0.5,0.25,0.3,0.6,0
CUST005,1,2,0,1,7,2,4870,24,0.13,600,...,1300,5.65,0.08,0.11,0.33,0.7,0.2,0.2,0.7,0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation analysis of the cleaned frame: draw a heatmap of all pairwise
# correlations, then list the features most positively and most negatively
# correlated with 'credit_risk'.

# 1. Pairwise correlations between all columns of the cleaned frame.
correlation_matrix = df.corr()

# 2. Heatmap of the full matrix. Look at the 'credit_risk' row/column for
#    strongly coloured (red/blue) cells.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Credit Risk Driver Map (Correlation Analysis)")
plt.show()

# 3. Correlation of every column with the target, strongest positive first.
risk_drivers = correlation_matrix['credit_risk'].sort_values(ascending=False)

# Rank features only:
#  - drop 'credit_risk' itself (its self-correlation of 1.0 would always
#    occupy the first slot of the "top drivers" list);
#  - drop NaN correlations (produced for columns with no variance), which
#    sort_values places last and which would otherwise fill the
#    "reduces risk" list instead of the genuinely negative features.
ranked_drivers = risk_drivers.drop(labels='credit_risk').dropna()

print("--- Top Drivers of Credit Risk ---")
print(ranked_drivers.head(5))  # Features that increase risk
print("\n--- Top Features that REDUCE Risk ---")
print(ranked_drivers.tail(5))  # Features that mean a safer customer

In [3]:
# Persist the cleaned frame (run this at the end of the cleaning notebook).
# The customer_id index is written as the first CSV column (index=True).
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing; this is a no-op when it is already there.
Path('../data/processed').mkdir(parents=True, exist_ok=True)
df.to_csv('../data/processed/cleaned_credit_data.csv', index=True)
print("✅ Cleaned data saved!")

✅ Cleaned data saved!


In [None]:
# Run this once to install everything in your environment
! pip install pandas numpy matplotlib seaborn scikit-learn xgboost imbalanced-learn shap os