In [1]:
import random
import pandas as pd
from datetime import datetime

def generate_cambodian_loan_data(n=100):
    """
    Generate synthetic loan application data for Cambodia,
    including risk evaluation.
    """
    random.seed(42)  # For reproducibility

    # ---------- PROVINCE & REGION ----------
    province_choices = [
        "Phnom Penh",   # 45%
        "Kandal",       # 8%
        "Siem Reap",    # 7%
        "Svay Rieng",   # 4%
        "Kampong Speu", # 4%
        "Kampong Thom", # 3%
        "Battambang",   # 4%
        "Others"        # 25%
    ]
    province_probs = [0.45, 0.08, 0.07, 0.04, 0.04, 0.03, 0.04, 0.25]

    def get_region_type(province):
        if province == "Phnom Penh":
            return "Urban"
        elif province in ["Kandal", "Siem Reap", "Battambang"]:
            return "Semi-Urban"
        else:
            return "Rural"

    # ---------- EMPLOYMENT & INCOME ----------
    def pick_employment_type(region_type):
        if region_type == "Urban":
            return random.choices(
                ["Salaried", "Self-Employed", "Farmer", "Student", "Unemployed"],
                weights=[0.55, 0.30, 0.05, 0.05, 0.05]
            )[0]
        elif region_type == "Semi-Urban":
            return random.choices(
                ["Salaried", "Self-Employed", "Farmer", "Student", "Unemployed"],
                weights=[0.40, 0.35, 0.15, 0.05, 0.05]
            )[0]
        else:  # Rural
            return random.choices(
                ["Farmer", "Salaried", "Self-Employed", "Student", "Unemployed"],
                weights=[0.50, 0.25, 0.10, 0.10, 0.05]
            )[0]

    def pick_annual_income(age, region_type, employment_type):
        base_ranges = {
            "Urban": {
                "Salaried": (3600, 24000),
                "Self-Employed": (2400, 18000),
                "Farmer": (1800, 6000),
                "Student": (600, 2400),
                "Unemployed": (0, 1200)
            },
            "Semi-Urban": {
                "Salaried": (2400, 12000),
                "Self-Employed": (1800, 9000),
                "Farmer": (1500, 4800),
                "Student": (600, 1800),
                "Unemployed": (0, 900)
            },
            "Rural": {
                "Salaried": (1800, 7200),
                "Self-Employed": (1200, 4800),
                "Farmer": (1200, 3600),
                "Student": (300, 1200),
                "Unemployed": (0, 600)
            }
        }
        
        low, high = base_ranges[region_type][employment_type]
        
        # Age adjustments
        if 25 <= age <= 45:
            high = int(high * 1.2)
        elif age > 45:
            high = int(high * 0.9)
            
        return random.randint(low, high)



    # ---------- LOAN TYPES & AMOUNTS ----------
    def pick_loan_type(region_type, employment_type, annual_income):
        if region_type == "Urban":
            choices = ["House", "Car", "Moto", "Personal", "SME", "Agriculture"]
            if employment_type == "Farmer":
                weights = [0.05, 0.05, 0.30, 0.20, 0.10, 0.30]
            elif employment_type in ["Salaried", "Self-Employed"]:
                weights = [0.25, 0.15, 0.20, 0.20, 0.15, 0.05]
            else:
                weights = [0.05, 0.05, 0.40, 0.40, 0.05, 0.05]
        
        elif region_type == "Semi-Urban":
            choices = ["House", "Car", "Moto", "Personal", "SME", "Agriculture"]
            if employment_type == "Farmer":
                weights = [0.05, 0.05, 0.25, 0.15, 0.10, 0.40]
            elif employment_type in ["Salaried", "Self-Employed"]:
                weights = [0.20, 0.10, 0.25, 0.20, 0.15, 0.10]
            else:
                weights = [0.05, 0.05, 0.45, 0.35, 0.05, 0.05]
        
        else:  # Rural
            choices = ["House", "Car", "Moto", "Personal", "SME", "Agriculture"]
            if employment_type == "Farmer":
                weights = [0.05, 0.02, 0.20, 0.13, 0.10, 0.50]
            elif employment_type in ["Salaried", "Self-Employed"]:
                weights = [0.15, 0.05, 0.30, 0.20, 0.15, 0.15]
            else:
                weights = [0.05, 0.02, 0.48, 0.35, 0.05, 0.05]

        return random.choices(choices, weights=weights)[0]

    def pick_loan_amount(loan_type, annual_income, region_type):
        # Base ranges for loan amounts
        base_ranges = {
            "House": {
                "Urban": (5000, 50000),
                "Semi-Urban": (3000, 30000),
                "Rural": (2000, 20000)
            },
            "Car": {
                "Urban": (3000, 20000),
                "Semi-Urban": (2000, 15000),
                "Rural": (1500, 10000)
            },
            "Moto": {
                "Urban": (500, 3000),
                "Semi-Urban": (400, 2500),
                "Rural": (300, 2000)
            },
            "Personal": {
                "Urban": (300, 5000),
                "Semi-Urban": (200, 3000),
                "Rural": (100, 2000)
            },
            "SME": {
                "Urban": (2000, 30000),
                "Semi-Urban": (1500, 20000),
                "Rural": (1000, 15000)
            },
            "Agriculture": {
                "Urban": (500, 10000),
                "Semi-Urban": (500, 15000),
                "Rural": (300, 20000)
            }
        }

        min_amt, max_amt = base_ranges[loan_type][region_type]
        
        # Income-based adjustments
        max_amt = min(max_amt, annual_income * 5)
        
        # Ensure max_amt is not less than min_amt
        max_amt = max(max_amt, min_amt)
        
        return random.randint(min_amt, max_amt)

    # ---------- LOAN TERMS & RATES ----------
    loan_terms = {
        "House": {"min": 5, "max": 25},
        "Car": {"min": 2, "max": 6},
        "Moto": {"min": 1, "max": 3},
        "Personal": {"min": 1, "max": 4},
        "SME": {"min": 1, "max": 7},
        "Agriculture": {"min": 1, "max": 3}
    }

    interest_rates = {
        "House": {"min": 9.6, "max": 18.0},
        "Car": {"min": 12.0, "max": 18.0},
        "Moto": {"min": 24.0, "max": 36.0},
        "Personal": {"min": 18.0, "max": 30.0},
        "SME": {"min": 18.0, "max": 30.0},
        "Agriculture": {"min": 21.6, "max": 36.0}
    }



    # ---------- RISK EVALUATION ----------
    def evaluate_loan_risk(record):
        """
        Evaluate loan risk specifically for Cambodian context
        Returns: 'Low Risk', 'Medium Risk', or 'High Risk'
        """
        risk_score = 0
        
        # Employment and Income Stability (0-25 points)
        emp_scores = {
            "Salaried": 0,
            "Self-Employed": 8,
            "Farmer": 10,
            "Student": 20,
            "Unemployed": 25
        }
        risk_score += emp_scores[record["Employment Type"]]
        
        # Loan Size vs Income (0-20 points)
        ratio = record["Loan Amount (USD)"] / record["Annual Income (USD)"] if record["Annual Income (USD)"] > 0 else float('inf')
        if ratio <= 1.0:
            risk_score += 0
        elif ratio <= 2.0:
            risk_score += 8
        elif ratio <= 3.0:
            risk_score += 15
        else:
            risk_score += 20
        
        # Loan Purpose Risk (0-15 points)
        purpose_scores = {
            "House": 0,
            "Agriculture": 5,
            "SME": 8,
            "Moto": 10,
            "Car": 12,
            "Personal": 15
        }
        risk_score += purpose_scores[record["Loan Type"]]
        
        # DTI Impact (0-15 points)
        dti = record["DTI (New Loan Only)"]
        if dti <= 0.35:
            risk_score += 0
        elif dti <= 0.45:
            risk_score += 5
        elif dti <= 0.55:
            risk_score += 10
        else:
            risk_score += 15
        
        # Credit History (0-15 points)
        credit_scores = {
            "Good": 0,
            "Fair": 5,
            "Poor": 12,
            "No History": 8
        }
        risk_score += credit_scores[record["Credit History"]]
        
        # Region (0-10 points)
        region_scores = {
            "Urban": 0,
            "Semi-Urban": 5,
            "Rural": 10
        }
        risk_score += region_scores[record["Region Type"]]
        
        # Adjustments
        if record["Collateral (USD)"] >= record["Loan Amount (USD)"]:
            risk_score -= 5
            
        if record["Savings/Assets (USD)"] >= record["Annual Income (USD)"]:
            risk_score -= 5
            
        if record["Loan Type"] == "Agriculture" and record["Region Type"] == "Rural":
            risk_score -= 5
            
        if record["Loan Type"] == "Moto" and record["Employment Type"] in ["Salaried", "Self-Employed"]:
            risk_score -= 3
            
        if record["Existing Debt (USD)"] > 0.5 * record["Annual Income (USD)"]:
            risk_score += 5
            
        if record["Age"] < 25 or record["Age"] > 60:
            risk_score += 5
            
        risk_score = max(0, min(100, risk_score))
        
        if risk_score <= 35:
            return "Low Risk"
        elif risk_score <= 65:
            return "Medium Risk"
        else:
            return "High Risk"




    # ---------- GENERATE DATA ----------
    records = []
    
    for i in range(1, n + 1):
        # Basic demographics
        province = random.choices(province_choices, weights=province_probs)[0]
        region_type = get_region_type(province)
        age = random.randint(18, 65)
        gender = random.choices(["Male", "Female"], weights=[0.47, 0.53])[0]
        
        # Employment and Income
        employment_type = pick_employment_type(region_type)
        annual_income = pick_annual_income(age, region_type, employment_type)
        
        # Credit History
        credit_history = random.choices(
            ["Good", "Fair", "Poor", "No History"],
            weights=[0.45, 0.35, 0.15, 0.05]
        )[0]
        
        # Existing Debt
        has_existing_debt = random.choices([True, False], weights=[0.35, 0.65])[0]
        if has_existing_debt:
            max_debt = max(100, int(annual_income * 1.2))
            existing_debt = random.randint(100, max_debt)
        else:
            existing_debt = 0
        
        # Savings/Assets
        max_savings = max(100, int(annual_income * 2))
        savings = random.randint(0, max_savings)
        
        # Loan Details
        loan_type = pick_loan_type(region_type, employment_type, annual_income)
        loan_amount = pick_loan_amount(loan_type, annual_income, region_type)
        
        # Loan Term and Rate
        loan_term = random.randint(loan_terms[loan_type]["min"], loan_terms[loan_type]["max"])
        interest_rate = round(random.uniform(
            interest_rates[loan_type]["min"],
            interest_rates[loan_type]["max"]
        ), 2)
        
        # Calculate DTI
        r = interest_rate / 100.0
        if r > 0:
            numerator = r * (1 + r) ** loan_term
            denominator = (1 + r) ** loan_term - 1
            annual_payment = loan_amount * (numerator / denominator)
        else:
            annual_payment = loan_amount / loan_term
        
        dti = annual_payment / annual_income if annual_income > 0 else float('inf')
        
        # Collateral
        collateral_value = random.randint(
            int(loan_amount * 0.5),
            int(loan_amount * 1.2)
        ) if loan_type in ["House", "Car"] else random.randint(0, loan_amount)




        record = {
            "ID": i,
            "Province": province,
            "Region Type": region_type,
            "Age": age,
            "Gender": gender,
            "Employment Type": employment_type,
            "Annual Income (USD)": annual_income,
            "Credit History": credit_history,
            "Existing Debt (USD)": existing_debt,
            "Savings/Assets (USD)": savings,
            "Loan Type": loan_type,
            "Loan Amount (USD)": loan_amount,
            "Loan Term (Years)": loan_term,
            "Annual Interest Rate (%)": interest_rate,
            "DTI (New Loan Only)": round(dti, 3),
            "Collateral (USD)": collateral_value
        }
        
        # Add risk evaluation
        record["Risk Category"] = evaluate_loan_risk(record)
        
        # Determine loan approval based on risk category and other factors
        def determine_approval(record):
            risk_cat = record["Risk Category"]
            base_approval_chances = {
                "Low Risk": 0.85,    # 85% approval for low risk
                "Medium Risk": 0.45,  # 45% approval for medium risk
                "High Risk": 0.15    # 15% approval for high risk
            }
            
            base_chance = base_approval_chances[risk_cat]
            
            # Adjust approval chance based on various factors
            adjustments = 0
            
            # Positive adjustments
            if record["Collateral (USD)"] >= record["Loan Amount (USD)"]:
                adjustments += 0.1
            if record["Credit History"] == "Good":
                adjustments += 0.1
            if record["Employment Type"] == "Salaried":
                adjustments += 0.05
            if record["Loan Type"] == "Agriculture" and record["Region Type"] == "Rural":
                adjustments += 0.05
            
            # Negative adjustments
            if record["DTI (New Loan Only)"] > 0.5:
                adjustments -= 0.1
            if record["Credit History"] == "Poor":
                adjustments -= 0.15
            if record["Employment Type"] == "Unemployed":
                adjustments -= 0.2
            
            final_chance = min(0.95, max(0.05, base_chance + adjustments))
            return "Approved" if random.random() < final_chance else "Rejected"
        
        record["Loan Status"] = determine_approval(record)
        records.append(record)

    # Create DataFrame
    df = pd.DataFrame(records)
    
    # Calculate and print summary statistics
    total_loans = len(df)
    approved_loans = len(df[df["Loan Status"] == "Approved"])
    approval_rate = (approved_loans / total_loans) * 100
    
    risk_distribution = df["Risk Category"].value_counts()
    approval_by_risk = df[df["Loan Status"] == "Approved"]["Risk Category"].value_counts()
    
    print("\nSummary Statistics:")
    print(f"Total Applications: {total_loans}")
    print(f"Overall Approval Rate: {approval_rate:.1f}%")
    print("\nRisk Distribution:")
    for risk_cat in ["Low Risk", "Medium Risk", "High Risk"]:
        count = risk_distribution.get(risk_cat, 0)
        approved = approval_by_risk.get(risk_cat, 0)
        print(f"{risk_cat}: {count} applications, {approved} approved ({approved/count*100:.1f}% approval rate)")
    
    return df



# Function to save data and generate basic analysis
def generate_and_save_loan_data(n_samples=1000, filename="cambodian_loan_data.csv"):
    """
    Generate loan data, save to CSV, and print analysis
    """
    # Generate data
    print(f"Generating {n_samples} loan applications...")
    df = generate_cambodian_loan_data(n=n_samples)
    
    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"\nData saved to {filename}")
    
    # Additional Analysis
    print("\nDetailed Analysis:")
    
    print("\nLoan Type Distribution:")
    loan_type_dist = df["Loan Type"].value_counts()
    for loan_type, count in loan_type_dist.items():
        print(f"{loan_type}: {count} ({count/len(df)*100:.1f}%)")
    
    print("\nRisk Distribution by Region:")
    risk_by_region = pd.crosstab(df["Region Type"], df["Risk Category"], normalize="index") * 100
    print(risk_by_region.round(1))
    
    print("\nApproval Rates by Employment Type:")
    emp_approval = pd.crosstab(df["Employment Type"], df["Loan Status"], normalize="index") * 100
    print(emp_approval.round(1))
    
    return df

if __name__ == "__main__":
    # Generate 100 loan applications
    df = generate_and_save_loan_data(n_samples=40000)


Generating 40000 loan applications...

Summary Statistics:
Total Applications: 40000
Overall Approval Rate: 64.3%

Risk Distribution:
Low Risk: 21672 applications, 19516 approved (90.1% approval rate)
Medium Risk: 14935 applications, 5996 approved (40.1% approval rate)
High Risk: 3393 applications, 204 approved (6.0% approval rate)

Data saved to cambodian_loan_data.csv

Detailed Analysis:

Loan Type Distribution:
Moto: 9963 (24.9%)
Personal: 8232 (20.6%)
Agriculture: 6712 (16.8%)
House: 6574 (16.4%)
SME: 4960 (12.4%)
Car: 3559 (8.9%)

Risk Distribution by Region:
Risk Category  High Risk  Low Risk  Medium Risk
Region Type                                    
Rural               14.0      30.8         55.2
Semi-Urban           6.8      55.0         38.3
Urban                4.8      72.6         22.6

Approval Rates by Employment Type:
Loan Status      Approved  Rejected
Employment Type                    
Farmer               51.5      48.5
Salaried             81.5      18.5
Self-Empl

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# ==============================
# 1️⃣ Load & Inspect Data
# ==============================
df = pd.read_csv("cambodian_loan_data.csv")  # Ensure correct dataset filename

# Ensure correct column names for target variables
risk_col = "Risk Category"  # Risk levels: Low, Medium, High
target_col = "Loan Status"  # "Approved" / "Rejected"

# ==============================
# 2️⃣ Feature Selection & Encoding
# ==============================
feature_cols = [
    "Province", "Region Type", "Age", "Gender", "Employment Type",
    "Annual Income (USD)", "Credit History", "Existing Debt (USD)",
    "Savings/Assets (USD)", "Loan Type", "Loan Amount (USD)",
    "Loan Term (Years)", "Annual Interest Rate (%)", "Collateral (USD)"
]

numeric_cols = [
    "Age", "Annual Income (USD)", "Existing Debt (USD)",
    "Savings/Assets (USD)", "Loan Amount (USD)", "Loan Term (Years)",
    "Annual Interest Rate (%)", "Collateral (USD)"
]

categorical_cols = ["Province", "Region Type", "Gender", "Employment Type", "Credit History", "Loan Type"]

# ==============================
# 3️⃣ Define Preprocessing Steps
# ==============================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# ==============================
# 4️⃣ Pipelines for Models
# ==============================
# Pipeline for Risk Prediction
risk_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
])

# Pipeline for Loan Approval Prediction
loan_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
])

# ==============================
# 5️⃣ Prepare Data
# ==============================
X_risk = df[feature_cols].copy()
y_risk = df[risk_col]  # Risk Category Labels

X_loan = df[feature_cols].copy()
y_loan = df[target_col].apply(lambda x: 1 if x == "Approved" else 0)  # Encode Loan Status

# Train/test split
X_risk_train, X_risk_test, y_risk_train, y_risk_test = train_test_split(
    X_risk, y_risk, test_size=0.20, random_state=42, stratify=y_risk
)

X_loan_train, X_loan_test, y_loan_train, y_loan_test = train_test_split(
    X_loan, y_loan, test_size=0.20, random_state=42, stratify=y_loan
)

# ==============================
# 6️⃣ Train Models
# ==============================
# Train Risk Prediction Model
risk_pipeline.fit(X_risk_train, y_risk_train)

# Train Loan Approval Model
loan_pipeline.fit(X_loan_train, y_loan_train)

# ==============================
# 7️⃣ Evaluate Models
# ==============================
# Evaluate Risk Model
y_risk_pred = risk_pipeline.predict(X_risk_test)
print("\n=== 🔹 Risk Prediction Confusion Matrix ===")
print(confusion_matrix(y_risk_test, y_risk_pred))

print("\n=== 🔹 Risk Prediction Report ===")
print(classification_report(y_risk_test, y_risk_pred, target_names=["Low Risk", "Medium Risk", "High Risk"]))

# Evaluate Loan Approval Model
y_loan_pred = loan_pipeline.predict(X_loan_test)
print("\n=== 🔹 Loan Approval Confusion Matrix ===")
print(confusion_matrix(y_loan_test, y_loan_pred))

print("\n=== 🔹 Loan Approval Report ===")
print(classification_report(y_loan_test, y_loan_pred, target_names=["Rejected", "Approved"]))

# ==============================
# 8️⃣ Save Pipelines
# ==============================
dump(risk_pipeline, "risk_prediction_pipeline.joblib")
dump(loan_pipeline, "loan_approval_pipeline.joblib")
print("\n✅ Pipelines saved successfully!")



=== 🔹 Risk Prediction Confusion Matrix ===
[[ 517    0  162]
 [   0 4046  288]
 [  85  368 2534]]

=== 🔹 Risk Prediction Report ===
              precision    recall  f1-score   support

    Low Risk       0.86      0.76      0.81       679
 Medium Risk       0.92      0.93      0.93      4334
   High Risk       0.85      0.85      0.85      2987

    accuracy                           0.89      8000
   macro avg       0.87      0.85      0.86      8000
weighted avg       0.89      0.89      0.89      8000


=== 🔹 Loan Approval Confusion Matrix ===
[[1537 1320]
 [ 561 4582]]

=== 🔹 Loan Approval Report ===
              precision    recall  f1-score   support

    Rejected       0.73      0.54      0.62      2857
    Approved       0.78      0.89      0.83      5143

    accuracy                           0.76      8000
   macro avg       0.75      0.71      0.73      8000
weighted avg       0.76      0.76      0.75      8000


✅ Pipelines saved successfully!


In [3]:
# For testing

from joblib import load
import pandas as pd

# Load saved pipelines
risk_pipeline = load("risk_prediction_pipeline.joblib")
loan_pipeline = load("loan_approval_pipeline.joblib")

# Define a sample for prediction
sample = pd.DataFrame({
    "Province": ["Kandal"],
    "Region Type": ["Semi-Urban"],
    "Age": [43],
    "Gender": ["Female"],
    "Employment Type": ["Salaried"],
    "Annual Income (USD)": [900],
    "Credit History": ["Bad"],
    "Existing Debt (USD)": [0],
    "Savings/Assets (USD)": [655],
    "Loan Type": ["House"],
    "Loan Amount (USD)": [10775],
    "Loan Term (Years)": [3],
    "Annual Interest Rate (%)": [14.23],
    "Collateral (USD)": [8207]
})

# Define prediction function
def predict_sample(sample, risk_model, loan_model):
    """Predict risk category and loan status for a single sample."""
    risk_pred = risk_model.predict(sample)
    loan_pred = loan_model.predict(sample)

    return {
        "Risk Prediction": risk_pred[0],
        "Loan Status": "Approved" if loan_pred[0] == 1 else "Rejected"
    }

# Perform prediction on the sample
result = predict_sample(sample, risk_pipeline, loan_pipeline)
result


FileNotFoundError: [Errno 2] No such file or directory: 'risk_prediction_pipeline.joblib'