In [1]:
import pandas as pd
import numpy as np
from faker import Faker

# One seed shared by every random source so the whole dataset is repeatable
# under Restart Kernel -> Run All.
SEED = 42

# Initialize Faker and seed it. Customer_ID values are drawn from Faker's own
# random generator, which np.random.seed() does not touch; without this call
# the IDs (and therefore the saved CSV) change on every run.
fake = Faker()
Faker.seed(SEED)

# Seed NumPy's legacy global generator used by every np.random.* call below.
np.random.seed(SEED)

# Number of samples (rows) to generate
num_samples = 5000

# Generate synthetic data, one array/list of length num_samples per column.
# NOTE: all np.random.* calls draw from the same global stream, so the order
# of the entries below determines the generated values; reordering or
# inserting a column changes every column after it.
# NOTE: np.random.randint excludes its upper bound, e.g. Age is 18..74 and
# Credit_Score is 300..849.
data = {
    "Customer_ID": [fake.uuid4() for _ in range(num_samples)],
    "Age": np.random.randint(18, 75, num_samples),
    "Gender": np.random.choice(["Male", "Female"], num_samples),
    "Annual_Income": np.random.randint(20000, 150000, num_samples),
    "Loan_Amount": np.random.randint(1000, 50000, num_samples),
    "Credit_Score": np.random.randint(300, 850, num_samples),
    "Late_Payments": np.random.randint(0, 10, num_samples),
    "Loan_Term": np.random.choice([12, 24, 36, 48, 60], num_samples),
    "Interest_Rate": np.round(np.random.uniform(3.5, 25, num_samples), 2),
    "Loan_Type": np.random.choice(["Personal", "Mortgage", "Auto", "Education"], num_samples),
}

# Define the target variable (Default: 1 = Default, 0 = No Default).
# A row is labelled as a default only when ALL three conditions hold:
# credit score below 600, more than 3 late payments, and income below 40000.
# The columns are still NumPy arrays here, so `&` is an element-wise AND.
data["Default"] = np.where(
    (data["Credit_Score"] < 600) & (data["Late_Payments"] > 3) & (data["Annual_Income"] < 40000),
    1,  # Default
    0   # No Default
)

# Convert to DataFrame (column order follows the dict's insertion order)
df = pd.DataFrame(data)

# Save to CSV (Optional); written relative to the current working directory
df.to_csv("synthetic_credit_risk.csv", index=False)

# Display first rows
df.head()


Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Loan_Amount,Credit_Score,Late_Payments,Loan_Term,Interest_Rate,Loan_Type,Default
0,1623a5e1-819e-4e72-bd64-d8158c377f16,56,Male,61115,37735,739,7,12,21.49,Mortgage,0
1,644c27b4-82a9-4b41-819e-e958a041cf1f,69,Male,130084,8423,648,8,24,11.51,Mortgage,0
2,e51bb10e-a5c4-46d4-928c-ddb14e297b02,46,Female,116887,40218,711,9,36,13.29,Mortgage,0
3,f949623d-f8a6-4113-ae1c-d8110b7a9b8d,32,Female,103732,13213,827,9,24,18.52,Personal,0
4,c02f2154-fb1c-42f9-a432-992541a45408,60,Male,132145,36983,609,1,36,4.96,Auto,0
