<a href="https://colab.research.google.com/github/slimycashcodes/Pre-Delinquency-Risk-Prediction-Engine/blob/main/Barclays.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sdv pandas numpy scikit-learn

Collecting sdv
  Downloading sdv-1.34.1-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.54-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.54-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.14.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Downloading ctgan-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.20.0-py3-none-any.whl.metadata (11 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.27.1-py3-none-any.whl.metadata (10.0 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading 

In [2]:
import pandas as pd
import numpy as np
import random

np.random.seed(42)

NUM_CUSTOMERS = 5000

customers = []

for i in range(NUM_CUSTOMERS):
    income = np.random.normal(60000, 15000)
    age = np.random.randint(23, 60)

    risk_score = (
        0.3 * (income < 45000) +
        0.3 * (age < 25) +
        0.4 * np.random.rand()
    )

    customers.append([
        f"CUST{i:05d}",
        age,
        max(25000, int(income)),
        round(risk_score, 3)
    ])

customers_df = pd.DataFrame(customers, columns=[
    "customer_id",
    "age",
    "monthly_income",
    "base_risk_score"
])

customers_df.to_csv("customers.csv", index=False)


In [3]:
loans = []

for _, row in customers_df.iterrows():

    loan_amount = row["monthly_income"] * random.randint(6, 12)
    approval = 1 if row["base_risk_score"] < 0.7 else 0

    default_prob = row["base_risk_score"] + np.random.normal(0, 0.1)
    default_flag = 1 if default_prob > 0.75 and approval == 1 else 0

    loans.append([
        row["customer_id"],
        loan_amount,
        approval,
        default_flag
    ])

loan_df = pd.DataFrame(loans, columns=[
    "customer_id",
    "loan_amount",
    "loan_approved",
    "loan_defaulted"
])

loan_df.to_csv("loan_data.csv", index=False)


In [4]:
cc_data = []

for _, row in customers_df.iterrows():

    credit_limit = row["monthly_income"] * random.randint(2,4)
    utilization = np.random.beta(2,5)
    monthly_spend = credit_limit * utilization

    next_month_default = 1 if utilization > 0.8 and row["base_risk_score"] > 0.6 else 0

    cc_data.append([
        row["customer_id"],
        credit_limit,
        round(utilization,2),
        int(monthly_spend),
        next_month_default
    ])

cc_df = pd.DataFrame(cc_data, columns=[
    "customer_id",
    "credit_limit",
    "utilization_ratio",
    "monthly_spend",
    "cc_default_next_month"
])

cc_df.to_csv("credit_card_data.csv", index=False)


In [5]:
from datetime import datetime, timedelta

transactions = []
START = datetime(2023,1,1)
END = datetime(2024,12,31)

for _, cust in customers_df.iterrows():

    balance = cust["monthly_income"] * np.random.uniform(0.5,1.5)
    stress = cust["base_risk_score"] > 0.6

    date = START

    while date <= END:

        if date.day == 1:
            delay = random.randint(0,5) if stress else 0
            salary_date = date + timedelta(days=delay)
            balance += cust["monthly_income"]

            transactions.append([
                cust["customer_id"],
                salary_date,
                "CREDIT",
                cust["monthly_income"],
                "Salary",
                balance
            ])

        if random.random() < 0.6:
            spend = np.random.randint(500,3000)

            if stress:
                if random.random() < 0.3:
                    spend = np.random.randint(3000,12000)

            if balance > spend:
                balance -= spend

                transactions.append([
                    cust["customer_id"],
                    date,
                    "DEBIT",
                    spend,
                    "Expense",
                    balance
                ])

        date += timedelta(days=1)

txn_df = pd.DataFrame(transactions, columns=[
    "customer_id",
    "date",
    "transaction_type",
    "amount",
    "category",
    "balance_after"
])

txn_df.to_csv("transactions_data.csv", index=False)


In [6]:
!pip install sdv



In [7]:
import pandas as pd
import random
from datetime import datetime, timedelta

NUM_CUSTOMERS = 2000

customers = []
loans = []
credit_cards = []
transactions = []

start_date = datetime(2022,1,1)

for i in range(NUM_CUSTOMERS):
    cid = f"CUST{i:05d}"
    income = random.randint(40000,90000)
    risk_score = random.randint(500,800)

    customers.append([cid, income, risk_score])

    loan_amount = income * random.randint(5,12)
    tenure = 24
    emi = loan_amount/tenure
    defaulted = 1 if risk_score < 580 else 0

    loans.append([cid, loan_amount, tenure, emi, defaulted])

    limit_amt = income * 2
    utilization = random.uniform(0.1,0.9)
    cc_default_next = 1 if utilization > 0.8 and risk_score < 600 else 0

    credit_cards.append([cid, limit_amt, utilization, cc_default_next])

    balance = income
    for d in range(120):
        date = start_date + timedelta(days=d)

        if date.day == 1:
            balance += income
            transactions.append([cid, str(date), "CREDIT", income, "Salary", balance])

        spend = random.randint(500,4000)
        if balance > spend:
            balance -= spend
            transactions.append([cid, str(date), "DEBIT", spend, "Expense", balance])

customers_df = pd.DataFrame(customers, columns=["customer_id","income","risk_score"])
loan_df = pd.DataFrame(loans, columns=["customer_id","loan_amount","tenure","emi","loan_default"])
cc_df = pd.DataFrame(credit_cards, columns=["customer_id","cc_limit","utilization","cc_default_next"])
txn_df = pd.DataFrame(transactions, columns=["customer_id","date","type","amount","category","balance_after"])
txn_df['date'] = txn_df['date'].astype(str)



In [8]:
from sdv.metadata import MultiTableMetadata

metadata = MultiTableMetadata()

metadata.detect_table_from_dataframe('customers', customers_df)
metadata.detect_table_from_dataframe('loans', loan_df)
metadata.detect_table_from_dataframe('credit_cards', cc_df)
metadata.detect_table_from_dataframe('transactions', txn_df)

metadata.set_primary_key('customers', 'customer_id')

metadata.set_primary_key('loans', None)
metadata.set_primary_key('credit_cards', None)
metadata.set_primary_key('transactions', None)

metadata.add_relationship(
    parent_table_name='customers',
    child_table_name='loans',
    parent_primary_key='customer_id',
    child_foreign_key='customer_id'
)

metadata.add_relationship(
    parent_table_name='customers',
    child_table_name='credit_cards',
    parent_primary_key='customer_id',
    child_foreign_key='customer_id'
)

metadata.add_relationship(
    parent_table_name='customers',
    child_table_name='transactions',
    parent_primary_key='customer_id',
    child_foreign_key='customer_id'
)




In [9]:
from sdv.multi_table import HMASynthesizer

synthesizer = HMASynthesizer(metadata)
synthesizer.fit({
    'customers': customers_df,
    'loans': loan_df,
    'credit_cards': cc_df,
    'transactions': txn_df
})


Preprocess Tables: 100%|██████████| 4/4 [00:06<00:00,  1.66s/it]



Learning relationships:


(1/3) Tables 'customers' and 'loans' ('customer_id'): 100%|██████████| 2000/2000 [00:20<00:00, 97.83it/s] 
(2/3) Tables 'customers' and 'transactions' ('customer_id'): 100%|██████████| 2000/2000 [00:21<00:00, 94.82it/s]
(3/3) Tables 'customers' and 'credit_cards' ('customer_id'): 100%|██████████| 2000/2000 [00:19<00:00, 102.27it/s]





Modeling Tables: 100%|██████████| 4/4 [00:00<00:00,  5.83it/s]


In [10]:
synthetic_data = synthesizer.sample(scale=1)  # 2x bigger dataset

synthetic_customers = synthetic_data['customers']
synthetic_loans = synthetic_data['loans']
synthetic_cc = synthetic_data['credit_cards']
synthetic_txn = synthetic_data['transactions']


In [11]:
synthetic_txn['date'] = pd.to_datetime(synthetic_txn['date'])
synthetic_txn['year_month'] = synthetic_txn['date'].dt.to_period('M')

monthly_features = synthetic_txn.groupby(
    ['customer_id','year_month']
).agg({
    'amount':['sum','mean','std'],
    'balance_after':'min'
}).reset_index()

monthly_features.columns = [
    'customer_id',
    'year_month',
    'total_spend',
    'avg_spend',
    'spend_volatility',
    'min_balance'
]

final_dataset = (
    synthetic_customers
    .merge(synthetic_loans, on='customer_id')
    .merge(synthetic_cc, on='customer_id')
    .merge(monthly_features, on='customer_id')
)

final_dataset.to_csv("final_training_dataset.csv", index=False)

print("Final dataset shape:", final_dataset.shape)


Final dataset shape: (8000, 15)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

TARGET = "loan_default"


import pandas as pd

df = pd.read_csv("final_training_dataset.csv")

drop_cols = ['customer_id', 'year_month']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

df = df.fillna(0)

print("Dataset Loaded. Shape:", df.shape)

X = df.select_dtypes(include=["int64", "float64"]).drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Dataset Loaded. Shape: (8000, 13)

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.98      0.90      1546
           1       0.85      0.30      0.45       454

    accuracy                           0.83      2000
   macro avg       0.84      0.64      0.67      2000
weighted avg       0.83      0.83      0.80      2000

ROC-AUC: 0.87266984288002
