### Generating Fake Data

In [1]:
import random
import datetime
import pandas as pd

def generate_fake_user(i):
    savings = [random.randint(10000, 100000) for _ in range(2)]
    expenditure = [random.randint(20000, 100000) for _ in range(2)]
    
    savings_accounts = [{"bank_name": random.choice(["SBI", "HDFC", "ICICI", "Axis"]), 
                         "balance": random.randint(10000, 100000)} for _ in range(random.randint(1, 3))]
    
    current_accounts = [{"bank_name": random.choice(["SBI", "HDFC", "ICICI", "Axis"]), 
                         "balance": random.randint(5000, 50000)} for _ in range(random.randint(1, 2))]
    
    investments = [{"stock": random.choice(["RELIANCE", "TCS", "INFY", "HDFC"]), 
                    "quantity": random.randint(1, 20), 
                    "value": random.randint(1000, 50000)} for _ in range(random.randint(0, 3))]
    
    loans = [{"type": random.choice(["home", "education", "personal"]), 
              "amount": random.randint(50000, 1000000), 
              "emi": random.randint(1000, 20000)} for _ in range(random.randint(0, 2))]
    
    assets = [{"type": random.choice(["Land", "Real Estate", "Gold", "Vehicle"]), 
               "value": random.randint(50000, 5000000)} for _ in range(random.randint(0, 3))]
    
    job = {
        "company": random.choice(["Google", "Microsoft", "Amazon", "TCS"]),
        "designation": random.choice(["Software Developer", "Data Analyst", "Manager"]),
        "salary": random.randint(50000, 200000)
    }
    
    # Generate mock credit score as a target
    credit_score = random.randint(650, 800)
    
    return {
        "id": i,
        "savings": savings,
        "expenditure": expenditure,
        "savings_accounts": savings_accounts,
        "current_accounts": current_accounts,
        "investments": investments,
        "loans": loans,
        "assets": assets,
        "job": job,
        "credit_score": credit_score
    }

# Generate 50 users
fake_users = [generate_fake_user(i) for i in range(50)]

# Convert to DataFrame for training
def extract_features(user):
    return {
        "total_savings": sum(user["savings"]),
        "total_expenditure": sum(user["expenditure"]),
        "savings_rate": sum(user["savings"]) / max(sum(user["expenditure"]), 1),
        "num_savings_accounts": len(user["savings_accounts"]),
        "num_current_accounts": len(user["current_accounts"]),
        "total_account_balance": sum(acc["balance"] for acc in user["savings_accounts"] + user["current_accounts"]),
        "num_investments": len(user["investments"]),
        "total_investment": sum(inv["value"] for inv in user["investments"]),
        "num_loans": len(user["loans"]),
        "total_loan_amount": sum(loan["amount"] for loan in user["loans"]),
        "total_loan_emi": sum(loan["emi"] for loan in user["loans"]),
        "num_assets": len(user["assets"]),
        "total_asset_value": sum(asset["value"] for asset in user["assets"]),
        "salary": user["job"]["salary"],
        "credit_score": user["credit_score"]
    }

df = pd.DataFrame([extract_features(u) for u in fake_users])
print(df.head())


   total_savings  total_expenditure  savings_rate  num_savings_accounts  \
0         109715             154009      0.712393                     2   
1         140263              77480      1.810312                     2   
2         117746             127622      0.922615                     1   
3         160554             138659      1.157905                     3   
4         102403             175125      0.584742                     1   

   num_current_accounts  total_account_balance  num_investments  \
0                     1                 148632                0   
1                     2                 180971                2   
2                     2                  90446                3   
3                     1                 172204                0   
4                     1                  29132                0   

   total_investment  num_loans  total_loan_amount  total_loan_emi  num_assets  \
0                 0          1             290019            1191

### Training RandomForest and finding its MSE

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = df.drop(columns=["credit_score"])
y = df["credit_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)


Test MSE: 1120.9446700000008


### Applying HPO to RandomForest

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
y_pred_best = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Test MSE: 1120.9446700000008


In [5]:
mse = mean_squared_error(y_test, y_pred_best)
print("Test MSE:", mse)

Test MSE: 1364.964656214141


### "Stacked Ensemble: Random Forest + Gradient Boosting with Linear Regression Meta-Model"

In [6]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

rf = RandomForestRegressor(n_estimators=300, max_depth=15, min_samples_leaf=2, random_state=42)
gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)

rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
from sklearn.linear_model import LinearRegression
import numpy as np

# Base predictions
rf_pred = rf.predict(X_train)
gb_pred = gb.predict(X_train)

# Stack features
stack_X = np.column_stack((rf_pred, gb_pred))
meta_model = LinearRegression()
meta_model.fit(stack_X, y_train)

# Predict on test
rf_test_pred = rf.predict(X_test)
gb_test_pred = gb.predict(X_test)
stack_X_test = np.column_stack((rf_test_pred, gb_test_pred))
final_pred = meta_model.predict(stack_X_test)
mse = mean_squared_error(y_test, final_pred)
print("Test MSE:", mse)


Test MSE: 1094.6302860067833


### Testing Stacked model with a data

In [None]:
sample_user = {
    "savings": [40000, 50000],
    "expenditure": [59999.99, 50000],
    "savings_accounts": [
        {"bank_name": "SBI", "balance": 50000}
    ],
    "current_accounts": [
        {"bank_name": "HDFC", "balance": 50000}
    ],
    "investments": [
        {"stock": "RELIANCE", "quantity": 2, "value": 2000}
    ],
    "loans": [
        {"type": "education", "amount": 500000, "emi": 5000}
    ],
    "assets": [
        {"type": "Land", "value": 1000000}
    ],
    "job": {
        "company": "Google",
        "designation": "Software Developer",
        "salary": 100000
    }
}

# Convert to model features
sample_features = {
    "total_savings": sum(sample_user["savings"]),
    "total_expenditure": sum(sample_user["expenditure"]),
    "savings_rate": sum(sample_user["savings"]) / sum(sample_user["expenditure"]),
    "num_savings_accounts": len(sample_user["savings_accounts"]),
    "num_current_accounts": len(sample_user["current_accounts"]),
    "total_account_balance": sum(acc["balance"] for acc in sample_user["savings_accounts"] + sample_user["current_accounts"]),
    "num_investments": len(sample_user["investments"]),
    "total_investment": sum(inv["value"] for inv in sample_user["investments"]),
    "num_loans": len(sample_user["loans"]),
    "total_loan_amount": sum(loan["amount"] for loan in sample_user["loans"]),
    "total_loan_emi": sum(loan["emi"] for loan in sample_user["loans"]),
    "num_assets": len(sample_user["assets"]),
    "total_asset_value": sum(asset["value"] for asset in sample_user["assets"]),
    "salary": sample_user["job"]["salary"]
}

# Convert to DataFrame for prediction
import pandas as pd
sample_df = pd.DataFrame([sample_features])

# Predict using trained model
predicted_credit_score = model.predict(sample_df)[0]

print("Predicted Credit Score:", predicted_credit_score)


Predicted Credit Score: 706.89


### Saving the Model

In [7]:
import joblib

# Save trained model
joblib.dump(rf, "rf_model.pkl")
joblib.dump(gb, "gb_model.pkl")
joblib.dump(meta_model , "meta_model.pkl")

['meta_model.pkl']