<a href="https://colab.research.google.com/github/sahil9022-crypto/data-science-project-all-/blob/main/Loan_Approval_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install -q yfinance
import yfinance as yf


In [7]:
# Install dependencies (Colab)
# ---------------------------
!pip install -q scikit-learn xgboost shap gradio pandas matplotlib joblib

# ---------------------------
# Imports
# ---------------------------
import os
import json
import math
import random
import requests
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import gradio as gr
import matplotlib.pyplot as plt
import shap

# ---------------------------
# 1) Create / Load Dataset
# ---------------------------
# We'll generate a synthetic dataset that resembles loan application data.
def generate_synthetic_data(n=12000, seed=42):
    """Build a reproducible synthetic loan-application table.

    Args:
        n: Number of applicant rows to generate.
        seed: Seed applied to the global ``random`` and ``numpy.random`` state.

    Returns:
        pandas.DataFrame holding the raw applicant attributes, the derived
        ``loan_to_income`` and ``dti_ratio`` columns, and a 0/1 ``approved``
        label produced by a points rule plus a small random bonus.
    """
    random.seed(seed)
    np.random.seed(seed)

    def clipped_normal(mean, std, low, high):
        # Rounded integer normal draw, bounded to [low, high].
        draws = np.round(np.random.normal(mean, std, n)).astype(int)
        return np.clip(draws, low, high)

    # NOTE: the order of the draws below fixes the random stream; keep it.
    applicants = {
        'age': np.random.randint(21, 60, n),
        'gender': np.random.choice(['Male', 'Female'], n, p=[0.75, 0.25]),
        'marital_status': np.random.choice(['Single', 'Married'], n, p=[0.35, 0.65]),
        'employment_type': np.random.choice(
            ['Salaried', 'Self-employed', 'Contract'], n, p=[0.6, 0.25, 0.15]
        ),
        'annual_income': clipped_normal(600_000, 250_000, 60_000, 5_000_000),
        'credit_score': clipped_normal(680, 60, 300, 850),
        'loan_amount': clipped_normal(500_000, 300_000, 50_000, 5_000_000),
        'loan_tenure_years': np.random.choice(
            [1, 2, 3, 5, 7, 10, 15, 20, 25],
            n,
            p=[0.05, 0.05, 0.1, 0.3, 0.15, 0.15, 0.1, 0.05, 0.05],
        ),
        'collateral': np.random.choice(['Yes', 'No'], n, p=[0.25, 0.75]),
        'existing_emis': clipped_normal(8000, 6000, 0, 80_000),
    }

    # Derived ratios; the +1 keeps the denominator away from zero.
    income = applicants['annual_income']
    applicants['loan_to_income'] = applicants['loan_amount'] / (income + 1)
    applicants['dti_ratio'] = (applicants['existing_emis'] * 12) / (income + 1)

    # Points-based "ground truth": credit score is worth up to 2 points,
    # every other favourable condition is worth 1.
    score_band = applicants['credit_score']
    points = np.where(score_band >= 700, 2, np.where(score_band >= 650, 1, 0))
    points = (
        points
        + (income > 400_000)
        + (applicants['loan_to_income'] < 1.5)
        + (applicants['dti_ratio'] < 0.4)
        + (applicants['collateral'] == 'Yes')
        + (applicants['employment_type'] == 'Salaried')
    )

    # One random draw per applicant, made after all column draws.
    bonus = [np.random.choice([0, 0, 1], p=[0.7, 0.2, 0.1]) for _ in range(n)]
    total = points + np.array(bonus, dtype=int)
    applicants['approved'] = (total >= 4).astype(int).tolist()

    return pd.DataFrame(applicants)

# generate dataset
# Build 10,000 synthetic applications (an explicit row count rather than the
# function's default) with the default seed, then print the first rows as a
# quick sanity check. `df` is the module-level frame used for training below.
df = generate_synthetic_data(10000)
print('Dataset sample:')
print(df.head())

# Optional: allow user to upload their own dataset later

# ---------------------------
# 2) Preprocessing + Modeling
# ---------------------------
# Model inputs: the raw applicant fields plus the two derived ratios.
FEATURES = ['age','gender','marital_status','employment_type','annual_income','credit_score','loan_amount','loan_tenure_years','collateral','existing_emis','loan_to_income','dti_ratio']
# Binary label column (1 = approved, 0 = rejected).
TARGET = 'approved'

# split
# 80/20 train/test split with a fixed seed; stratified on the label so both
# parts keep the same approval ratio.
X = df[FEATURES]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# preprocessing pipeline
# Numeric columns are standardised, categorical columns are one-hot encoded.
# Together the two lists cover every entry of FEATURES.
numeric_features = ['age','annual_income','credit_score','loan_amount','loan_tenure_years','existing_emis','loan_to_income','dti_ratio']
cat_features = ['gender','marital_status','employment_type','collateral']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]) # `sparse=False` deliberately not passed; unknown categories are ignored at predict time
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', cat_transformer, cat_features)
])

# models
rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
log = LogisticRegression(max_iter=500)

# NOTE(review): both pipelines hold the SAME `preprocessor` object, so fitting
# the second pipeline re-fits the step used by the first. That is harmless here
# because both are fitted on the same X_train, but presumably each pipeline
# should own its own copy if the training data ever differs - confirm.
rf_pipeline = Pipeline(steps=[('pre', preprocessor), ('clf', rf)])
log_pipeline = Pipeline(steps=[('pre', preprocessor), ('clf', log)])

# train
print('Training RandomForest...')
rf_pipeline.fit(X_train, y_train)
print('Training LogisticRegression...')
log_pipeline.fit(X_train, y_train)

# evaluate
# Hold-out accuracy only; no other metric is reported.
for name, model in [('RandomForest', rf_pipeline), ('LogisticRegression', log_pipeline)]:
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f'{name} accuracy: {acc:.4f}')

# Persist models
# Each file stores the full pipeline (preprocessing + classifier), so a loaded
# model accepts a raw FEATURES-shaped DataFrame directly.
os.makedirs('models', exist_ok=True)
joblib.dump(rf_pipeline, 'models/rf_pipeline.joblib')
joblib.dump(log_pipeline, 'models/log_pipeline.joblib')

# ---------------------------
# 3) Bank Policies (simulate dynamic bank APIs)
# ---------------------------
# You can expand this dict or load it from a remote endpoint.
# Per-bank lending rules, keyed by the bank name shown in the UI.
# Fields, as consumed by the rule check and EMI calculation in this file:
#   min_credit_score   - applicant credit score must be >= this value
#   max_loan_amount    - requested loan amount must be <= this value
#   max_loan_to_income - loan_amount / (annual_income + 1) must be <= this value
#   min_annual_income  - applicant annual income must be >= this value
#   base_interest_rate - annual rate as a decimal fraction (0.095 == 9.5%)
BANK_POLICIES = {
    'HDFC Bank': {
        'min_credit_score': 650,
        'max_loan_amount': 2_500_000,
        'max_loan_to_income': 2.0,
        'min_annual_income': 100_000,
        'base_interest_rate': 0.095  # 9.5% (example)
    },
    'ICICI Bank': {
        'min_credit_score': 680,
        'max_loan_amount': 3_000_000,
        'max_loan_to_income': 1.8,
        'min_annual_income': 150_000,
        'base_interest_rate': 0.10
    },
    'SBI': {
        'min_credit_score': 640,
        'max_loan_amount': 5_000_000,
        'max_loan_to_income': 2.5,
        'min_annual_income': 80_000,
        'base_interest_rate': 0.085
    }
}

# ---------------------------
# 4) Real-time data fetch example (no API key)
# ---------------------------
# Example: fetch the latest INR/USD exchange rate from Yahoo Finance (a crude proxy for "market movements")
# This is only demonstrative; swap for better financial APIs for production.

def fetch_example_market_rate(ticker="INRUSD=X"):
    """Fetch the most recent hourly close for a Yahoo Finance ticker.

    No API key is required. This is a best-effort call: any failure (network
    error, empty response, unexpected frame layout) is reported on stdout and
    ``None`` is returned, so callers must handle a missing rate.

    Args:
        ticker: Yahoo Finance symbol; defaults to the INR -> USD exchange rate.

    Returns:
        The latest close as a ``float``, or ``None`` when it cannot be fetched.
    """
    try:
        # progress=False keeps the download progress bar out of the app logs.
        data = yf.download(ticker, period="1d", interval="1h", progress=False)
        close = data["Close"].dropna()
        if len(close) == 0:
            # Market closed / unknown symbol: no rows came back.
            print("Real-time fetch failed, returning fallback value. Error:",
                  f"no price data returned for {ticker}")
            return None
        latest = close.iloc[-1]
        # With MultiIndex columns (returned by some yfinance releases even for
        # one ticker) `latest` is a one-element Series rather than a scalar.
        if hasattr(latest, "iloc"):
            latest = latest.iloc[0]
        return float(latest)
    except Exception as e:
        # Deliberately broad: this is an optional, demonstrative data source.
        print("Real-time fetch failed, returning fallback value. Error:", e)
        return None


# ---------------------------
# 5) Helper functions: EMI, bank rule check, explainability
# ---------------------------

def calculate_emi(principal, annual_rate, tenure_years):
    """Return the fixed monthly instalment (EMI) for an amortising loan.

    Uses ``EMI = P * r * (1 + r)**n / ((1 + r)**n - 1)`` where ``r`` is the
    monthly rate and ``n`` the number of monthly payments.

    Args:
        principal: Amount borrowed.
        annual_rate: Annual interest rate as a decimal (e.g. ``0.09`` for 9%).
        tenure_years: Loan duration in years; must be positive.

    Returns:
        The monthly payment. For a zero rate this is simply ``principal / n``.

    Raises:
        ValueError: If ``tenure_years`` is zero or negative (previously this
            surfaced as an opaque ``ZeroDivisionError`` or a meaningless value).
    """
    n = tenure_years * 12
    if n <= 0:
        raise ValueError(f"tenure_years must be positive, got {tenure_years!r}")
    r = annual_rate / 12
    if r == 0:
        # Interest-free loan: the annuity formula would divide by zero.
        return principal / n
    growth = (1 + r) ** n
    return principal * r * growth / (growth - 1)


def apply_bank_rules(bank_name, applicant, policies=None):
    """Check an applicant against one bank's lending policy.

    Args:
        bank_name: Key of the bank in the policy table.
        applicant: Mapping with at least ``credit_score``, ``loan_amount``,
            ``loan_to_income`` and ``annual_income``.
        policies: Optional policy table (bank name -> rule dict with
            ``min_credit_score``, ``max_loan_amount``, ``max_loan_to_income``
            and ``min_annual_income``). Defaults to the module-level
            ``BANK_POLICIES``, so existing two-argument calls are unchanged.

    Returns:
        ``(ok, reasons)``: ``ok`` is ``True`` when every rule passes;
        ``reasons`` lists one human-readable message per failed rule, or a
        single "bank not found" message when ``bank_name`` is unknown.
    """
    if policies is None:
        policies = BANK_POLICIES
    policy = policies.get(bank_name)
    if policy is None:
        return False, ['Bank not found in policies']

    reasons = []
    if applicant['credit_score'] < policy['min_credit_score']:
        reasons.append(f"Credit score below bank minimum ({applicant['credit_score']} < {policy['min_credit_score']})")
    if applicant['loan_amount'] > policy['max_loan_amount']:
        reasons.append(f"Requested loan exceeds bank's max loan amount ({applicant['loan_amount']} > {policy['max_loan_amount']})")
    if applicant['loan_to_income'] > policy['max_loan_to_income']:
        reasons.append(f"Loan-to-income ratio too high ({applicant['loan_to_income']:.2f} > {policy['max_loan_to_income']})")
    if applicant['annual_income'] < policy['min_annual_income']:
        reasons.append(f"Annual income below bank's minimum ({applicant['annual_income']} < {policy['min_annual_income']})")
    # No collected reasons means every bank rule is satisfied.
    return (not reasons), reasons

# SHAP explainer (TreeExplainer for RandomForest)
# The explainer works on the data as the classifier sees it, i.e. after the
# pipeline's preprocessing step.
print('Preparing SHAP explainer (this may take a moment)...')
# Only *transform* with the preprocessor that was fitted inside rf_pipeline.
# Calling fit_transform on the shared `preprocessor` object would re-fit the
# very step the already trained pipelines depend on.
pre_X = rf_pipeline.named_steps['pre'].transform(X_train)
try:
    explainer = shap.TreeExplainer(rf_pipeline.named_steps['clf'])
    shap_values = explainer.shap_values(pre_X)
except Exception as e:
    # Explainability is optional: no alternative explainer is built, the
    # globals are just set to None so later code can detect the failure.
    print('TreeExplainer failed:', e)
    explainer = None
    shap_values = None

# ---------------------------
# 6) Gradio Interface
# ---------------------------

# load pipelines
# Reload the two pipelines written to disk earlier in this notebook, so the UI
# serves exactly the persisted artefacts.
# NOTE(review): joblib files are pickle-based - only load files this notebook
# produced itself, never untrusted ones.
rf_model = joblib.load('models/rf_pipeline.joblib')
log_model = joblib.load('models/log_pipeline.joblib')

# Bank names offered in the UI dropdown (BANK_POLICIES key order).
BANK_LIST = list(BANK_POLICIES.keys())

# Gradio prediction function

def _positive_class_shap_values(raw):
    """Return the 1-D SHAP vector of class 1 for the first row of a batch.

    Tree explainers have returned classifier attributions in two layouts
    depending on the installed SHAP release: a list with one
    ``(n_samples, n_features)`` array per class, or a single
    ``(n_samples, n_features, n_classes)`` array. Indexing the latter with
    ``[1]`` would select *sample* 1, which does not exist for one applicant.
    """
    if isinstance(raw, list):
        per_class = raw[1] if len(raw) > 1 else raw[0]
        return np.asarray(per_class)[0]
    arr = np.asarray(raw)
    if arr.ndim == 3:
        return arr[0, :, 1]
    # Single-output layout: (n_samples, n_features), or already 1-D.
    return arr[0] if arr.ndim == 2 else arr.ravel()


def predict_and_explain(age, gender, marital_status, employment_type, annual_income, credit_score, loan_amount, loan_tenure_years, collateral, existing_emis, bank_name, model_choice='RandomForest'):
    """Score one loan application and assemble the UI response.

    The applicant is scored by the chosen model, checked against the selected
    bank's rules, priced (EMI at the bank base rate plus a credit-score risk
    premium) and explained with a SHAP bar chart.

    Args:
        age, gender, marital_status, employment_type, annual_income,
        credit_score, loan_amount, loan_tenure_years, collateral,
        existing_emis: Raw applicant fields as delivered by the UI widgets;
            numeric ones are coerced with ``int``/``float`` here.
        bank_name: Key into ``BANK_POLICIES``.
        model_choice: ``'RandomForest'`` selects the random forest; any other
            value selects logistic regression.

    Returns:
        ``(output, shap_image_path)`` where ``output`` is a JSON-serialisable
        dict (verdict, probability, rates, EMI, market rate, reasons) and
        ``shap_image_path`` is the saved chart path or ``None`` if the
        explanation could not be produced.

    NOTE(review): the SHAP chart is always computed from the random-forest
    pipeline, even when logistic regression is the selected model.
    """
    # Build applicant dict
    applicant = {
        'age': int(age),
        'gender': gender,
        'marital_status': marital_status,
        'employment_type': employment_type,
        'annual_income': float(annual_income),
        'credit_score': int(credit_score),
        'loan_amount': float(loan_amount),
        'loan_tenure_years': int(loan_tenure_years),
        'collateral': collateral,
        'existing_emis': float(existing_emis)
    }
    # Same derived features (and +1 denominator guard) as used for training.
    applicant['loan_to_income'] = applicant['loan_amount'] / (applicant['annual_income'] + 1)
    applicant['dti_ratio'] = (applicant['existing_emis']*12) / (applicant['annual_income'] + 1)

    X_app = pd.DataFrame([applicant])[FEATURES]

    model = rf_model if model_choice == 'RandomForest' else log_model
    proba = model.predict_proba(X_app)[0][1]
    pred = int(proba >= 0.5)

    # Bank-specific rules
    bank_ok, bank_reasons = apply_bank_rules(bank_name, applicant)

    # Final verdict requires both a positive model decision and passing bank rules.
    final_approved = pred == 1 and bank_ok

    # EMI calculation using bank base rate + small risk premium
    bank_policy = BANK_POLICIES.get(bank_name)
    base_rate = bank_policy['base_interest_rate'] if bank_policy else 0.10
    # Premium grows as the credit score drops below 700; never negative.
    risk_premium = max(0, (700 - applicant['credit_score'])/2000)
    annual_rate = base_rate + risk_premium
    emi = calculate_emi(applicant['loan_amount'], annual_rate, applicant['loan_tenure_years'])

    # Real-time fetch example (may be None when the fetch fails)
    market_rate = fetch_example_market_rate()

    # Explainability: bar chart of absolute SHAP values saved as an image.
    shap_image_path = None
    try:
        prep = rf_model.named_steps['pre']
        clf = rf_model.named_steps['clf']
        Xp = prep.transform(X_app)
        if hasattr(Xp, 'toarray'):
            # Densify in case the preprocessor emitted a sparse matrix.
            Xp = Xp.toarray()
        raw_sv = shap.TreeExplainer(clf).shap_values(Xp)
        vals = np.abs(_positive_class_shap_values(raw_sv)).ravel()

        onehot = prep.named_transformers_['cat'].named_steps['onehot']
        feature_names = numeric_features + list(onehot.get_feature_names_out(cat_features))
        if len(vals) != len(feature_names):
            # Fail loudly instead of silently truncating mismatched vectors.
            raise ValueError(f'expected {len(feature_names)} SHAP values, got {len(vals)}')

        # Eight largest attributions, ascending so the biggest bar is on top.
        top = np.argsort(vals)[-8:]
        fig, ax = plt.subplots(figsize=(8,4))
        try:
            ax.barh([feature_names[i] for i in top], vals[top])
            ax.set_title('Top SHAP feature importances (abs) for this prediction')
            fig.tight_layout()
            fig.savefig('shap_explain.png')
        finally:
            # Always release the figure, even if saving failed.
            plt.close(fig)
        shap_image_path = 'shap_explain.png'
    except Exception as e:
        # The explanation is optional; the prediction is still returned.
        print('SHAP explain failed:', e)
        shap_image_path = None

    reasons = []
    if pred == 0:
        reasons.append('Model predicted REJECT (low probability)')
    else:
        reasons.append('Model predicted APPROVE (probability high)')
    if not bank_ok:
        reasons += bank_reasons
    if pred == 1 and bank_ok:
        reasons.append('Both model and bank rules satisfied')
    verdict = 'APPROVED' if final_approved else 'REJECTED'

    output = {
        'verdict': verdict,
        'model_probability': f'{proba*100:.2f}%',
        'bank_base_rate': f'{base_rate*100:.2f}%',
        'applied_annual_rate': f'{annual_rate*100:.2f}%',
        'emi_monthly': f'{emi:,.2f}',
        'market_rate_example': market_rate,
        'reasons': reasons
    }

    return output, shap_image_path

# Gradio UI components
# Two-column layout: applicant inputs on the left (wider), results on the right.
# The widget choices mirror the categories used when generating the training
# data; the order of `inputs` in submit.click must match the parameter order
# of predict_and_explain.
with gr.Blocks() as demo:
    gr.Markdown('# Loan Approval Prediction (Dynamic)')
    with gr.Row():
        with gr.Column(scale=2):
            # NOTE(review): the slider allows ages up to 70 while the synthetic
            # training data draws ages from randint(21, 60) - confirm intended.
            age = gr.Slider(21, 70, value=30, label='Age')
            gender = gr.Dropdown(['Male','Female'], value='Male', label='Gender')
            marital_status = gr.Dropdown(['Single','Married'], value='Single', label='Marital Status')
            employment_type = gr.Dropdown(['Salaried','Self-employed','Contract'], value='Salaried', label='Employment Type')
            annual_income = gr.Number(value=600000, label='Annual Income (INR)')
            credit_score = gr.Slider(300,850, value=680, label='Credit Score')
            loan_amount = gr.Number(value=500000, label='Requested Loan Amount (INR)')
            loan_tenure_years = gr.Dropdown([1,2,3,5,7,10,15,20,25], value=5, label='Loan Tenure (years)')
            collateral = gr.Dropdown(['Yes','No'], value='No', label='Collateral Available')
            existing_emis = gr.Number(value=0, label='Existing Monthly EMIs (INR)')
            bank_name = gr.Dropdown(BANK_LIST, value=BANK_LIST[0], label='Select Bank')
            model_choice = gr.Radio(['RandomForest','LogisticRegression'], value='RandomForest', label='Choose Model')
            submit = gr.Button('Predict')
        with gr.Column(scale=1):
            output_box = gr.JSON(label='Prediction & Details')
            shap_out = gr.Image(label='SHAP Explanation (if available)')

    # Thin pass-through to predict_and_explain: forwards the widget values
    # positionally and returns its (details dict, image path) pair unchanged.
    def wrapped_predict(*args):
        out, img = predict_and_explain(*args)
        return out, img

    submit.click(wrapped_predict, inputs=[age, gender, marital_status, employment_type, annual_income, credit_score, loan_amount, loan_tenure_years, collateral, existing_emis, bank_name, model_choice], outputs=[output_box, shap_out])

# Launch Gradio (share=True gives a public URL in Colab)
demo.launch(share=True)

Dataset sample:
   age  gender marital_status employment_type  annual_income  credit_score  \
0   59    Male        Married        Salaried         270033           623   
1   49    Male         Single        Contract         860330           745   
2   35    Male         Single        Salaried        1353272           609   
3   28  Female        Married        Salaried         925298           573   
4   41    Male        Married        Contract         902676           621   

   loan_amount  loan_tenure_years collateral  existing_emis  loan_to_income  \
0       294031                  7        Yes              0        1.088867   
1       709243                  7         No          14290        0.824384   
2       732806                  3         No          22660        0.541506   
3       445047                  5        Yes           5261        0.480976   
4       858587                  5         No           5221        0.951156   

   dti_ratio  approved  
0   0.000000   



📊 Loan Approval Prediction System
🔹 Project Overview

This project is a dynamic data science application that predicts loan approvals for customers based on their profile. It works with an interactive Gradio dashboard where users enter their details (income, employment, loan amount, credit history, etc.).

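Additionally, the project fetches real-time financial data using Yahoo Finance (yfinance). The current code retrieves the INR/USD exchange rate; other tickers (e.g., Sensex, Nifty) can be fetched the same way. The fetched rate is currently reported alongside each prediction; using it to dynamically adjust loan-approval conditions based on market trends is a planned extension.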

🔹 Features

✅ Interactive Gradio Dashboard – Simple UI to enter customer details.
✅ Dynamic Loan Approval Prediction – Model predicts loan approval instantly.
✅ Bank Policy Check – Validates the application against the selected bank's lending rules (HDFC Bank, ICICI Bank, SBI) and lists every rule that fails.
✅ Real-Time Financial Data (Yahoo Finance) – No API key required.
✅ Deployment-ready – Can be hosted on Hugging Face Spaces or shared through a public Colab link.

🔹 Tech Stack

Python 3.10+

Gradio → Interactive dashboard

Scikit-learn → Machine Learning model

Pandas, Numpy → Data preprocessing

Matplotlib, Seaborn → Data visualization

yfinance → Real-time market rates (no API key needed)

🔹 Project Workflow

Data Preprocessing – Clean and transform customer dataset.

Model Training – Logistic Regression / Random Forest classifier.

Feature Input (Gradio) – User enters details like:

Applicant Income

Employment Status

Loan Amount

Credit History

Property Area

Gender, Marital Status, Dependents

Prediction – Model predicts if the loan will be approved ✅ or rejected ❌.

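Bank Policy Check – Applies the selected bank's conditions (minimum credit score, maximum loan amount, maximum loan-to-income ratio, minimum income) and reports which ones are not met.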

Dynamic Market Data – Yahoo Finance data (e.g., INR/USD) is fetched on each prediction and shown with the result; adjusting loan approval thresholds from it is not implemented yet.

🔹 Setup Instructions (Google Colab / Local)
1️⃣ Install dependencies
!pip install gradio scikit-learn pandas numpy matplotlib seaborn yfinance shap joblib

2️⃣ Run the app
import gradio as gr
# `demo` is the gr.Blocks app built in the notebook's main cell
demo.launch()

3️⃣ Public link (Colab)
demo.launch(share=True)

🔹 Example Screenshot

(Add screenshot of your Gradio dashboard here once you run it)

🔹 Future Enhancements

✅ Deploy on Hugging Face Spaces (Free Hosting).

✅ Add multiple bank APIs for live loan policy integration.

✅ Advanced models (XGBoost, Neural Networks).

✅ User authentication & history tracking.

🔹 Author

👨‍💻 Sahil Pawar
📍 Sangli, Maharashtra
📧 Email: [publichacker9999@gmail.com]
