# 🤖 Bank Loan & Fraud Detection - Machine Learning Model
This notebook builds machine learning models to predict:

- ✅ Loan Approval Status
- ✅ Default Risk Assessment
- ✅ Fraud Detection

✅ The notebook also addresses real-world challenges such as:
- Handling class imbalance
- Regularization using Ridge and Lasso
- Cross-validation for robust evaluation
- Residual analysis for model validation

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


In [None]:

# Load the processed dataset from EDA
df_ml = pd.read_csv('Bank_Data_RealWorld.csv')

# Define feature columns and target variables
feature_cols = ['Age', 'Income', 'LoanAmount', 'LoanTerm', 'CreditHistory', 'TransactionCount', 
                'PropertyArea_Urban', 'PropertyArea_Semiurban', 'MaritalStatus_Married', 'MaritalStatus_Divorced']

X = df_ml[feature_cols]
y_loan_status = df_ml['LoanStatus']
y_default_risk = df_ml['DefaultRisk']
y_fraud_flag = df_ml['FraudFlag']

# Split the raw (unscaled) features for Loan Approval Prediction FIRST.
# Fitting the scaler before splitting would let the held-out rows influence
# the mean/std used to transform the training rows (data leakage).
X_train_raw_l, X_test_raw_l, y_train_l, y_test_l = train_test_split(
    X, y_loan_status, test_size=0.2, random_state=42, stratify=y_loan_status
)

# Fit the scaler on the training rows only, then apply those same statistics
# to the test rows.
scaler = StandardScaler()
X_train_l = scaler.fit_transform(X_train_raw_l)
X_test_l = scaler.transform(X_test_raw_l)

# Full feature matrix transformed with the training-fitted scaler. Kept under
# its original name because other cells split / resample from X_scaled.
X_scaled = scaler.transform(X)


In [None]:

# --- Loan Approval Prediction ---
# Two baseline classifiers trained on the same loan-approval training split:
# a logistic regression (up to 1000 solver iterations) and a 100-tree random
# forest with a fixed seed. `fit` returns the estimator itself, so each model
# is constructed and trained in one expression.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_l, y_train_l)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_l, y_train_l)

# Held-out predictions from each model
y_pred_logistic_l = logistic_model.predict(X_test_l)
y_pred_rf_l = rf_model.predict(X_test_l)

# Overall accuracy for each model
loan_accuracy_logistic = accuracy_score(y_test_l, y_pred_logistic_l)
print("Logistic Regression Accuracy:", loan_accuracy_logistic)
loan_accuracy_rf = accuracy_score(y_test_l, y_pred_rf_l)
print("Random Forest Accuracy:", loan_accuracy_rf)

# Per-class precision / recall / F1 for each model
loan_report_logistic = classification_report(y_test_l, y_pred_logistic_l)
print("\nClassification Report (Logistic):\n", loan_report_logistic)
loan_report_rf = classification_report(y_test_l, y_pred_rf_l)
print("\nClassification Report (RF):\n", loan_report_rf)


In [None]:

# --- Handling Class Imbalance using SMOTE ---
# Oversample the TRAINING split only. Resampling the whole dataset before
# splitting would put synthetic points (interpolated from training rows) into
# the test set and evaluate on an artificially balanced distribution, which
# inflates the reported scores.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_l, y_train_l)

# The evaluation set is the original, untouched loan-approval test split.
X_test_res, y_test_res = X_test_l, y_test_l

# Older names kept so any cell that still references them keeps running;
# they now refer to the resampled training data.
X_resampled, y_resampled = X_train_res, y_train_res

# Refit models on resampled data.
# Note: this overwrites the fitted state of logistic_model / rf_model, so from
# here on they are the SMOTE-trained versions.
logistic_model.fit(X_train_res, y_train_res)
rf_model.fit(X_train_res, y_train_res)

# Predictions on the real (non-synthetic) held-out rows
y_pred_resampled_logistic = logistic_model.predict(X_test_res)
y_pred_resampled_rf = rf_model.predict(X_test_res)

# Evaluation
print("Logistic Regression Accuracy after SMOTE:", accuracy_score(y_test_res, y_pred_resampled_logistic))
print("Random Forest Accuracy after SMOTE:", accuracy_score(y_test_res, y_pred_resampled_rf))

# Accuracy alone can hide poor minority-class performance, which is the thing
# SMOTE is meant to improve, so report per-class precision / recall as well.
print("\nClassification Report after SMOTE (Logistic):\n", classification_report(y_test_res, y_pred_resampled_logistic))
print("\nClassification Report after SMOTE (RF):\n", classification_report(y_test_res, y_pred_resampled_rf))


In [None]:

# --- Fraud and Default Prediction ---
# Only the FraudFlag target is modelled in this cell; y_default_risk is not
# used here.

# Stratified 80/20 split on the fraud label
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_scaled,
    y_fraud_flag,
    test_size=0.2,
    random_state=42,
    stratify=y_fraud_flag,
)

# 100-tree random forest with a fixed seed; `fit` returns the estimator itself
rf_fraud = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_f, y_train_f)

# Held-out fraud predictions
y_pred_fraud = rf_fraud.predict(X_test_f)

# Overall accuracy followed by the per-class report
fraud_accuracy = accuracy_score(y_test_f, y_pred_fraud)
print("Fraud Detection Accuracy:", fraud_accuracy)
fraud_report = classification_report(y_test_f, y_pred_fraud)
print("\nClassification Report (Fraud Detection):\n", fraud_report)
