# 📚 Loan Approval Prediction - Real-World Enhanced Model
This notebook walks through complete data preprocessing and model optimization for the **Loan Approval Prediction Dataset**:

- ✅ Feature selection using P-values and VIF
- ✅ Handling outliers using Z-score/IQR
- ✅ Ridge and Lasso Regularization to prevent overfitting
- ✅ Residual analysis for model assumptions
- ✅ Handling missing data
- ✅ Scaling and transforming features
- ✅ Proper handling of categorical variables

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [None]:

# Loan Approval Dataset
# Small in-memory sample: four continuous measurements, one binary credit flag,
# one categorical area label and the binary LoanStatus target.
data_loan = {
    'LoanAmount': [150, 120, 180, 130, 140, 200, 160, 110, 170, 190],
    'ApplicantIncome': [4000, 3500, 6000, 3000, 4500, 7500, 4200, 3200, 5800, 6200],
    'CoapplicantIncome': [0, 1500, 1800, 0, 1200, 2000, 0, 800, 1500, 0],
    'LoanTerm': [360, 180, 360, 240, 360, 180, 300, 360, 360, 180],
    'CreditHistory': [1, 0, 1, 1, 1, 0, 1, 1, 0, 1],
    'PropertyArea': ['Urban', 'Rural', 'Urban', 'Semiurban', 'Urban', 'Rural', 'Semiurban', 'Urban', 'Rural', 'Urban'],
    'LoanStatus': [1, 0, 1, 1, 1, 0, 1, 0, 0, 1]
}

# Create DataFrame
df_loan = pd.DataFrame(data_loan)

# --- Check for Missing Values ---
print("Loan Data Missing Values:")
print(df_loan.isnull().sum())

# --- Outlier Detection & Removal ---
# Z-scores are computed on the continuous measurements only. The binary
# CreditHistory flag and the LoanStatus target are left out so that a row is
# never discarded because of its label or a 0/1 indicator.
continuous_cols_loan = ['LoanAmount', 'ApplicantIncome', 'CoapplicantIncome', 'LoanTerm']
z_scores_loan = np.abs(stats.zscore(df_loan[continuous_cols_loan]))
df_loan_clean = df_loan[(z_scores_loan < 3).all(axis=1)]

# --- One-Hot Encoding for Categorical Variables ---
# dtype=int keeps the dummy columns numeric. pandas >= 2.0 emits bool dummies by
# default, which turns the design matrix into an object array that statsmodels
# refuses to fit.
df_loan_encoded = pd.get_dummies(
    df_loan_clean, columns=['PropertyArea'], drop_first=True, dtype=int
)

# --- Feature Selection ---
X_loan = df_loan_encoded.drop(columns=['LoanStatus'])
y_loan = df_loan_encoded['LoanStatus']

# Add constant for intercept
X_loan_const = sm.add_constant(X_loan)
model_sm_loan = sm.OLS(y_loan, X_loan_const).fit()
print(model_sm_loan.summary())

# Drop insignificant features if P-value > 0.05
# NOTE(review): the dropped column is hard-coded rather than derived from
# model_sm_loan.pvalues; re-check it against the summary above if the data changes.
X_loan_selected = X_loan_const.drop(columns=['CoapplicantIncome'], errors='ignore')

# --- VIF Check ---
# The intercept column stays in the matrix here: each VIF comes from regressing
# one column on all the others, and that auxiliary regression needs a constant.
vif_data_loan = pd.DataFrame()
vif_data_loan["Feature"] = X_loan_selected.columns
vif_data_loan["VIF"] = [
    variance_inflation_factor(X_loan_selected.values, i)
    for i in range(X_loan_selected.shape[1])
]
print("\nLoan Data VIF:\n", vif_data_loan)

# --- Feature Scaling ---
# The intercept column is removed before scaling: standardizing a constant
# yields an all-zero feature, and the sklearn estimators fit their own intercept.
X_loan_features = X_loan_selected.drop(columns=['const'], errors='ignore')
# NOTE(review): the scaler is fit on every row; any hold-out split taken from
# X_loan_scaled afterwards has already influenced the mean/std used here.
scaler_loan = StandardScaler()
X_loan_scaled = scaler_loan.fit_transform(X_loan_features)


In [None]:

# --- Train-Test Split ---
# stratify keeps both LoanStatus classes present on each side of the split,
# which matters with so few rows (a 20% hold-out can otherwise be single-class).
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_loan_scaled, y_loan, test_size=0.2, random_state=0, stratify=y_loan
)

# --- Logistic Regression, Ridge, and Lasso for Loan Status ---
logistic_loan = LogisticRegression(max_iter=1000)
ridge_loan = RidgeClassifier(alpha=1.0)
# LoanStatus is a class label, so the Lasso (L1) variant is an L1-penalised
# logistic regression rather than the LassoCV regressor, which was never fit
# and whose 5-fold CV does not suit a training set this small.
lasso_loan = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)

# Fit models
logistic_loan.fit(X_train_l, y_train_l)
ridge_loan.fit(X_train_l, y_train_l)
lasso_loan.fit(X_train_l, y_train_l)

# Predictions
y_pred_logistic_l = logistic_loan.predict(X_test_l)
y_pred_ridge_l = ridge_loan.predict(X_test_l)
y_pred_lasso_l = lasso_loan.predict(X_test_l)

# --- Evaluation for Loan Models ---
print("Logistic Regression Accuracy:", accuracy_score(y_test_l, y_pred_logistic_l))
print("Ridge Classifier Accuracy:", accuracy_score(y_test_l, y_pred_ridge_l))
print("Lasso (L1 Logistic) Accuracy:", accuracy_score(y_test_l, y_pred_lasso_l))
# zero_division=0 avoids undefined-metric warnings when a class is never
# predicted; labels=[0, 1] keeps the confusion matrix 2x2 in that case.
print("\nClassification Report (Logistic):\n",
      classification_report(y_test_l, y_pred_logistic_l, zero_division=0))
print("\nConfusion Matrix (Logistic):\n",
      confusion_matrix(y_test_l, y_pred_logistic_l, labels=[0, 1]))


In [None]:

# --- Residual Analysis for Loan Data ---
# Residual = held-out label minus the logistic model's predicted label.
residuals_l = y_test_l - y_pred_logistic_l

# Draw on an explicit Axes so the title and label attach to this figure only.
fig_resid_l, ax_resid_l = plt.subplots(figsize=(6, 4))
sns.histplot(residuals_l, kde=True, bins=10, ax=ax_resid_l)
ax_resid_l.set_title('Residuals for Loan Prediction Model')
ax_resid_l.set_xlabel('Residuals')
plt.show()
