In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

In [2]:
train_df = pd.read_csv("./data/loan-train.csv")
test_df = pd.read_csv("./data/loan-test.csv")

In [3]:
# Drop Loan_ID (not useful for prediction)
train_df.drop(columns=['Loan_ID'], inplace=True)
test_df.drop(columns=['Loan_ID'], inplace=True)


In [4]:
# Convert Loan_Status to binary (Y -> 1, N -> 0)
train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y': 1, 'N': 0})

In [5]:
# Handle Missing Values
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    train_df[col].fillna(train_df[col].mode()[0], inplace=True)
    test_df[col].fillna(test_df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mode()[0], inplace=True)


In [6]:
def detect_outliers(data, col):
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return data[(data[col] < lower_bound) | (data[col] > upper_bound)]

outliers = detect_outliers(train_df, 'LoanAmount')

In [7]:
len(outliers)

39

In [8]:
if len(outliers) > 0:
    imputer = SimpleImputer(strategy='median')
else:
    imputer = SimpleImputer(strategy='mean')

In [9]:
train_df['LoanAmount'] = imputer.fit_transform(train_df[['LoanAmount']])
test_df['LoanAmount'] = imputer.transform(test_df[['LoanAmount']])

In [10]:
# Loan_Amount_Term - Fill missing with mode
train_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mode()[0], inplace=True)
test_df['Loan_Amount_Term'].fillna(test_df['Loan_Amount_Term'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Loan_Amount_Term'].fillna(test_df['Loan_Amount_Term'].mode()[0], inplace=True)


In [11]:
# Feature Engineering
train_df['TotalIncome'] = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']
test_df['TotalIncome'] = test_df['ApplicantIncome'] + test_df['CoapplicantIncome']
train_df.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], inplace=True)
test_df.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], inplace=True)

In [12]:
# Highlight: Replace LabelEncoder with manual category mapping
category_mappings = {}
for col in ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']:
    unique_values = train_df[col].unique()
    category_mappings[col] = {val: idx for idx, val in enumerate(unique_values)}

# Apply mappings to encode the categorical columns
for col in category_mappings:
    train_df[col] = train_df[col].map(category_mappings[col])
    test_df[col] = test_df[col].map(category_mappings[col])

# Save category mappings
joblib.dump(category_mappings, 'category_mappings.pkl')

['category_mappings.pkl']

In [13]:
train_df.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,0,0,0,0,0,128.0,360.0,1.0,0,1,5849.0
1,0,1,1,0,0,128.0,360.0,1.0,1,0,6091.0


In [14]:
train_df["Loan_Status"].value_counts()

Loan_Status
1    422
0    192
Name: count, dtype: int64

In [15]:
# Split Data
X = train_df.drop(columns=['Loan_Status'])
y = train_df['Loan_Status']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
# Handle Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [17]:
X_train[:5]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome
0,0,0,0,0,0,50.0,360.0,1.0,0,3254.0
1,0,1,1,0,0,96.0,360.0,1.0,2,3315.0
2,0,1,2,0,0,150.0,360.0,0.0,1,5050.0
3,0,1,1,1,0,113.0,180.0,0.0,1,4153.0
4,0,1,0,0,0,150.0,360.0,1.0,2,4843.0


In [18]:
# Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(test_df)

In [19]:
# Save scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [20]:
X_train[:5]

array([[-0.4025779 , -1.27380065, -0.71329442, -0.47247274, -0.34851718,
        -1.20393954,  0.27173504,  0.57717284, -1.12482547, -0.61618795],
       [-0.4025779 ,  0.7850522 ,  0.35744153, -0.47247274, -0.34851718,
        -0.62683394,  0.27173504,  0.57717284,  1.3089863 , -0.60616044],
       [-0.4025779 ,  0.7850522 ,  1.42817747, -0.47247274, -0.34851718,
         0.05063786,  0.27173504, -1.98048566,  0.09208042, -0.32095153],
       [-0.4025779 ,  0.7850522 ,  0.35744153,  2.11652423, -0.34851718,
        -0.41355578, -2.63940916, -1.98048566,  0.09208042, -0.46840535],
       [-0.4025779 ,  0.7850522 , -0.71329442, -0.47247274, -0.34851718,
         0.05063786,  0.27173504,  0.57717284,  1.3089863 , -0.35497933]])

In [21]:
# Train and Compare Models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier()
}

In [22]:
best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    print(f'{name} Accuracy: {acc:.4f}')
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model

Logistic Regression Accuracy: 0.8211
Random Forest Accuracy: 0.7805
XGBoost Accuracy: 0.8049
[LightGBM] [Info] Number of positive: 337, number of negative: 337
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Number of data points in the train set: 674, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Accuracy: 0.8130


Parameters: { "use_label_encoder" } are not used.



In [23]:
print(best_model)
print(best_accuracy)

LogisticRegression()
0.8211382113821138


In [24]:
# Save Best Model
joblib.dump(best_model, 'best_loan_model.pkl')

['best_loan_model.pkl']

In [25]:
# Load Model & Make Predictions
loaded_model = joblib.load('best_loan_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Inference Function
category_mappings = joblib.load('category_mappings.pkl')

In [26]:
y_test_pred = loaded_model.predict(X_test)
print(f'Predictions on test set: {y_test_pred}')

Predictions on test set: [1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1
 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1
 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0
 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1]


In [27]:
# Evaluate Model Performance
y_valid_pred = loaded_model.predict(X_valid)
print("\nValidation Set Performance:")
print(f'Accuracy: {accuracy_score(y_valid, y_valid_pred):.4f}')
print("Classification Report:")
print(classification_report(y_valid, y_valid_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_valid, y_valid_pred))


Validation Set Performance:
Accuracy: 0.8211
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.63      0.69        38
           1       0.85      0.91      0.88        85

    accuracy                           0.82       123
   macro avg       0.80      0.77      0.78       123
weighted avg       0.82      0.82      0.82       123

Confusion Matrix:
[[24 14]
 [ 8 77]]


## Inferencing

In [43]:
def test_predict_loan_eligibility():
    # Define expected feature order (MUST MATCH training)
    expected_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 
                         'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'TotalIncome']
    
    # Sample input (must match the expected features)
    sample_input = {
        'Gender': 'Female',
        'Married': 'Yes',
        'Dependents': '1',
        'Education': 'Graduate',
        'Self_Employed': 'No',
        'LoanAmount': 12000000,
        'Loan_Amount_Term': 360,
        'Credit_History': 1.0,
        'Property_Area': 'Urban',
        'TotalIncome': 50000000000000  # Instead of ApplicantIncome and CoapplicantIncome
    }

    # Convert to DataFrame
    user_df = pd.DataFrame([sample_input])

    # Ensure all required features exist (avoiding KeyError)
    for col in expected_features:
        if col not in user_df:
            user_df[col] = np.nan  # Add missing columns to maintain order

    # Reorder columns to match training data
    user_df = user_df[expected_features]

    # Load category mappings and encode categorical features
    category_mappings = joblib.load('category_mappings.pkl')
    for col, mapping in category_mappings.items():
        user_df[col] = user_df[col].map(mapping).fillna(-1)  # Assign -1 for unseen values

    # Load scaler and transform input
    loaded_scaler = joblib.load('scaler.pkl')
    user_scaled = loaded_scaler.transform(user_df)

    # Load Model & Predict
    loaded_model = joblib.load('best_loan_model.pkl')
    prediction = loaded_model.predict(user_scaled)

    result = 'Eligible' if prediction[0] == 1 else 'Not Eligible'
    print(f'Loan Prediction: {result}')



In [44]:
# Call the test function
test_predict_loan_eligibility()

Loan Prediction: Not Eligible
