In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Extract IDs for submission
test_ids = test['id']

# Prepare Data
def preprocess_data(df):
    df = df.copy()
    
    # Label Encode categorical features (for training purposes)
    cat_features = ['person_home_ownership', 'loan_intent', 'loan_grade']
    for col in cat_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    
    # Label encode 'cb_person_default_on_file' since it is categorical (it is still a string column)
    le_default = LabelEncoder()
    df['cb_person_default_on_file'] = le_default.fit_transform(df['cb_person_default_on_file'])
    
    # Fill missing values with a specific value (for instance, -999)
    df.fillna(-999, inplace=True)
    
    return df

train = preprocess_data(train)
test = preprocess_data(test)

# One-hot encode categorical features using pandas get_dummies
cat_columns = ['person_home_ownership', 'loan_intent', 'loan_grade']

# One-Hot Encoding (drop_first=True to avoid multicollinearity)
train = pd.get_dummies(train, columns=cat_columns, drop_first=True)
test = pd.get_dummies(test, columns=cat_columns, drop_first=True)

# Ensure both train and test have the same columns after one-hot encoding
#train, test = train.align(test, join='left', axis=1)

# Fill any NaN values created by alignment (in case some categories are missing in the test set)
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# Drop 'id' and target column 'loan_status' from the training set
X = train.drop(columns=['id', 'loan_status'])
y = train['loan_status']
X_test = test.drop(columns=['id'])  # No 'loan_status' column in the test set

# Standardize numeric features (after one-hot encoding)
scaler = StandardScaler()
X = scaler.fit_transform(X)  # Apply scaling to training data
X_test = scaler.transform(X_test)  # Apply the same scaling to test data

# Cross-Validation
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Models
models = {
    'LightGBM': LGBMClassifier(n_estimators=1000, learning_rate=0.01, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=1000, learning_rate=0.01, eval_metric='logloss', random_state=42),
    'CatBoost': CatBoostClassifier(iterations=1000, learning_rate=0.01, verbose=0, random_state=42)
}

# Training and Prediction
predictions = np.zeros((X_test.shape[0], len(models)))
meta_features = np.zeros((X.shape[0], len(models)))

for i, (model_name, model) in enumerate(models.items()):
    print(f'Training {model_name}...')
    fold_predictions = np.zeros(X_test.shape[0])
    fold_meta_features = np.zeros(X.shape[0])
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f'Fold {fold+1}/{n_splits}')
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        if model_name == 'LightGBM':
            # For LightGBM, apply early stopping properly
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        else:
            model.fit(X_train, y_train)
        
        fold_meta_features[val_idx] = model.predict_proba(X_val)[:, 1]
        fold_predictions += model.predict_proba(X_test)[:, 1] / n_splits
    
    meta_features[:, i] = fold_meta_features
    predictions[:, i] = fold_predictions
    print(f'{model_name} CV AUC: {roc_auc_score(y, fold_meta_features):.4f}')

# Train a meta-model on the stacked features
meta_model = CatBoostClassifier(iterations=1000, learning_rate=0.01, random_state=42)
meta_model.fit(meta_features, y)
meta_predictions = meta_model.predict_proba(predictions)[:, 1]

# Final Predictions
submission = pd.DataFrame({'id': test_ids, 'loan_status': meta_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Training LightGBM...
Fold 1/5
[LightGBM] [Info] Number of positive: 6680, number of negative: 40236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 910
[LightGBM] [Info] Number of data points in the train set: 46916, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142382 -> initscore=-1.795644
[LightGBM] [Info] Start training from score -1.795644
Fold 2/5
[LightGBM] [Info] Number of positive: 6680, number of negative: 40236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 909
[LightGBM] [Info] Number of data points in the train set: 46916, number of u

Parameters: { "use_label_encoder" } are not used.



Fold 2/5


Parameters: { "use_label_encoder" } are not used.



Fold 3/5


Parameters: { "use_label_encoder" } are not used.



Fold 4/5


Parameters: { "use_label_encoder" } are not used.



Fold 5/5


Parameters: { "use_label_encoder" } are not used.



XGBoost CV AUC: 0.9519
Training CatBoost...
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
CatBoost CV AUC: 0.9427
0:	learn: 0.6753381	total: 10.6ms	remaining: 10.6s
1:	learn: 0.6576542	total: 20.7ms	remaining: 10.3s
2:	learn: 0.6411876	total: 31.5ms	remaining: 10.5s
3:	learn: 0.6256346	total: 42.3ms	remaining: 10.5s
4:	learn: 0.6100254	total: 52.6ms	remaining: 10.5s
5:	learn: 0.5960987	total: 64.6ms	remaining: 10.7s
6:	learn: 0.5815160	total: 76.1ms	remaining: 10.8s
7:	learn: 0.5675095	total: 88ms	remaining: 10.9s
8:	learn: 0.5542823	total: 101ms	remaining: 11.1s
9:	learn: 0.5418066	total: 112ms	remaining: 11.1s
10:	learn: 0.5290206	total: 127ms	remaining: 11.4s
11:	learn: 0.5167922	total: 139ms	remaining: 11.4s
12:	learn: 0.5048366	total: 151ms	remaining: 11.4s
13:	learn: 0.4932757	total: 162ms	remaining: 11.4s
14:	learn: 0.4822154	total: 174ms	remaining: 11.4s
15:	learn: 0.4715410	total: 189ms	remaining: 11.6s
16:	learn: 0.4609051	total: 203ms	remaining: 11.8s
17:	learn: 0.4513749	tot

In [2]:
import joblib

# After training the model
joblib.dump(meta_model, 'loan_model.pkl')  # Save the trained model

['loan_model.pkl']

In [26]:
import tkinter as tk
from tkinter import messagebox
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load the trained model
model = joblib.load('loan_model.pkl')

# Function to preprocess user input data
def preprocess_user_input(data):
    # Create a DataFrame from the input data
    df = pd.DataFrame([data])
    #print(df.head())
    # Label Encoding categorical columns
    #cat_columns = ['person_home_ownership', 'loan_intent', 'loan_grade']
    #for col in cat_columns:
        #le = LabelEncoder()
        #df[col] = le.fit_transform(df[col])

    # Standardize the data
    #scaler = StandardScaler()
    #df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    print(df.head())
    return df

# Function to predict loan eligibility
def predict_loan():
    try:
        # Collecting the data from GUI inputs
        person_age = int(age_entry.get())
        person_income = int(income_entry.get())
        person_emp_length = float(emp_length_entry.get())
        loan_amnt = float(loan_amnt_entry.get())
        loan_int_rate = float(loan_int_rate_entry.get())
        loan_percent_income = float(loan_percent_income_entry.get())
        home_ownership = home_ownership_var.get()
        loan_intent = loan_intent_var.get()
        loan_grade = loan_grade_var.get()

        # Create the input data dictionary
        input_data = {
            'person_age': person_age,
            'person_income': person_income,
            'person_emp_length': person_emp_length,
            'loan_amnt': loan_amnt,
            'loan_int_rate': loan_int_rate,
            'loan_percent_income': loan_percent_income,
            'person_home_ownership': home_ownership,
            'loan_intent': loan_intent,
            'loan_grade': loan_grade,
        }

        # Preprocess the data
        preprocessed_data = preprocess_user_input(input_data)

        # Make the prediction
        prediction = model.predict(preprocessed_data)[0]

        # Display result
        if prediction > 0.1:
            messagebox.showinfo("Loan Status", "Congratulations! You are eligible for the loan.")
        else:
            messagebox.showinfo("Loan Status", "Sorry, you are not eligible for the loan. Prediction: "+str(prediction))
    except ValueError:
        messagebox.showerror("Input Error", "Please enter valid numerical values.")

# Create main window
root = tk.Tk()
root.title("Loan Eligibility Prediction")

# Create and place labels and entry widgets
tk.Label(root, text="Age:").grid(row=0, column=0)
age_entry = tk.Entry(root)
age_entry.grid(row=0, column=1)

tk.Label(root, text="Income:").grid(row=1, column=0)
income_entry = tk.Entry(root)
income_entry.grid(row=1, column=1)

tk.Label(root, text="Employment Length (in years):").grid(row=2, column=0)
emp_length_entry = tk.Entry(root)
emp_length_entry.grid(row=2, column=1)

tk.Label(root, text="Loan Amount:").grid(row=3, column=0)
loan_amnt_entry = tk.Entry(root)
loan_amnt_entry.grid(row=3, column=1)

tk.Label(root, text="Loan Interest Rate (%):").grid(row=4, column=0)
loan_int_rate_entry = tk.Entry(root)
loan_int_rate_entry.grid(row=4, column=1)

tk.Label(root, text="Loan Percent of Income:").grid(row=5, column=0)
loan_percent_income_entry = tk.Entry(root)
loan_percent_income_entry.grid(row=5, column=1)

tk.Label(root, text="Home Ownership:").grid(row=6, column=0)
home_ownership_var = tk.StringVar()
home_ownership_var.set("1")  # Default value
home_ownership_menu = tk.OptionMenu(root, home_ownership_var, "1", "2", "3")
home_ownership_menu.grid(row=6, column=1)

tk.Label(root, text="Loan Intent:").grid(row=7, column=0)
loan_intent_var = tk.StringVar()
loan_intent_var.set("1")  # Default value
loan_intent_menu = tk.OptionMenu(root, loan_intent_var, "1", "2", "3", "4", "5")
loan_intent_menu.grid(row=7, column=1)

tk.Label(root, text="Loan Grade:").grid(row=8, column=0)
loan_grade_var = tk.StringVar()
loan_grade_var.set("1")  # Default value
loan_grade_menu = tk.OptionMenu(root, loan_grade_var, "1", "2", "3", "4", "5", "6")
loan_grade_menu.grid(row=8, column=1)

# Prediction button
predict_button = tk.Button(root, text="Check Loan Eligibility", command=predict_loan)
predict_button.grid(row=9, column=0, columnspan=2)

# Run the GUI
root.mainloop()
