In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [35]:
df=pd.read_excel(r'D:\training\loan\loan_ml.xlsx')
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Equated_Monthly_installment,Balance_Income,Total_Income_log,LoanAmount__log
0,Male,No,0,Graduate,No,5849,0,360,1,Urban,1,0.41,5442.3,8.674026,4.986411
1,Male,Yes,1,Graduate,No,4583,1508,360,1,Rural,0,0.36,5735.44,8.714568,4.85203
2,Male,Yes,0,Graduate,Yes,3000,0,360,1,Urban,1,0.18,2816.67,8.006368,4.189655
3,Male,Yes,0,Not Graduate,No,2583,2358,360,1,Urban,1,0.33,4607.67,8.505323,4.787492
4,Male,No,0,Graduate,No,6000,0,360,1,Urban,1,0.39,5608.33,8.699515,4.94876


-----------------------------------------------------------------------------------------------------------------------------

### Piplines

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from category_encoders import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [37]:
obj_features=df.select_dtypes(include='O')
for feature in obj_features:
    print(f"Feature {feature} has {obj_features[feature].nunique()} unique values ")

Feature Gender has 2 unique values 
Feature Married has 2 unique values 
Feature Dependents has 4 unique values 
Feature Education has 2 unique values 
Feature Self_Employed has 2 unique values 
Feature Property_Area has 3 unique values 


In [38]:
Encoder = ColumnTransformer(transformers=[
    ('OHE', OneHotEncoder(), ['Gender', 'Married', 'Dependents','Education', 'Self_Employed', 'Property_Area'])], remainder='passthrough')

In [39]:
steps=[]
steps.append(('Encoder',Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("Model" , LogisticRegression()))
pipeline = Pipeline(steps=steps)
pipeline

In [40]:
X=df.drop('Loan_Status',axis=1)
y=df['Loan_Status']

In [10]:
results = cross_validate(pipeline , X ,y , cv = 5 , scoring="f1" , return_train_score=True)
results

{'fit_time': array([7.97156763, 6.12556911, 0.28118348, 1.54650855, 0.26556087]),
 'score_time': array([0.06699252, 0.04686427, 0.04686379, 0.06248617, 0.06248689]),
 'test_score': array([0.87958115, 0.86010363, 0.85863874, 0.90217391, 0.86486486]),
 'train_score': array([0.87798408, 0.88502674, 0.88328912, 0.87253614, 0.87649402])}

In [11]:
results["train_score"].mean()

0.8790660216167309

In [12]:
results["test_score"].mean()

0.8730724600278613

In [13]:
models = list()
models.append(("LR" , LogisticRegression()))
models.append(("KNN" , KNeighborsClassifier()))
models.append(("CART" , DecisionTreeClassifier()))
models.append(("RF" , RandomForestClassifier()))
models.append(("xg" , XGBClassifier()))
models.append(("SVC" , SVC()))
models.append(("Naive Bayes" , GaussianNB()))

In [14]:
for model in models:
    steps = []
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(pipeline ,X ,y , cv = 5 , scoring="f1" , return_train_score=True)
    print(model[0])
    print("Train_accuracy" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_accuracy" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")
    

LR
Train_accuracy 0.8790660216167309
----------
Test_accuracy 0.8730724600278613
--------------------


KNN
Train_accuracy 0.8615093365333542
----------
Test_accuracy 0.803062820281726
--------------------


CART
Train_accuracy 1.0
----------
Test_accuracy 0.7874264281701654
--------------------


RF
Train_accuracy 1.0
----------
Test_accuracy 0.852590474653951
--------------------


xg
Train_accuracy 1.0
----------
Test_accuracy 0.8417945190758077
--------------------


SVC
Train_accuracy 0.8156279952013993
----------
Test_accuracy 0.8116851421723258
--------------------


Naive Bayes
Train_accuracy 0.8686666102883864
----------
Test_accuracy 0.8626145051995161
--------------------




In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Define your data X and y

# Perform oversampling to balance the dataset
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

Encoder = ColumnTransformer(transformers=[
    ('OHE', OneHotEncoder(sparse=False , drop="first"), ['Gender', 'Married', 'Dependents','Education', 'Self_Employed', 'Property_Area'])], remainder='passthrough')

params = {
    'model__n_neighbors': [3, 5, 7,10,15,20],
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'model__p': [1, 2],
    'model__leaf_size': [20, 30, 40]
}

steps = []
steps.append(("encoder", Encoder))
steps.append(("scaler", RobustScaler()))
steps.append(("model", KNeighborsClassifier()))

pipeline = Pipeline(steps=steps)

# Apply GridSearchCV on the transformed data
grid_search = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring="f1", return_train_score=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# Evaluate the model on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Best Parameters:  {'model__algorithm': 'auto', 'model__leaf_size': 20, 'model__n_neighbors': 20, 'model__p': 1, 'model__weights': 'distance'}
Best Score:  0.83752151426166
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86        94
           1       0.85      0.80      0.82        75

    accuracy                           0.85       169
   macro avg       0.85      0.84      0.84       169
weighted avg       0.85      0.85      0.85       169



In [63]:
final_model = grid_search.best_estimator_

In [65]:
import joblib

joblib.dump(final_model,'loan_model.pkl')
joblib.dump(X.columns,'inputs_model.pkl')

['inputs_model.pkl']

In [67]:
%%writefile loan_app_3rd.py
import streamlit as st
import pandas as pd
import joblib

Inputs = joblib.load("loan_model.pkl")
Model = joblib.load("loan_model.pkl")


def prediction(Gender, Married, Dependents, Education, Self_Employed,ApplicantIncome, CoapplicantIncome, Loan_Amount_Term,Credit_History, Property_Area,Equated_Monthly_installment, Balance_Income,Total_Income_log,LoanAmount__log):
    test_df = pd.DataFrame(columns=Inputs)
    test_df.at[0, "Gender"] = Gender
    test_df.at[0, "Married"] = Married
    test_df.at[0, "Dependents"] = Dependents
    test_df.at[0, "Education"] = Education
    test_df.at[0, "Self_Employed"] = Self_Employed
    test_df.at[0, "ApplicantIncome"] = ApplicantIncome
    test_df.at[0, "CoapplicantIncome"] = CoapplicantIncome
    test_df.at[0, "Loan_Amount_Term"] = Loan_Amount_Term
    test_df.at[0, "Credit_History"] = Credit_History
    test_df.at[0, "Property_Area"] = Property_Area
    test_df.at[0, "Equated_Monthly_installment"] = Equated_Monthly_installment
    test_df.at[0, "Balance_Income"] = Balance_Income
    test_df.at[0, "Total_Income_log"] = Total_Income_log
    test_df.at[0, "LoanAmount__log"] = LoanAmount__log
    st.dataframe(test_df)
    result = Model.predict(test_df)[0]
    return result

def main():
    st.title("loan Approval Prediction")
    Gender = st.selectbox("Gender", ['Male','Female'])
    Married = st.selectbox("Married", ['Yes','No'])
    Education = st.selectbox("Education", ['Graduate', 'Not Graduate'])
    Dependents = st.slider("Dependents", min_value=0, max_value=3, value=0, step=1)
    ApplicantIncome = st.slider("ApplicantIncome", min_value=0, max_value=81000, value=0, step=100)
    CoapplicantIncome = st.slider("CoapplicantIncome", min_value=0, max_value=42000, value=0, step=100)
    LoanAmount__log = st.slider("LoanAmount__log", min_value=1, max_value=7, value=0, step=1)
    Total_Income_log=st.slider("Total_Income_log", min_value=5, max_value=12, value=0, step=1)
    Equated_Monthly_installment=st.slider("Equated_Monthly_installment", min_value=0, max_value=10, value=0, step=1)
    Balance_Income=st.slider("Balance_Income", min_value=-1768, max_value=80000, value=0, step=5)
    Loan_Amount_Term = st.slider("Loan_Amount_Term", min_value=12, max_value=480, value=0, step=12)
    Self_Employed = st.selectbox("Self_Employed", ['Yes','No'])
    Credit_History = st.selectbox("Credit_History", [0,1])
    Property_Area = st.selectbox("Property_Area", ['Urban', 'Rural', 'Semiurban'])
    
    
    if st.button("Predict"):
        result = prediction(Gender, Married, Dependents, Education, Self_Employed,ApplicantIncome, CoapplicantIncome, Loan_Amount_Term,
        Credit_History, Property_Area,Equated_Monthly_installment, Balance_Income, Total_Income_log,LoanAmount__log)
        label = ["accepted","unaccepted"]
        st.text(f"The loan is {label[result]}")
if __name__ == '__main__':
    main()

Overwriting loan_app_3rd.py
