# Home Loan Approval Indicator 

### Life cycle of Machine learning Project
* Understanding the Problem Statement
* Data Collection
* Data Checks to perform
* Exploratory data analysis
* Data Pre-Processing
* Model Training
* Choose best model

#### Problem statement

This project understands how the Home loan approval is affected by other variables such as gender, marital status, number of dependents, income, loan amount, credit history, etc.

## Data Collection

* Dataset Source - https://www.kaggle.com/datasets/rishikeshkonapure/home-loan-approval

### Import Data and Required Packages

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#### Import the data

In [3]:
df = pd.read_csv("data\loan_sanction_train.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

In [9]:
numerical_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
categorical_columns = ["Gender","Married", "Dependents", "Education", "Self_Employed", "Property_Area"]

numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

            
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse=False)),
        ("scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
        ("numerical_pipelines", numerical_pipeline, numerical_columns),
        ("categorical_pipelines", categorical_pipeline, categorical_columns)
])    

In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.1, random_state=33)

In [11]:
train_df["Loan_Status"]

445    Y
601    Y
546    N
536    Y
405    Y
      ..
57     N
201    Y
578    Y
391    Y
20     N
Name: Loan_Status, Length: 552, dtype: object

In [28]:
target_column = "Loan_Status"

X_train_df = train_df.drop(columns=[target_column, "Loan_ID"], axis=1)
y_train_df = train_df[target_column]

X_test_df = test_df.drop(columns=[target_column, "Loan_ID"], axis=1)
y_test_df = test_df[target_column]

In [20]:
X_train_arr = preprocessor.fit_transform(X_train_df)
X_test_arr = preprocessor.fit_transform(X_test_df)

In [21]:
y_train_arr = np.array((y_train_df == 'Y').astype(int))

In [22]:
train_arr = np.c_[X_train_arr, y_train_arr]

In [23]:
train_arr = np.column_stack([X_train_arr, y_train_arr])

In [24]:
train_arr.shape

(552, 21)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
models = {"Logistic Regression" : LogisticRegression(),
                      "Support Vector Machine" : LinearSVC(),
                      "K-Neighbors Classifier" : KNeighborsClassifier(),
                      "Decision Tree Classifier" : DecisionTreeClassifier(),
                      "Gradient Boosting Classifier" : GradientBoostingClassifier(),
                      "Random Forest Classifier" : RandomForestClassifier(),
                      "XGBoost Classifier" : XGBClassifier(),
                      "CatBoost Classifier" : CatBoostClassifier()
                      }

In [6]:
for model in list(models):
    print(models[model])

LogisticRegression()
LinearSVC()
KNeighborsClassifier()
DecisionTreeClassifier()
GradientBoostingClassifier()
RandomForestClassifier()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
<catboost.core.CatBoostClassifier object at 0x00000221BCB0DE80>
