# Loan Approval Prediction Model

Using the given dataset, we have to predict whether the loan will be approved or not.
Dataset: https://www.kaggle.com/altruistdelhite04/loan-prediction-problem-dataset

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline

## Loading dataset

In [None]:
# Read the training CSV from the Kaggle input directory into a DataFrame.
path = "/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"
data= pd.read_csv(path)
# Bare expression so the notebook renders the loaded frame.
data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts, to see which columns have missing values.
data.info()

## Data preprocessing

### Dealing with null values

In [None]:
# Number of missing values in each column before imputation.
data.isnull().sum()

In [None]:
# Bar chart of the number of rows in each Gender category.
sns.countplot(x = 'Gender', data = data)

In [None]:
# Bar chart of the number of rows in each Married category.
sns.countplot(x = 'Married', data = data)

In [None]:
# Bar chart of the number of rows in each Self_Employed category.
sns.countplot(x = 'Self_Employed', data = data)

In [None]:
# Bar chart of the number of rows for each Dependents value.
sns.countplot(x = 'Dependents', data = data)

In [None]:
# Bar chart of the number of rows for each Credit_History value.
sns.countplot(x = 'Credit_History', data = data)

In [None]:
# Bar chart of the number of rows for each distinct Loan_Amount_Term value.
sns.countplot(x = 'Loan_Amount_Term', data = data)

In [None]:
# These columns take only a handful of distinct values (see the count plots),
# so each missing entry is replaced by that column's most frequent value.
mode_filled_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]

for column in mode_filled_columns:
    # mode() returns a Series (ties are possible); [0] takes the first entry.
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)


In [None]:
# Histogram of LoanAmount with 40 bins.
# LoanAmount still contains missing values at this point (they are only filled
# in the next step), so drop them explicitly instead of relying on how the
# installed matplotlib version treats NaN when computing the bin range.
plt.hist(data['LoanAmount'].dropna(), bins=40)

In [None]:
# The LoanAmount histogram shows outliers, so missing amounts are filled with
# the median rather than the mean.
loan_amount_median = data['LoanAmount'].median()
data['LoanAmount'] = data['LoanAmount'].fillna(loan_amount_median)


In [None]:
# Re-check the missing-value counts per column after imputation.
data.isnull().sum()

## Exploratory data analysis

In [None]:
# Encode the target as integers: 'N' -> 0 (rejected), 'Y' -> 1 (approved).
# Assign the result back to the column instead of calling
# data['Loan_Status'].replace(..., inplace=True): that chained in-place call
# operates on a temporary Series and, under pandas Copy-on-Write, no longer
# updates `data` at all.
# Any value other than 'N' / 'Y' becomes NaN, so unexpected labels stay visible.
data['Loan_Status'] = data['Loan_Status'].map({'N': 0, 'Y': 1})

In [None]:
# Number of rows for each Loan_Status value (class balance of the target).
sns.countplot(x = 'Loan_Status', data = data)

In [None]:
# Row counts per Credit_History value, split by Loan_Status.
sns.countplot(x='Credit_History', hue='Loan_Status', data=data)

In [None]:
# Row counts per Gender category, split by Loan_Status.
sns.countplot(x='Gender', hue='Loan_Status', data=data)

In [None]:
# Row counts per Married category, split by Loan_Status.
sns.countplot(x='Married', hue='Loan_Status', data=data)

In [None]:
# Row counts per Self_Employed category, split by Loan_Status.
sns.countplot(x='Self_Employed', hue='Loan_Status', data=data)

In [None]:
# Row counts per Dependents value, split by Loan_Status.
sns.countplot(x='Dependents', hue='Loan_Status', data=data)

In [None]:
# Column dtypes and non-null counts again, now that missing values are filled.
data.info()

In [None]:
# Row counts per Education category, split by Loan_Status.
sns.countplot(x='Education', hue='Loan_Status', data=data)

In [None]:
# Row counts per Property_Area category, split by Loan_Status.
sns.countplot(x='Property_Area', hue='Loan_Status', data=data)

## Feature engineering and removing useless features

In [None]:
# Combine applicant and co-applicant income into a single Total_Income feature;
# the two source columns are dropped further down.
data['Total_Income'] = data['ApplicantIncome'] + data['CoapplicantIncome']


In [None]:
# Loan_ID is removed before modelling (presumably a per-row identifier with no
# predictive value -- confirm against the dataset description).
data = data.drop(columns=["Loan_ID"])

In [None]:
# Distribution of LoanAmount for each Loan_Status value.
plt.figure(figsize=(6,6))
sns.violinplot(x="Loan_Status",y="LoanAmount", data=data)

In [None]:
# Distribution of CoapplicantIncome for each Loan_Status value.
plt.figure(figsize=(6,8))
sns.violinplot(x="Loan_Status",y="CoapplicantIncome", data=data)

In [None]:
# Drop Dependents together with the two raw income columns; the combined
# Total_Income column created earlier stays in the frame.
unused_columns = ["Dependents", "CoapplicantIncome", "ApplicantIncome"]
data = data.drop(columns=unused_columns)

## Labelling Categorical Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical (string) columns that are converted to integer codes.
features = ['Gender',"Married","Education",'Self_Employed',"Property_Area"]

# Keep one fitted encoder per column. Refitting a single shared encoder in the
# loop would overwrite its classes_ each time, so the category -> code mapping
# of every column but the last would be lost and could not be applied to new
# data or inverted later.
encoders = {}
for f in features:
    encode = LabelEncoder()
    data[f] = encode.fit_transform(data[f])
    encoders[f] = encode

In [None]:
# Display the frame after encoding the categorical columns.
data

In [None]:
# Separate the target (Loan_Status) from the predictor columns.
target_column = "Loan_Status"
y = data[target_column]
x = data.drop(columns=[target_column])

## Training the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# random_state pins the shuffle so the reported accuracy is reproducible from
# run to run; stratify=y keeps the class ratio of Loan_Status the same in the
# training and validation parts.
x_train, x_cv, y_train, y_cv = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

# C=0.3 regularises more strongly than scikit-learn's default of C=1.0.
# max_iter is raised from the default of 100 because the features are fed in
# unscaled (income and loan amounts next to 0/1 codes), which makes the solver
# prone to stopping before it has converged.
loreg = LogisticRegression(C=0.3, max_iter=1000)

loreg.fit(x_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# x_cv / y_cv are the held-out 20% produced by train_test_split, so this is the
# validation accuracy (in percent), not the accuracy on the training rows.
print("Validation Accuracy: ", loreg.score(x_cv, y_cv) * 100)

In [None]:
path = "/kaggle/input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv"
testdata = pd.read_csv(path)

# NOTE(review): `testdata` is loaded but never preprocessed or scored. Before
# `loreg` could predict on it, it would need the same imputation, encoding and
# column drops that were applied to `data`. It presumably has no Loan_Status
# column (Kaggle hold-out file), so no accuracy could be computed on it --
# confirm against the CSV.

# Evaluate on the held-out split. Building x_test / y_test from the full `data`
# frame would score the model mostly on the rows it was fitted on and inflate
# the reported "test" accuracy.
x_test = x_cv
y_test = y_cv

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the fitted model on x_test / y_test, expressed in percent.
test_accuracy = loreg.score(x_test, y_test) * 100
print("Test Accuracy:", test_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the model's predictions on x_test:
# rows are the true classes, columns are the predicted classes.
y_pred = loreg.predict(x_test)
conf = confusion_matrix(y_test, y_pred)

# fmt="d" prints the cell counts as plain integers; the default annotation
# format ('.2g') renders counts of 100 or more in scientific notation.
ax = sns.heatmap(conf, annot=True, fmt="d")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

## Observation 

We can predict loan approval with an accuracy of around 80%.