
Loan Prediction System is software that checks whether a particular
customer is eligible for a loan, i.e. whether the customer is capable of
repaying it. The system considers parameters such as the customer's
marital status, income, expenditure and various other factors. These
parameters are collected for the many customers in the training data set,
and a model is built from them. The model is then applied to the test
data set to produce the required output, which takes the form of yes or
no: yes indicates that a particular customer is capable of repaying the
loan, and no indicates that the customer is not. Based on this output we
can decide whether to approve loans for customers.


### Assumptions
- Independence of Applications: This means that the decision for one loan application does not influence the decision for another. 
- Normal Distribution: Some statistical models assume that variables related to loan approval (e.g., income, credit scores) follow a normal distribution.
- Independence of Variables: Many statistical models assume that predictor variables (e.g., income, credit score) are independent of each other. In reality, there may be correlations between these variables (e.g., income and education level), and you should be aware of multicollinearity issues.
- Binary Outcome: Loan approval models often assume that the outcome variable is binary (approved or denied).
- Equal Loan Performance: Some models assume that loans with similar characteristics will have similar performance (e.g., default rates). However, loan performance can vary even among loans with similar attributes.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import  metrics
from miditk.smf import MidiFileWriter

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
loan=pd.read_csv("loan.csv")


In [3]:
loan.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [4]:
loan.shape

(614, 13)

In [5]:
loan.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
loan=loan.drop("Loan_ID",axis=1)


In [7]:
loan.shape

(614, 12)

In [8]:
# List the distinct values (NaN included) of every object-typed column,
# one column per section, each followed by a dashed separator line.
separator = '--' * 40
for column in loan.describe(include=object).columns:
    print(column, loan[column].unique(), separator, sep='\n')

Gender
['Male' 'Female' nan]
--------------------------------------------------------------------------------
Married
['No' 'Yes' nan]
--------------------------------------------------------------------------------
Dependents
['0' '1' '2' '3+' nan]
--------------------------------------------------------------------------------
Education
['Graduate' 'Not Graduate']
--------------------------------------------------------------------------------
Self_Employed
['No' 'Yes' nan]
--------------------------------------------------------------------------------
Property_Area
['Urban' 'Rural' 'Semiurban']
--------------------------------------------------------------------------------
Loan_Status
['Y' 'N']
--------------------------------------------------------------------------------


In [9]:

loan.dtypes[loan.dtypes!=object].index

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [10]:
loan=loan.drop_duplicates() # drop duplicates entry
loan.shape

(614, 12)

In [11]:
loan.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
loan[loan.isna().any(axis=1)] # all missing vlaue in my data set
# loan.shape


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
11,Male,Yes,2,Graduate,,2500,1840.0,109.0,360.0,1.0,Urban,Y
16,Male,No,1,Not Graduate,No,3596,0.0,100.0,240.0,,Urban,Y
19,Male,Yes,0,Graduate,,2600,3500.0,115.0,,1.0,Urban,Y
23,,Yes,2,Not Graduate,No,3365,1917.0,112.0,360.0,0.0,Rural,N
...,...,...,...,...,...,...,...,...,...,...,...,...
592,,No,3+,Graduate,Yes,9357,0.0,292.0,360.0,1.0,Semiurban,Y
597,Male,No,,Graduate,No,2987,0.0,88.0,360.0,0.0,Semiurban,N
600,Female,No,3+,Graduate,,416,41667.0,350.0,180.0,,Urban,N
601,Male,Yes,0,Not Graduate,,2894,2792.0,155.0,360.0,1.0,Rural,Y


In [13]:
loan[loan["Married"].isna()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
104,Male,,,Graduate,No,3816,754.0,160.0,360.0,1.0,Urban,Y
228,Male,,,Graduate,No,4758,0.0,158.0,480.0,1.0,Semiurban,Y
435,Female,,,Graduate,No,10047,0.0,,240.0,1.0,Semiurban,Y


In [24]:
# Profile applicants who are unmarried, graduates and not self-employed, to
# judge whether the few missing "Married" values could be filled with "No".
s = loan[["Married", "Education", "Self_Employed", "ApplicantIncome", "LoanAmount"]]

# The original also called s.sort_values(by="Married") here, but sort_values
# is not in-place and the result was discarded, so that call had no effect.
subset = s[
    (s["Married"] == "No")
    & (s["Education"] == "Graduate")
    & (s["Self_Employed"] == "No")
]

# Candidate imputation, intentionally left disabled:
# loan['Married'] = loan['Married'].fillna('No')

# Non-null count per column of the profiled subset (this is the cell output).
subset.count()

Married            136
Education          136
Self_Employed      136
ApplicantIncome    136
LoanAmount         133
dtype: int64

In [None]:
# Bar chart of how many applicants fall into each Gender category.
# (A stray trailing "5" made the original line a syntax error.)
sns.countplot(x="Gender", data=loan)

In [None]:
loan[loan["Gender"].isna()] # most of female is not married,with zero coapplicant income no credit history with educated
# means these female applying first time of loan about more than 95 percent of our has this property

In [None]:
# Impute missing Gender from marital status: married applicants with no
# recorded gender are set to "Male", every other missing entry to "Female".
# Work on an explicit copy so the writes below land on `s` itself rather than
# on a temporary slice of `loan`.
s = loan[["Gender", "Married"]].copy()
mask = s["Gender"].isnull() & (s["Married"] == "Yes")
s.loc[mask, "Gender"] = "Male"
# Assign the filled column back: fillna(inplace=True) on a selected column is
# chained assignment and does not update the frame under copy-on-write.
s["Gender"] = s["Gender"].fillna("Female")
    
# subset = s[(s['Married'] == "No") & ((s['Education'] == 'Graduate') )& ((s['CoapplicantIncome'] ==0) )& ((s['Credit_History'] ==1) )]
# subset.sort_values(by="Gender",ascending=True).head(50)

In [None]:

loan["Gender"]=s["Gender"]
loan["Married"]=s["Married"]
loan.isnull().sum()

In [None]:
loan[loan["Credit_History"].isna()]
# sns.countplot( x="Credit_History",data=loan)

In [None]:
sns.countplot( x="Credit_History",data=loan)

In [None]:
# s=loan[["Dependents","Self_Employed","LoanAmount"]]
sns.countplot(x="Self_Employed",data=loan)

In [None]:
# Recode the open-ended "3+" dependents bucket as 4 and make the column numeric.
# Replacing "3+" with the integer 4 across the whole frame left Dependents as
# an object column mixing the strings '0'/'1'/'2' with the int 4; converting
# the single column keeps one consistent numeric dtype (NaN stays NaN).
loan["Dependents"] = pd.to_numeric(loan["Dependents"].replace("3+", "4"))

In [None]:
# Fill the gaps in Credit_History and Loan_Amount_Term with each column's
# most frequent value, and the gaps in Self_Employed with "No".
# Series.mode() returns a Series (several rows when values tie), so take the
# first entry explicitly; calling int() directly on a Series is deprecated and
# raises when the mode is not unique.
a = int(loan["Credit_History"].mode().iloc[0])
b = int(loan["Loan_Amount_Term"].mode().iloc[0])
c = loan["Self_Employed"].mode()

# Printed so the hard-coded "No" used below can be compared against the mode.
print(c)

# Assign the filled columns back instead of fillna(inplace=True) on a selected
# column, which is chained assignment and a no-op under copy-on-write.
loan["Credit_History"] = loan["Credit_History"].fillna(a)
loan["Loan_Amount_Term"] = loan["Loan_Amount_Term"].fillna(b)
loan["Self_Employed"] = loan["Self_Employed"].fillna("No")

# Remaining missing values per column.
loan.isnull().sum()

In [None]:
# iterative imputation

sns.scatterplot(x='LoanAmount',y='ApplicantIncome',data=loan,hue=("Dependents"))


In [None]:
loan["Dependents"].value_counts()

In [None]:

sns.pairplot(loan,hue='Loan_Status',palette='viridis')

In [None]:
pd.crosstab(loan["Credit_History"],loan["Loan_Status"],margins=True)

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(loan)
# sns.boxplot(x='parental level of education',y='math score',data=df,hue='gender',width=0.3)

In [None]:
loan["ApplicantIncome"].hist()

In [None]:



loan["CoapplicantIncome"].hist()





In [None]:
loan.boxplot(column='ApplicantIncome',by='Education')

In [None]:
loan.boxplot(column='LoanAmount')


In [None]:
loan['LoanAmount'].hist()

In [None]:
loan["loanAmount_log"]=np.log(loan["LoanAmount"])
loan["loanAmount_log"].hist()

In [None]:
loan.isnull().sum()

In [None]:
loan["Dependents"].count()
# sns.countplot(x="Dependents",data=loan)
a=loan["Dependents"].fillna(0).astype(int)
a.mean()


In [None]:
loan["Totalincome"]=loan["ApplicantIncome"]+loan["CoapplicantIncome"]
loan["Totalincome_log"]=np.log(loan["Totalincome"])

In [None]:
loan["Totalincome_log"].hist()

In [None]:
# One count plot per categorical feature, each split by loan outcome, placed
# in the first five cells of a 3x3 grid on a single large figure.
plt.figure(figsize=(100, 50))
sns.set(font_scale=5)

categorical_features = ("Gender", "Married", "Education", "Self_Employed", "Property_Area")
for slot, feature in enumerate(categorical_features, start=1):
    plt.subplot(3, 3, slot)
    sns.countplot(x=loan[feature], hue=loan["Loan_Status"])



In [None]:
# Label-encode the categorical columns in place: each column name maps to a
# {category: integer code} table that DataFrame.replace applies to that column.
label_codes = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Loan_Status': {'N': 0, 'Y': 1},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
}
loan.replace(label_codes, inplace=True)

In [None]:
loan

In [None]:
fig,ax=plt.subplots(figsize=(55,38))
sns.heatmap(data=loan.corr().round(2),annot=True)
plt.show()

In [None]:
loan.columns

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression



In [None]:
linear_reg=LinearRegression()
imp = IterativeImputer(estimator=linear_reg,missing_values=np.nan,max_iter=5,verbose=2,random_state=0)


In [None]:
# Fill every remaining NaN in `loan` with the IterativeImputer (`imp`) that
# was configured with a LinearRegression estimator in the previous cell.
# (A KNNImputer-based variant was explored here in commented-out code.)
# NOTE(review): Loan_Status is still a column of `loan` at this point, so the
# target takes part in imputing the features - consider excluding it to avoid
# leaking the label into the predictors.
imputed_data = imp.fit_transform(loan)

# Wrap the imputed values in a DataFrame that keeps the column labels and row
# index of `loan`, so the columns cannot be mislabelled later; a length
# mismatch raises immediately instead of passing silently.
loan_st = pd.DataFrame(imputed_data, columns=loan.columns, index=loan.index)


In [None]:
loan_st

In [None]:
# loan_st was built with pd.DataFrame(imputed_data) and no column labels, so
# name its columns here. The list is positional: it must follow the column
# order `loan` had when it was passed to imp.fit_transform.
new_column_names=['Gender', 'Married', 'Dependents', 'Education','Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome','LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'loanAmount_log', 'Totalincome', 'Totalincome_log']
loan_st.columns = new_column_names

In [None]:
# Separate the imputed frame into the target vector (y) and the feature
# matrix (X, every column except the target).
target_column = 'Loan_Status'
y = loan_st[target_column]
X = loan_st.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# classify function
from sklearn.model_selection import cross_val_score
def classify(model, x, y, test_size=0.3, random_state=42, cv=5):
    """Fit ``model`` on a hold-out split of ``x``/``y`` and print two accuracy figures.

    The model is fitted on the training part of a single train/test split and
    scored on the held-out part; it is then scored again with k-fold
    cross-validation over the full data. After the call ``model`` remains
    fitted on the training part of the split.

    Args:
        model: Estimator providing ``fit`` and ``score``, usable with
            ``cross_val_score``.
        x: Feature matrix.
        y: Target values aligned with ``x``.
        test_size: Fraction of rows held out for the accuracy figure.
        random_state: Seed making the train/test split reproducible.
        cv: Number of cross-validation folds.

    Returns:
        None. Both scores are printed as percentages.
    """
    # Split the data that was passed in. The original split the notebook-level
    # ``X`` here and silently ignored the ``x`` argument.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)

    # Cross-validation gives a less split-dependent estimate, e.g. with cv=5
    # each round trains on four folds and tests on the remaining one.
    score = cross_val_score(model, x, y, cv=cv)
    print("Cross validation is",np.mean(score)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

# Confusion Matrix
A confusion matrix is a summary of prediction results on a classification problem. The numbers of correct and incorrect predictions are summarized as counts and broken down by class. It gives us insight not only into the errors being made by a classifier but, more importantly, into the types of errors that are being made.

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:

sns.heatmap(cm, annot=True)

In [None]:
# Logistic fit of Loan_Status against Credit_History. The title now names the
# plotted feature (it previously read "pregnancies", copied from elsewhere).
gre = sns.regplot(x= 'Credit_History', y= 'Loan_Status', data= loan_st, logistic= True).set_title("Credit_History Log Odds Linear Plot")

In [None]:
# Logistic fit of Loan_Status against ApplicantIncome. The title now names the
# plotted feature (it previously read "pregnancies", copied from elsewhere).
gre = sns.regplot(x= 'ApplicantIncome', y= 'Loan_Status', data= loan_st, logistic= True).set_title("ApplicantIncome Log Odds Linear Plot")

In [None]:
# Logistic fit of Loan_Status against LoanAmount. The title now names the
# plotted feature (it previously read "pregnancies", copied from elsewhere).
gre = sns.regplot(x= 'LoanAmount', y= 'Loan_Status', data= loan_st, logistic= True).set_title("LoanAmount Log Odds Linear Plot")

In [None]:
# Logistic fit of Loan_Status against CoapplicantIncome. The title now names
# the plotted feature (it previously read "pregnancies", copied from elsewhere).
gre = sns.regplot(x= 'CoapplicantIncome', y= 'Loan_Status', data= loan_st, logistic= True).set_title("CoapplicantIncome Log Odds Linear Plot")

In [None]:
# Logistic fit of Loan_Status against Loan_Amount_Term. The title now names
# the plotted feature (it previously read "pregnancies", copied from elsewhere).
gre = sns.regplot(x= 'Loan_Amount_Term', y= 'Loan_Status', data= loan_st, logistic= True).set_title("Loan_Amount_Term Log Odds Linear Plot")

In [None]:
loan_st