In [144]:
import numpy as np
import pandas as pd
from scipy.stats import mode 



```
Feature	Description
Loan_ID	Unique Loan ID
Gender	Male/Female
Married	Applicant Married (Y/N)
Dependents	Number of dependents
Education	Graduate/Not Graduate
Self_Employed	Self employed (Y/N)
ApplicantIncome	Income of the applicant
CoapplicantIncome	Income of the co-applicant
LoanAmount	Loan amount in thousands
Loan_Amount_Term	Term of loan in months
Credit_History	Credit history meets guidelines
Property_Area	Urban/Semiurban/Rural
Loan_Status	Loan approved (Y/N)
```



In [145]:
# Load the loan-application data from a path relative to the notebook's working
# directory, then preview the first ten rows.
df =  pd.read_csv('sample_data/loan_analysis.csv')
df.head(10)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [146]:
# Summarise column dtypes and non-null counts to see which features have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [147]:
# Keep only the object-dtype (text) columns and preview three rows.
categorical_var = df.select_dtypes(include=['object'])
categorical_var.head(n=3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,Urban,Y


In [148]:
# Keep only the numeric columns (np.number is what the 'number' alias resolves to)
# and preview three rows.
numerical_var = df.select_dtypes(include=np.number)
numerical_var.head(n=3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849,0.0,,360.0,1.0
1,4583,1508.0,128.0,360.0,1.0
2,3000,0.0,66.0,360.0,1.0


In [149]:
# List the distinct values (NaN included) held by each categorical feature.
# Each pair is (label to print, column to inspect).
for label, column in [
    ('Property area', 'Property_Area'),
    ('Loan_Status', 'Loan_Status'),
    ('Self_Employed', 'Self_Employed'),
    ('Education', 'Education'),
    ('Gender', 'Gender'),
]:
    print(f'{label} ->  {df[column].unique()}')


Property area ->  ['Urban' 'Rural' 'Semiurban']
Loan_Status ->  ['Y' 'N']
Self_Employed ->  ['No' 'Yes' nan]
Education ->  ['Graduate' 'Not Graduate']
Gender ->  ['Male' 'Female' nan]


In [None]:
# Drop the Loan_ID column (a unique per-loan identifier) into a new frame,
# leaving `df` untouched, then count the missing values in each remaining column.
banks = df.drop(columns = 'Loan_ID')
banks.isnull().sum()

In [187]:
# Most frequent value(s) of every column; row 0 holds the first mode of each column.
# NOTE(review): `bank_mode` is not referenced by any other cell visible here --
# the imputation cell computes the modes it needs itself.
bank_mode = banks.mode()

In [181]:
# Fill every missing value with the most frequent value (mode) of its column.
#
# `banks.mode().iloc[0]` is a Series holding the first mode of each column,
# indexed by column name, so the frame-level fillna applies each column's own
# mode -- the same value the per-column `banks[column].mode()[0]` produced.
#
# The previous `banks[column].fillna(..., inplace=True)` loop was chained
# assignment on a temporary Series: it raises a FutureWarning on pandas 2.x and
# no longer updates `banks` once Copy-on-Write is enabled. Rebinding `banks`
# to the filled frame works on every pandas version and is safe to re-run.
banks = banks.fillna(banks.mode().iloc[0])

In [188]:
# Spot-check the first two rows after imputation.
banks.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,120.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [204]:
# Average loan amount for every (Gender, Married, Self_Employed) combination,
# as a first impression of what a typical applicant in each group borrows.
#
# Alternative spelling noted by the author:
#   banks.groupby(['Gender', 'Married', 'Self_Employed']).agg({"LoanAmount" : "mean"})
avg_loan_amount = banks.pivot_table(
    values='LoanAmount',
    index=['Gender', 'Married', 'Self_Employed'],
    aggfunc='mean',  # pivot_table's default aggregation, spelled out
)
avg_loan_amount

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LoanAmount
Gender,Married,Self_Employed,Unnamed: 3_level_1
Female,No,No,114.768116
Female,No,Yes,125.272727
Female,Yes,No,133.714286
Female,Yes,Yes,282.25
Male,No,No,129.508621
Male,No,Yes,180.588235
Male,Yes,No,152.60815
Male,Yes,Yes,167.42


In [215]:
# Count the approved loans (Loan_Status == 'Y') separately for self-employed
# and non-self-employed applicants; these feed the percentage cells.
is_approved = banks.Loan_Status == 'Y'
loan_approved_se = (is_approved & (banks.Self_Employed == 'Yes')).sum()
loan_approved_nse = (is_approved & (banks.Self_Employed == 'No')).sum()

In [219]:
# Number of non-null Loan_Status entries; used as the denominator of the percentages.
Loan_Status_Count = banks['Loan_Status'].count()

In [221]:
# Self-employed applicants with an approved loan, as a percentage of ALL
# applicants (the denominator is the overall Loan_Status count, not the number
# of self-employed applicants).
se_approved_fraction = loan_approved_se / Loan_Status_Count
percentage_se = se_approved_fraction * 100
percentage_se

9.120521172638437

In [222]:
# Non-self-employed applicants with an approved loan, as a percentage of ALL
# applicants (the denominator is the overall Loan_Status count, not the number
# of non-self-employed applicants).
nse_approved_fraction = loan_approved_nse / Loan_Status_Count
percentage_nse = nse_approved_fraction * 100
percentage_nse

59.60912052117264

In [225]:
# A government audit is coming up, so the company wants to identify the
# applicants with a long loan term.
#
# Convert Loan_Amount_Term from months to years. Plain vectorised division
# replaces the element-wise `.apply(lambda x: x / 12.0)` and yields the same values.
#
# NOTE: this overwrites the column, so the cell is not idempotent -- run it
# exactly once per freshly built `banks`, otherwise the term is divided by 12 again.
banks['Loan_Amount_Term'] = banks['Loan_Amount_Term'] / 12.0
banks.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,120.0,30.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,30.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,30.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,30.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,30.0,1.0,Urban,Y


In [229]:
# Count the applicants whose loan term is at least 25 years and keep that
# count in `big_loan_term`.
long_term_mask = banks.Loan_Amount_Term.ge(25)
big_loan_term = long_term_mask.sum()
print(big_loan_term)

554


In [237]:
# Group the applicants by Loan_Status so that per-outcome averages (applicant
# income, loan amount, ...) can be computed from the groups.
loan_groupby = banks.groupby(by='Loan_Status')


In [233]:

# NOTE(review): the column selection on the next line is evaluated and then
# discarded -- it is neither assigned to a name nor the cell's last expression.
loan_groupby[['ApplicantIncome','Credit_History']]
# The bare name is the cell's last expression, so the groupby object itself is
# what gets displayed.
loan_groupby

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9f943ecf60>

In [234]:
# Mean of every numeric feature for each Loan_Status group.
#
# numeric_only=True is passed explicitly: `banks` still contains text columns
# (Gender, Married, ...), and from pandas 2.0 GroupBy.mean() no longer drops
# them silently but raises TypeError. Older versions accept the argument too,
# so the table is the same numeric-only result everywhere.
mean_values = loan_groupby.mean(numeric_only=True)
mean_values

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,5446.078125,1877.807292,149.432292,28.713542,0.572917
Y,5384.06872,1504.516398,143.661137,28.452607,0.983412
