In [2]:
# Dependencies

from utils import *

In [3]:
# Dataset
# Read the raw loan-approval CSV and strip leading/trailing whitespace from
# every column label in the same chain, then preview the first rows.

d_raw_df = (
    pd.read_csv('../data/loan_approval_dataset.csv')
    .rename(columns=str.strip)
)
d_raw_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# Overview of dataset
# Dimensions first, then the per-column dtype / non-null summary.

print(f"Dataset shape: {d_raw_df.shape}")
d_raw_df.info()

Dataset shape: (4269, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
 12  loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


#### Observations

1. 4269 samples
2. 11 features (the 13 columns minus the `loan_id` identifier and the `loan_status` target)

In [5]:
# Initial statistical survey of data
# include='all' makes describe() summarise the object (categorical) columns
# alongside the numeric ones.

d_raw_summary = d_raw_df.describe(include='all')
d_raw_summary

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
count,4269.0,4269.0,4269,4269,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269
unique,,,2,2,,,,,,,,,2
top,,,Graduate,Yes,,,,,,,,,Approved
freq,,,2144,2150,,,,,,,,,2656
mean,2135.0,2.498712,,,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0,
std,1232.498479,1.69591,,,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0,
min,1.0,0.0,,,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0,
25%,1068.0,1.0,,,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0,
50%,2135.0,3.0,,,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0,
75%,3202.0,4.0,,,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0,


#### Observations

1. All categorical features (`education`, `self_employed`) and the target variable (`loan_status`) are binary
2. The minimum of *residential_assets_value* is negative (-100000), which does not make sense for an asset value

In [6]:
# Addressing potential issue with data entry
#
# Display every row whose residential asset value is negative, and print the
# figures needed to judge whether those entries are sign errors: how many
# there are, which distinct negative values occur, and the smallest strictly
# positive value in the column.

residential_values = d_raw_df['residential_assets_value']
is_negative = residential_values < 0

print("Rows with a negative value:", int(is_negative.sum()))
print("Distinct negative values:", sorted(residential_values[is_negative].unique().tolist()))
print("Smallest positive value:", residential_values[residential_values > 0].min())

# The last expression stays the filtered frame, so the cell still renders the rows.
d_raw_df[is_negative]

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
59,60,4,Not Graduate,Yes,5500000,18200000,16,797,-100000,4900000,18600000,4800000,Approved
196,197,4,Not Graduate,Yes,400000,1500000,2,669,-100000,600000,900000,500000,Approved
559,560,2,Graduate,Yes,200000,500000,6,885,-100000,0,300000,200000,Rejected
702,703,4,Graduate,Yes,6300000,23900000,6,899,-100000,11400000,20600000,6700000,Approved
737,738,2,Graduate,Yes,900000,2500000,16,458,-100000,100000,3200000,1100000,Rejected
784,785,0,Graduate,No,5000000,14400000,2,761,-100000,7300000,12600000,4500000,Approved
904,905,2,Graduate,No,4100000,14900000,12,571,-100000,5200000,13000000,3400000,Approved
1089,1090,3,Graduate,No,5100000,11000000,6,336,-100000,5800000,11600000,7500000,Rejected
1163,1164,2,Graduate,No,4500000,9100000,18,593,-100000,600000,12400000,2500000,Approved
1350,1351,5,Graduate,No,4000000,13700000,6,496,-100000,1400000,15800000,3700000,Rejected


#### Observations

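The negative values in *residential_assets_value* appear to be data-entry errors (a stray minus sign): the negative entries shown above are all -100000, and the smallest positive value in the column has the same magnitude (100000). Addressing the issue by replacing those entries with their absolute values.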

In [7]:
# Cleaning the 'residential_assets_value' column
# Build the cleaned frame as a new object so d_raw_df keeps its original
# values; every entry in the column is replaced by its absolute value.

d_cleaned_df = d_raw_df.assign(
    residential_assets_value=d_raw_df['residential_assets_value'].abs()
)

In [8]:
# Looking for null/NaN values
# Count the missing entries in each column, then list the columns with the
# largest counts first.

missing_per_column = d_cleaned_df.isnull().sum()
missing_per_column.sort_values(ascending=False)

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [9]:
# Export cleaned df
# Write the cleaned frame to disk as a pickle file.

cleaned_pickle_path = '../data/cleaned_data.pkl'
d_cleaned_df.to_pickle(cleaned_pickle_path)