# Project 2 Loan Prediction Dataset Playground
## Ryan Rosiak
## 10/4/21

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import the data and view it
lp_df = pd.read_csv('./Dataset/loan-train.csv', header=0)
lp_df.head(25)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [3]:
# Shape of the data
lp_df.shape

(614, 13)

In [4]:
# Attributes
lp_df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [5]:
# Description of dataset
lp_df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
# Number of unique values per column
for col in lp_df.columns:
    print(col, ':', lp_df[col].nunique())

Loan_ID : 614
Gender : 2
Married : 2
Dependents : 4
Education : 2
Self_Employed : 2
ApplicantIncome : 505
CoapplicantIncome : 287
LoanAmount : 203
Loan_Amount_Term : 10
Credit_History : 2
Property_Area : 3
Loan_Status : 2


In [8]:
# Check how many NAN values are in each column
lp_df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
# CLEANING DATA
# Fix up some of the data to be of the correct decimal place
# First fix the loan amount in thousands
# lp_df['LoanAmount'] = lp_df['LoanAmount'] * 1000 ******TODO
# Replace the NAN values in each column
lp_df['Gender'] = lp_df['Gender'].fillna('Not Specified')
lp_df['Married'] = lp_df['Married'].fillna('Not Specified')
lp_df['Dependents'] = lp_df['Dependents'].fillna(0)
lp_df['Self_Employed'] = lp_df['Self_Employed'].fillna('No')
lp_df['LoanAmount'] = lp_df['LoanAmount'].fillna(0)
lp_df['Loan_Amount_Term'] = lp_df['Loan_Amount_Term'].fillna('Not Specified')
lp_df['Credit_History'] = lp_df['Credit_History'].fillna(-1)
lp_df.head(25)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,0.0,360,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128000000.0,360,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66000000.0,360,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120000000.0,360,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141000000.0,360,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267000000.0,360,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95000000.0,360,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158000000.0,360,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168000000.0,360,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349000000.0,360,1.0,Semiurban,N


## Preliminary Question 1
## Does marital status determine the range of money that you are applying for?

## Preliminary Question 2
## Does education status determine a successful credit history check?

## Preliminary Question 3
## Does applicant income determine loan amount?

## Preliminary Question 4
## Does property area affect loan status?

## Preliminary Question 5
## Is there a correlation between loan amount and loan amount term?