In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
#
# Loan Eligibility Data Evaluation
#
# The script analyses the data relating to whether an applicant is eligible for a loan.
# This is based on a number of categorical and continuous data fields

# The intention is to understand and plot the data, values, completeness and correlations
# This is with a view to defining the type of model used and key indicators to consider in model design

# The final result is a Main Summary of the Data Considering 
# ... Fraction of Null Values
# ... Ratio of Accepted to Refused Customers
# ... (Raw Singular) Correlation of the Output to Field
# ... The Mean of Data for Approved Clients
# ... The Mean and Standard Deviation

# NOTE : This file has not been sufficiently commented
#-----------------------------------------------------------------------------------------------------------------------------

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
# Revision history
#   Rev    By                 Description
#   1.0    Richard Brooks     Initial Release

# Script identification; the name and version are echoed in the banner below
sScript = 'Loan History Data Evaluation'
sVersion = 'v1.0'
sAuthor = 'Richard W Brooks'

# Announce which script and version is running
print('Running : {} : {}'.format(sScript, sVersion))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import re

In [None]:
run fn_Aux_Loan_History.ipynb

In [None]:
# Read both CSV files from the working directory into DataFrames.
# df_train is the frame analysed below; df_test is loaded but not referenced again in this notebook.
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

In [None]:
# Preview the first five rows of the training data
df_train.head()

In [None]:
# Show the dtype pandas assigned to each training column when the CSV was read
df_train.dtypes

In [None]:
# -------------------------------------------------------------------------------
# Analysis of the data
# -------------------------------------------------------------------------------

In [None]:
# Work on an independent copy so the later in-place changes never touch df_train
df_data = df_train.copy(deep=True)

# Confirm the column types of the working copy
df_data.dtypes

In [None]:
# -------------------------------------------------------------------------------
# Translate the Data
# -------------------------------------------------------------------------------

In [None]:
# Encode the output as an integer flag: 1 where Loan_Status is 'Y', 0 for any other value
df_data['Loan_Status'] = df_data['Loan_Status'].eq('Y').astype(int)

# Change the format of fields and create calculated parameters for inputs.
# The helper is not defined in this notebook and its return value is not captured,
# so it is presumably relied on to modify df_data in place - TODO confirm
df_PreProcess_Data(df_data)

In [None]:
# Split The Data
# Group the columns by role: the model output, the categorical inputs and the value inputs.
ls_Col_Out = ['Loan_Status']
ls_Col_Cat = ['Dependents','Education', 'Property_Area','Gender','Married','Self_Employed', 'Credit_History']
#ls_Col_Bool = ['Credit_History']

# Every column that is neither categorical nor the output is treated as a value field.
# The list is derived from the frame (not hard-coded) and walked in DataFrame column order:
# a plain set difference yields the names in an arbitrary order that can change between
# kernel sessions, which would reshuffle every plot and table built from ls_Col_Val.
set_Col_Used = set(ls_Col_Cat + ls_Col_Out)
ls_Col_Val = [sCol for sCol in df_data.columns if sCol not in set_Col_Used]

In [None]:
# Name of the (single) output column
sCol_Out = ls_Col_Out[0]

# Share of all rows taking each output value
df_data[sCol_Out].value_counts().div(len(df_data))

In [None]:
# Investigation of NULL values and fraction approved
#
# For each output value, count the nulls per column among the rows with that value and
# divide by the TOTAL row count, so the per-outcome fractions of a column add up to that
# column's overall null fraction (the 'Total' column).
ls_Val = df_data[sCol_Out].unique()
iRow = df_data.shape[0]

ls_df = [df_data[df_data[sCol_Out] == sVal].isna().sum() / iRow for sVal in ls_Val]

# keys labels each column with the output value it was computed for. Without keys the
# columns are simply numbered 0..n-1 by position, which matches the output values only
# by coincidence and mislabels the data when the columns are later selected via ls_Val.
df_data_ns = pd.concat(ls_df, axis=1, keys=list(ls_Val))
df_data_ns['Total'] = df_data_ns.sum(axis=1)

# Order the fields from most complete to least complete
df_data_ns = df_data_ns.sort_values(by=['Total'])
df_data_ns

In [None]:
# Stacked bars of the per-field null fraction, one segment per output value.
# The axes are created explicitly and passed to pandas: DataFrame.plot opens its own
# figure when no ax is supplied, so a bare plt.figure() beforehand is left behind empty.
fig, ax = plt.subplots()
df_data_ns[ls_Val].plot.bar(stacked=True, ax=ax)
ax.set_xlabel('Field')
ax.set_ylabel('Fraction of rows that are null')
ax.set_title('Null values per field, split by loan status')
fig.tight_layout()

In [None]:
# Names of the fields that contain at least one null value
ls_Col_Null = df_data_ns.index[df_data_ns['Total'] > 0]

In [None]:
# Look for combinations of nulls

# Count the missing entries in every row, then order the rows from fewest to most missing
df_data['Null_Col'] = df_data.isna().sum(axis=1)
df_data = df_data.sort_values(by='Null_Col').reset_index(drop=True)

# Keep only the incomplete rows, restricted to the columns that contain nulls
df_data_nl = df_data.loc[df_data['Null_Col'] > 0, ls_Col_Null]

# Heatmap of the null mask, to show which fields tend to be missing together
sns.heatmap(df_data_nl.isna())

In [None]:
# Number of missing entries per field among the incomplete rows
df_data_nl.isna().sum()

In [None]:
# Look at numerical value spread

In [None]:
# Look at categorical data
#for sCat in ls_Col_Cat:
#    print(df_data[sCat].value_counts()/len(df_data))
#    print()

In [None]:
# Build the prepared frame used by all later statistics and plots.
# df_Prep_Data is not defined in this notebook (presumably it comes from
# fn_Aux_Loan_History.ipynb - TODO confirm how it treats the ls_Col_Cat columns).
df_data_p = df_Prep_Data(df_data,ls_Col_Cat)
# Show the column names of the prepared frame
df_data_p.columns

In [None]:
# Mean of every prepared field per output group, transposed so that the fields are
# the rows and the output values are the columns
df_data_p_m = df_data_p.groupby(ls_Col_Out).mean().T

# Output-value column labels, captured before the Ratio column is added
ls = df_data_p_m.columns

# Per-field ratio of the second output column's mean to the first's, sorted ascending
df_data_p_m['Ratio'] = df_data_p_m[ls[1]].div(df_data_p_m[ls[0]])
df_data_p_m = df_data_p_m.sort_values(by='Ratio')

In [None]:
# Bar chart of the per-outcome means of every field (fields ordered by Ratio)
df_data_p_m.plot.bar(y=ls)
plt.tight_layout()

In [None]:
# Bar chart of the Ratio column for every field
df_data_p_m.plot.bar(y="Ratio")
plt.tight_layout()

In [None]:
# Look at the spread of the normalised data - scatter plot
# chart = sns.boxplot(x = 'Cat', y ='BSUoS Price (£/MWh Hour)', palette = m_hueband, data = df_TS)
# Think this will require a melt first?

In [None]:
#df_data_p[ls_Col_Val + ls_Col_Out].describe()

In [None]:
# Reshape the value fields to long form: one row per (observation, field),
# keeping the output column as the identifier
df_data_pV_L = df_data_p[ls_Col_Val + ls_Col_Out].melt(
    id_vars=ls_Col_Out,
    var_name='Field',
    value_name='Val',
)

# Violin plot of each value field's distribution, split by loan status
sns.catplot(data=df_data_pV_L, x="Val", y="Field", hue="Loan_Status", kind="violin")

In [None]:
# Pairwise scatter plots of the value fields coloured by the output, histograms on the diagonal
sns.pairplot(
    df_data_p[ls_Col_Val + ls_Col_Out],
    hue=ls_Col_Out[0],
    diag_kind='hist',
)

In [None]:
# Pairwise correlations of the prepared data
df_corr = df_data_p.corr()

# Correlations of the un-prepared value fields plus the output
# (not referenced again in the visible notebook)
df_corr_p = df_data[ls_Col_Val + ls_Col_Out].corr()

# Boolean mask covering the diagonal and upper triangle, for use as a heatmap mask.
# It is built from the matrix SHAPE rather than its values: np.triu(df_corr) is 0
# wherever a correlation is exactly zero, so such a cell would not be masked.
matrix = np.triu(np.ones_like(df_corr, dtype=bool))

In [None]:
# Heatmap of the prepared-data correlations; cells selected by `matrix` are hidden,
# and the colour scale is clipped to +/-0.5 and centred on zero
plt.figure()
chart = sns.heatmap(
    df_corr,
    mask=matrix,
    cmap="RdBu",
    center=0,
    vmin=-.5,
    vmax=.5,
    annot=False,
    yticklabels=False,
)

# Slant the x tick labels so long field names stay legible
chart.set_xticklabels(chart.get_xticklabels(), rotation=30, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Main Summary of the Data Considering
# ... Fraction of Null Values                              -> 'null'
# ... Ratio of Accepted to Refused Customers               -> 'ratio'
# ... (Raw Singular) Correlation of the Output to Field    -> 'out cor'
# ... The Mean of Data for Approved Clients                -> 'data_frac'
# ... The Mean and Standard Deviation                      -> 'mean', 'std'

df_data_p_s = pd.concat(
    [
        # Null fraction relative to ALL rows. df_data_nl holds only the rows that have
        # at least one null, so dividing by its length would overstate how much is missing.
        df_data_nl.isnull().sum() / len(df_data),
        df_data_p_m['Ratio'],
        df_corr['Loan_Status'],
        df_data_p_m[1.0],
        df_data_p.mean(),
        df_data_p.std(),
    ],
    axis=1,
    sort=True,  # align on the field names and order the rows alphabetically
)

df_data_p_s.columns = ['null','ratio','out cor','data_frac','mean','std']

df_data_p_s

In [None]:
# -------------------------------------------------------------------------------------------------------------------------
# End of Script
# -------------------------------------------------------------------------------------------------------------------------