In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
#
# Loan Eligibility Classification Model Auxiliary Functions
#
# Auxiliary functions used in the Loan Eligibility Evaluation and Modelling
# Preparation of data sets used by both functions
# Placed in common repository

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------
# Rev     By                Description
# 1.2    Richard Brooks     Added some common graph plotting functions
# 1.1    Richard Brooks     Tidy Up
# 1.0    Richard Brooks     Initial Release

# Identification of this notebook; a one-line banner is printed when the cell runs
sScript = 'Loan History Auxillary Functions'
sVersion = 'v1.2'
sAuthor = 'Richard W Brooks'

print('Running : {} : {}'.format(sScript, sVersion))

In [None]:
import numpy as np
import pandas as pd
%matplotlib notebook

In [None]:
# Preprocessing of raw data

def df_PreProcess_Data(df):
    """Reformat the raw loan columns and add derived ratio columns.

    The frame passed in is modified directly, as before, and is now also
    returned so the call can be used in an assignment.

    Columns read:
        Loan_ID, Credit_History, ApplicantIncome, CoapplicantIncome,
        LoanAmount, Loan_Amount_Term

    Columns rewritten:
        Loan_ID        - first run of digits in the ID string, as int64
        Credit_History - 'Yes' where the value is > 0, otherwise 'No'
                         (a missing value is not > 0, so it becomes 'No')

    Columns added:
        c_JointIncome          - ApplicantIncome + CoapplicantIncome
        c_Loan_per_Month       - LoanAmount / Loan_Amount_Term
        c_Loan_per_AppIncome   - LoanAmount / ApplicantIncome
        c_Loan_per_JointIncome - LoanAmount / c_JointIncome
    Every ratio is NaN where its denominator is not greater than zero.

    Not idempotent: a second call on the same frame fails because Loan_ID
    is no longer a string column after the first call.

    Raises:
        KeyError   - a required column is missing
        ValueError - a Loan_ID value contains no digits
    """

    # Defining variables in a format for evaluation.
    # The pandas string accessor is used so that no 're' import is needed.
    df['Loan_ID'] = df['Loan_ID'].str.extract(r'(\d+)', expand=False).astype('int64')
    df['Credit_History'] = df['Credit_History'].gt(0).map({True: 'Yes', False: 'No'})

    # Calculating new independent variables that may give better linear dependencies.
    # Column-wise arithmetic replaces the row-by-row apply; the .where() guard
    # turns a zero / negative / missing denominator into NaN instead of inf.
    sr_Loan = df['LoanAmount']
    df['c_JointIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['c_Loan_per_Month'] = (sr_Loan / df['Loan_Amount_Term']).where(df['Loan_Amount_Term'] > 0)
    df['c_Loan_per_AppIncome'] = (sr_Loan / df['ApplicantIncome']).where(df['ApplicantIncome'] > 0)
    df['c_Loan_per_JointIncome'] = (sr_Loan / df['c_JointIncome']).where(df['c_JointIncome'] > 0)

    return df

In [None]:
# Function to convert categorical fields to 0 / 1 indicator columns

def df_CatCol_to_Boolean(df_i, ls_Col_Cat):
    """Return a numeric-only copy of df_i with indicator columns added.

    For every name in ls_Col_Cat that is a column of df_i, the distinct
    values of that column are expanded into 'Column_Value' columns holding
    1 for rows that match and 0 otherwise. The first category of each
    column is left out (it is implied when all the others are 0).

    After the expansion only numeric / boolean columns are returned, so a
    categorical source column survives only if it was numeric itself.

    Parameters:
        df_i       - input DataFrame; it is not modified
        ls_Col_Cat - names of the categorical columns; names that are not
                     in df_i are ignored

    Returns:
        New DataFrame: the numeric columns of df_i followed by the indicator
        columns, in the order the source columns appear in df_i.
    """

    # Only consider columns within the dataframe. The frame's own column
    # order is followed so the output layout is identical on every run
    # (iterating a set of strings is not order-stable between sessions).
    set_Cat = set(ls_Col_Cat)
    ls_Col = [sCol for sCol in df_i.columns if sCol in set_Cat]

    df_x = df_i.copy()

    for sCol in ls_Col:
        # drop_first removes the first category and, unlike deleting
        # columns[0] by hand, copes with a column that has no categories.
        # dtype is fixed so the result is 0 / 1 whatever the pandas default.
        df_dummies = pd.get_dummies(df_x[sCol], prefix=sCol, drop_first=True, dtype=np.uint8)
        df_x = df_x.join(df_dummies)

    # Public equivalent of the private _get_numeric_data(): keep integer,
    # float, complex and boolean columns, and leave out timedeltas.
    return df_x.select_dtypes(include=['number', 'bool'], exclude=['timedelta'])

In [None]:
# Function to process the data for model evaluation and running

def df_Prep_Data(df, ls_Col_Cat):
    """Encode the categorical columns and rescale every column with MinMaxScaler.

    Steps:
        1. df_CatCol_to_Boolean turns the columns named in ls_Col_Cat into
           Column_Value 0 / 1 columns and keeps numeric data only.
        2. The result is normalised by scaling each column to its min and
           max, so the independent data fields all lie between 0 and 1.

    Parameters:
        df         - input DataFrame; a copy is processed, df is not changed
        ls_Col_Cat - names of the categorical columns

    Returns:
        New DataFrame with the encoded column names. It is built from the
        scaled array without an index argument, so the index of df is not
        carried over.

    NOTE(review): MinMaxScaler is not imported in the visible cells of this
    notebook (presumably sklearn.preprocessing.MinMaxScaler) - it has to be
    available in the namespace before this function is called.
    """

    df_encoded = df_CatCol_to_Boolean(df.copy(), ls_Col_Cat)

    scaled_values = MinMaxScaler().fit_transform(df_encoded)

    return pd.DataFrame(scaled_values, columns=df_encoded.columns)

In [None]:
# Function to fill null values

def df_FillNA_Data(df, ls_Col_Sub, ls_Col_Fill_NA, ls_Col_Cat):
    """Select columns, fill nulls in chosen columns and drop incomplete rows.

    Parameters:
        df             - input DataFrame; it is not modified
        ls_Col_Sub     - columns to keep in the result
        ls_Col_Fill_NA - columns whose nulls are filled; names that are not
                         in ls_Col_Sub are ignored
        ls_Col_Cat     - columns treated as categorical

    A column to fill receives its most common value if it is listed in
    ls_Col_Cat, otherwise its mean. The fill value keeps its own type (a
    numeric category is filled with a number, not with its string form).
    Rows that still hold a null afterwards - i.e. nulls in columns that were
    not filled - are deleted.

    Returns:
        New DataFrame with the columns of ls_Col_Sub and no null values.

    Raises:
        KeyError - a name in ls_Col_Sub is not a column of df
    """

    df = df[ls_Col_Sub].copy()

    # Only consider fill columns that are part of the selected subset
    set_Fill = set(ls_Col_Fill_NA)
    ls_Col = [sCol for sCol in ls_Col_Sub if sCol in set_Fill]

    # Fill NA with most common for category variables else average
    for sCol in ls_Col:
        if sCol in ls_Col_Cat:
            sr_Mode = df[sCol].mode()
            if sr_Mode.empty:
                # No non-null value to take a mode from: leave the column as
                # it is, its rows are removed by the dropna below.
                continue
            Val = sr_Mode.iloc[0]
        else:
            Val = df[sCol].mean()

        # Assign the result back: fillna(inplace=True) on df[sCol] acts on a
        # temporary object and is not guaranteed to reach df.
        df[sCol] = df[sCol].fillna(Val)

    return df.dropna()

In [None]:
# Function to create correlation heat map from dataframe

def fn_plt_correlation_heat_map(df, sTitle, vmin=-.25, vmax=.25):
    """Draw the lower triangle of the correlation matrix of df as a heat map.

    A new figure is opened; nothing is returned.

    Parameters:
        df     - DataFrame; only its numeric / boolean columns are correlated
        sTitle - title of the figure
        vmin   - value mapped to the low end of the colour scale
        vmax   - value mapped to the high end of the colour scale

    NOTE(review): plt and sns are not imported in the visible cells of this
    notebook (presumably matplotlib.pyplot and seaborn) - they have to be
    available in the namespace before this function is called.
    """
    # Restrict to numeric data first so a text column does not make corr() fail
    df_corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Boolean mask for the upper triangle, diagonal included. It is built
    # from the shape rather than from the correlation values, so a
    # coefficient of exactly 0 above the diagonal is hidden as well.
    mask = np.triu(np.ones(df_corr.shape, dtype=bool))

    plt.figure()
    chart = sns.heatmap(df_corr, mask=mask, vmin=vmin, vmax=vmax, cmap='coolwarm',
                        annot=True, fmt='.2f', center=0, linewidths=2, linecolor='black')
    chart.set_xticklabels(chart.get_xticklabels(), rotation=30, horizontalalignment='right')

    # Widen the y range by half a cell at the bottom and at the top.
    # NOTE(review): looks like the workaround for heat map rows being cut in
    # half by some matplotlib releases - confirm whether it is still needed.
    b, t = plt.ylim()
    plt.ylim(b + 0.5, t - 0.5)

    plt.title(sTitle)
    plt.tight_layout()

In [None]:
# Function to plot mean values by output column
# So can compare the strength of each variable

def fn_plt_bar_ratio(df, ls_Col_Out, sTitle):
    """Bar chart of the mean of every numeric column, per group of ls_Col_Out.

    The frame is grouped by ls_Col_Out, the group means are transposed so
    that each variable becomes one position on the x axis, and one bar per
    group is drawn for it. Nothing is returned.

    Despite the function name no ratio is drawn: the earlier version worked
    out a 'Ratio' column between the first two groups but never plotted or
    sorted by it, and that step failed when there were fewer than two groups.

    Parameters:
        df         - DataFrame holding the variables and the output column
        ls_Col_Out - column name (or list of names) to group by
        sTitle     - title of the figure

    NOTE(review): plt is not imported in the visible cells of this notebook
    (presumably matplotlib.pyplot) - it has to be available in the namespace
    before this function is called.
    """
    # numeric_only keeps text columns from making the mean fail
    df_means = df.groupby(ls_Col_Out).mean(numeric_only=True).T

    df_means.plot(kind="bar")

    plt.title(sTitle)
    plt.tight_layout()

In [None]:
# -------------------------------------------------------------------------------------------------------------------------
# End of Script
# -------------------------------------------------------------------------------------------------------------------------