In [112]:
import numpy as np
import pandas as pd
from scipy import stats
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Load the training set from the Excel workbook; the path is relative to the
# notebook's working directory.
loan_df_train = pd.read_excel('../data/train.xlsx')

In [3]:
# Remove the Loan_ID column, then preview the first ten rows.
loan_df_train = loan_df_train.drop(columns=['Loan_ID'])
loan_df_train.head(n=10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0.0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1.0,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0.0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0.0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0.0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2.0,Graduate,Yes,5417.0,4196.0,267.0,360.0,1.0,Urban,Y
6,Male,Yes,0.0,Not Graduate,No,2333.0,1516.0,95.0,360.0,1.0,Urban,Y
7,Male,Yes,3+,Graduate,No,3036.0,2504.0,158.0,360.0,0.0,Semiurban,N
8,Male,Yes,2.0,Graduate,No,4006.0,1526.0,168.0,360.0,1.0,Urban,Y
9,Male,Yes,1.0,Graduate,No,12841.0,10968.0,349.0,360.0,1.0,Semiurban,N


In [4]:
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
loan_df_train['Loan_Status'] = loan_df_train['Loan_Status'].replace({'Y': 1, 'N': 0})

### Univariate Analysis

In [5]:
def Uni_Stat(df, columns):
    """Build a per-column table of univariate summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to summarise.
    columns : iterable
        Labels of the columns of ``df`` to include; one output row per label.

    Returns
    -------
    pandas.DataFrame
        Indexed by column label, with the fields Count, Missing, Unique,
        Dtype, Mean, Mode, Min, 25%, Median, 75%, Max, Std, Skew and Kurt.
        Non-numeric columns carry the placeholder '-' in every numeric-only
        field. Mode is NaN for a column that has no non-null values.
    """
    col = ['Count', "Missing", "Unique", "Dtype", "Mean", "Mode", "Min", "25%",
           "Median", "75%", "Max", "Std", "Skew", "Kurt"]
    output_df = pd.DataFrame(columns=col)

    for c in columns:
        series = df[c]

        # mode() returns an empty Series when every value is null, so a plain
        # [0] lookup would raise; fall back to NaN in that case.
        modes = series.mode()
        mode = modes.iloc[0] if not modes.empty else np.nan

        # Fields that are meaningful for every dtype.
        common = [series.count(), series.isnull().sum(), series.nunique(), series.dtype]

        if pd.api.types.is_numeric_dtype(series):
            output_df.loc[c] = common + [
                series.mean(), mode, series.min(),
                series.quantile(.25), series.median(), series.quantile(.75),
                series.max(), series.std(), series.skew(), series.kurt(),
            ]
        else:
            # Only the mode is defined for non-numeric data; the eight
            # remaining numeric fields get the '-' placeholder.
            output_df.loc[c] = common + ['-', mode] + ['-'] * 8

    return output_df

In [6]:
# Summarise every column of the training frame and display the resulting table.
un_df = Uni_Stat(loan_df_train, loan_df_train.columns)
un_df

Unnamed: 0,Count,Missing,Unique,Dtype,Mean,Mode,Min,25%,Median,75%,Max,Std,Skew,Kurt
Gender,601,13,2,object,-,Male,-,-,-,-,-,-,-,-
Married,611,3,2,object,-,Yes,-,-,-,-,-,-,-,-
Dependents,599,15,4,object,-,0.0,-,-,-,-,-,-,-,-
Education,614,0,2,object,-,Graduate,-,-,-,-,-,-,-,-
Self_Employed,582,32,2,object,-,No,-,-,-,-,-,-,-,-
ApplicantIncome,614,0,505,float64,5403.459283,2500.0,150.0,2877.5,3812.5,5795.0,81000.0,6109.041673,6.539513,60.540676
CoapplicantIncome,614,0,287,float64,1621.245798,0.0,0.0,0.0,1188.5,2297.25,41667.0,2926.248369,7.491531,84.956384
LoanAmount,592,22,203,float64,146.412162,120.0,9.0,100.0,128.0,168.0,700.0,85.587325,2.677552,10.401533
Loan_Amount_Term,600,14,10,float64,342.0,360.0,12.0,360.0,360.0,360.0,480.0,65.12041,-2.362414,6.673474
Credit_History,564,50,2,float64,0.842199,1.0,0.0,1.0,1.0,1.0,1.0,0.364878,-1.882361,1.548763


### Bivariate

In [7]:
# Contingency table of Property_Area (rows) against Loan_Status (columns),
# plus its raw counts as an array for the chi-square test.
data_table = pd.crosstab(index=loan_df_train['Property_Area'],
                         columns=loan_df_train['Loan_Status'])
observed_values = data_table.to_numpy()

In [8]:
# Display the contingency table built in the previous cell.
data_table

Loan_Status,0,1
Property_Area,Unnamed: 1_level_1,Unnamed: 2_level_1
Rural,69,110
Semiurban,54,179
Urban,69,133


In [9]:
# Chi-square test of independence on the observed counts.
# NOTE(review): the third value returned by chi2_contingency is the degrees of
# freedom (dof); the local name `ddof` is kept as written.
chi_stat, p, ddof, exp_value = stats.chi2_contingency(observed_values)

In [10]:
# Report the decision at the 5% significance level.
print(
    "Reject H0, there is a realtionship between 2 categorical variables"
    if p <= 0.05
    else "Retain H0, there is no relatioship between 2 categorical variables"
)

Reject H0, there is a realtionship between 2 categorical variables


In [11]:
def Relationship_bw_cate_and_Loan_status(df, columns, alpha=0.05):
    """Chi-square test of independence between each column and Loan_Status.

    For every label in ``columns`` a contingency table against
    ``df['Loan_Status']`` is built and tested; the decision is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the tested columns and a 'Loan_Status' column.
    columns : iterable
        Labels of the categorical columns to test.
    alpha : float, default 0.05
        Significance level; H0 (independence) is rejected when p <= alpha.

    Returns
    -------
    list
        The labels for which H0 was retained, in the order they were tested.
    """
    temp = []
    for col in columns:
        # Use the frame that was passed in, not a notebook-level global.
        data_table = pd.crosstab(df[col], df['Loan_Status'])
        # Only the p-value of the test result is needed here.
        _, p, _, _ = stats.chi2_contingency(data_table.values)
        if p <= alpha:
            print(f"Reject H0, there is a realtionship between {col} categorical Loan_status")
        else:
            temp.append(col)
            print(f"Retain H0, there is no relatioship between {col} categorical Loan_status")
    return temp

In [12]:
# Labels of the columns that the summary table reports with dtype 'object'.
categorical_columns = un_df.loc[un_df['Dtype'] == 'object'].index.tolist()

In [13]:
# Test every categorical column against Loan_Status; `temp` receives the
# columns for which H0 was retained.
temp = Relationship_bw_cate_and_Loan_status(loan_df_train, categorical_columns)

Retain H0, there is no relatioship between Gender categorical Loan_status
Reject H0, there is a realtionship between Married categorical Loan_status
Retain H0, there is no relatioship between Dependents categorical Loan_status
Reject H0, there is a realtionship between Education categorical Loan_status
Retain H0, there is no relatioship between Self_Employed categorical Loan_status
Reject H0, there is a realtionship between Property_Area categorical Loan_status


### Dropping Uncorrelated columns and rows

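`temp` is the list returned by the chi-square test above: the categorical columns for which H0 was retained, i.e. no significant relationship with `Loan_Status` was found.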

In [14]:
# Working copy of the training frame without the columns listed in `temp`.
data = loan_df_train.drop(columns=temp)

In [15]:
# Summary statistics of the reduced frame, displayed as the cell output.
Uni_Stat(data, data.columns)

Unnamed: 0,Count,Missing,Unique,Dtype,Mean,Mode,Min,25%,Median,75%,Max,Std,Skew,Kurt
Married,611,3,2,object,-,Yes,-,-,-,-,-,-,-,-
Education,614,0,2,object,-,Graduate,-,-,-,-,-,-,-,-
ApplicantIncome,614,0,505,float64,5403.459283,2500.0,150.0,2877.5,3812.5,5795.0,81000.0,6109.041673,6.539513,60.540676
CoapplicantIncome,614,0,287,float64,1621.245798,0.0,0.0,0.0,1188.5,2297.25,41667.0,2926.248369,7.491531,84.956384
LoanAmount,592,22,203,float64,146.412162,120.0,9.0,100.0,128.0,168.0,700.0,85.587325,2.677552,10.401533
Loan_Amount_Term,600,14,10,float64,342.0,360.0,12.0,360.0,360.0,360.0,480.0,65.12041,-2.362414,6.673474
Credit_History,564,50,2,float64,0.842199,1.0,0.0,1.0,1.0,1.0,1.0,0.364878,-1.882361,1.548763
Property_Area,614,0,3,object,-,Semiurban,-,-,-,-,-,-,-,-
Loan_Status,614,0,2,int64,0.687296,1,0,0.0,1.0,1.0,1,0.463973,-0.809998,-1.348306


In [16]:
# After dropping the unrelated columns, Married is the only remaining
# categorical column that still has missing values (3 rows), so those rows are
# dropped rather than imputed.
# .copy() yields an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
data = data.dropna(subset=['Married']).copy()

### Adding Features

In [17]:
# Bucket Loan_Amount_Term into three 0/1 indicator columns:
# (0, 180], (180, 360] and above 360.
# A missing term compares False in every test, so such rows get 0 in all three.
data['Loan_Amount_Term_0_to_180'] = (
    data['Loan_Amount_Term'].gt(0) & data['Loan_Amount_Term'].le(180)
).astype(int)
data['Loan_Amount_Term_180_to_360'] = (
    data['Loan_Amount_Term'].gt(180) & data['Loan_Amount_Term'].le(360)
).astype(int)
data['Loan_Amount_Term_360_above'] = data['Loan_Amount_Term'].gt(360).astype(int)

In [18]:
# data['Property_Area'].unique()

In [19]:
# One 0/1 indicator column per Property_Area value.
data['Urban'] = data['Property_Area'].eq('Urban').astype(int)
data['Rural'] = data['Property_Area'].eq('Rural').astype(int)
data['Semiurban'] = data['Property_Area'].eq('Semiurban').astype(int)

In [87]:
# The two source columns are now represented by the indicator columns above.
data = data.drop(columns=['Loan_Amount_Term', 'Property_Area'])

In [88]:
# Remember the column labels; later cells pass them to pd.DataFrame(...).
columns = data.columns

### Converting Categorical to Numerical

In [89]:
# Encode the remaining categorical columns as integer codes 0, 1, 2, ...
# numbered in order of first appearance.
# - The codes are built from the non-null values only, so a missing entry stays
#   NaN (and is left for the imputer) instead of becoming a category itself.
# - One range-based mapping handles any number of categories, including a
#   column that holds a single distinct value.
for col in categorical_columns:
    if col in data.columns:
        tmp = data[col].dropna().unique()
        data[col] = data[col].map(dict(zip(tmp, range(len(tmp)))))

In [90]:
# Rebuild the frame restricted to the labels saved in `columns`.
# NOTE(review): `data` already appears to be a DataFrame with exactly these
# columns at this point, so this step looks redundant — confirm before removing.
data = pd.DataFrame(data, columns=columns)

### Using Imputer to replace missing values

In [91]:
# Imputer used to fill the remaining missing values.
# It is only constructed here: the following cell calls fit_transform, which
# fits it on `data`, so a separate fit() in this cell would repeat that work.
impute_it = IterativeImputer()

In [92]:
# Fit the imputer on `data` and fill its missing values in one step.
# NOTE(review): `data` still contains the Loan_Status column and every row (the
# train/test split happens in a later cell), so the target and the held-out
# rows influence the imputed values — consider fitting on training features only.
data = impute_it.fit_transform(data)

In [93]:
# Wrap the imputer output in a DataFrame again, using the saved column labels.
# presumably fit_transform returned an unlabeled array — verify against the
# scikit-learn configuration in use
data = pd.DataFrame(data, columns=columns)

In [94]:
# Preview the imputed, fully numeric frame.
data.head()

Unnamed: 0,Married,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Loan_Status,Loan_Amount_Term_0_to_180,Loan_Amount_Term_180_to_360,Loan_Amount_Term_360_above,Urban,Rural,Semiurban
0,0.0,0.0,5849.0,0.0,138.031817,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,4583.0,1508.0,128.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,3000.0,0.0,66.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,1.0,2583.0,2358.0,120.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,6000.0,0.0,141.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [95]:
# Positional hold-out split: the first 431 rows are used for training and the
# remaining rows are kept for testing (no shuffling).
df_train = data.iloc[:431]
df_test = data.iloc[431:]

In [177]:
# Training features: every column except the target.
x = df_train.drop(columns=['Loan_Status'])

In [178]:
# Training target.
y = df_train['Loan_Status']

### Model Training

In [240]:
# Fit a single ExtraTreeClassifier (the variable keeps the name `model_lr`
# because the testing cells refer to it).
# random_state pins the tree's random split selection so that re-running the
# notebook rebuilds the same tree and reproduces the same scores.
model_lr = ExtraTreeClassifier(max_depth=4, max_leaf_nodes=170, max_features=8,
                               min_impurity_decrease=0.05, random_state=42)
model_lr.fit(x, y)
# Take the second probability column (class 1 when Loan_Status is encoded 0/1)
# and threshold it at 0.5 to obtain hard 0/1 labels.
y_pred = [int(prob > 0.5) for prob in model_lr.predict_proba(x)[:, 1]]

In [241]:
# Fraction of training rows whose predicted label equals the true label.
acc = (y_pred == y).sum() / len(y)
print(f"Training Accuracy {acc}")

Training Accuracy 0.8190255220417634


### Testing

In [242]:
# Held-out target and features, split the same way as the training pair.
y_test = df_test['Loan_Status']
x_test = df_test.drop(columns=['Loan_Status'])

In [243]:
# x_test = scaler.fit_transform(x_test)

In [244]:
# Second probability column of the fitted model on the held-out features,
# thresholded at 0.5 into hard 0/1 labels.
y_test_pred = [int(prob > 0.5) for prob in model_lr.predict_proba(x_test)[:, 1]]

In [245]:
# Fraction of held-out rows whose predicted label equals the true label.
test_acc = (y_test_pred == y_test).sum() / len(y_test)
print(f"Test Accuracy {test_acc}")

Test Accuracy 0.8555555555555555


### Confusion Matrix

In [163]:
# Confusion matrix on the training rows (true labels first, predictions second).
confusion_matrix(y, y_pred)

array([[ 63,  73],
       [  5, 290]])

In [164]:
# Confusion matrix on the held-out rows (true labels first, predictions second).
confusion_matrix(y_test, y_test_pred)

array([[ 29,  27],
       [  3, 121]])