In [None]:
pip install researchpy

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator           
%matplotlib inline 
import seaborn as sns   

import scipy.stats as ss
from scipy import stats 
from scipy.stats import skew, boxcox_normmax, norm
from scipy.stats import chi2_contingency
from researchpy import crosstab

import warnings
warnings.filterwarnings("ignore")

# Variable Identification


In [None]:
# Load the train and test CSV files of the loan-prediction dataset from the relative
# ../input/loan-prediction-problem-dataset directory.
train_data = pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')
test_data = pd.read_csv('../input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv')

In [None]:
# Display the rows whose Gender value is missing.
train_data.loc[train_data['Gender'].isna()]

In [None]:
# (number of rows, number of columns) of the training frame.
train_data.shape

In [None]:
# Data type of every column in the training frame.
train_data.dtypes

In [None]:
# Summary statistics produced by describe() with its default settings.
train_data.describe()

In [None]:
# Every column whose dtype is not 'object' is treated as a numerical feature.
numerical_columns = train_data.columns[(train_data.dtypes != 'object').to_numpy()]
numerical_columns

In [None]:
# Every column whose dtype is 'object' is treated as a categorical feature.
categorical_columns = train_data.columns[(train_data.dtypes == 'object').to_numpy()]
categorical_columns

In [None]:
# Target Variable is Loan Status
# Type of problem is a classification problem

# Univariate Analysis: Numerical Features

In [None]:
# Distribution Plot / Histogram
# For each numerical feature: print its skewness and kurtosis, then draw its
# distribution with a fitted normal curve on a figure of its own.
for col in numerical_columns:
  feature_values = train_data[col]
  print("Skewness of ", col,": " , feature_values.skew())
  print("Kurtosis of ",col,": " , feature_values.kurtosis())
  print("---------------------------")
  sns.set_style('white')
  plt.figure()
  sns.distplot(feature_values, fit = norm)

In [None]:
## Features such as ApplicantIncome & Coapplicant Income are highly skewed (positively), could be normalized using Log Transformation
## LoanAmount is also positively skewed, can be normalized using Log Transformation

In [None]:
# Box Plots
# Draw one box plot per numerical feature, each on its own figure.
# The series is passed as `x=` explicitly: the first positional argument of
# sns.boxplot is interpreted as `x` by older seaborn releases and as `data` by newer
# ones, so the keyword keeps the plot the same whichever version is installed.
sns.set_style('white')
for col in numerical_columns:
  plt.figure()
  sns.boxplot(x=train_data[col])

In [None]:
# Inspect the distinct values of the two numeric columns that look categorical.
credit_history_values = train_data['Credit_History'].unique()
loan_term_values = train_data['Loan_Amount_Term'].unique()
print(credit_history_values)
print("---"*30)
print(loan_term_values)

In [None]:
## Feature Credit_History must be a categorical feature as it only holds values 1 & 0
## Loan_Amount_Term can also be labeled as a categorical feature
## CoapplicantIncome & ApplicantIncome are highly dispersed features

# Univariate Analysis: Categorical Features

In [None]:
# Count Plots
# One frequency bar chart per categorical feature.
# The figure is created *before* each plot so every chart owns its figure and no empty
# figure is left behind after the last column. The series is passed as `x=` because the
# meaning of the first positional argument of sns.countplot differs between seaborn releases.
for col in categorical_columns:
  plt.figure()
  sns.countplot(x=train_data[col])

In [None]:
## The data is biased/imbalanced towards males, and most of the loans present in the dataset have been approved
## Loan_ID is an unnecessary feature that can be removed

In [None]:
# Converting Loan_Amount_Term & Credit_History to categorical features
# (cast to object dtype, which is what the dtype-based column split keys on).
for feature in ('Loan_Amount_Term', 'Credit_History'):
  train_data[feature] = train_data[feature].astype(object)

In [None]:
# Updating numerical and categorical columns
# Recompute both groups from the current dtypes: 'object' columns are categorical,
# everything else is numerical.
numerical_columns = train_data.columns[(train_data.dtypes != 'object').to_numpy()]
categorical_columns = train_data.columns[(train_data.dtypes == 'object').to_numpy()]

In [None]:
# Removing Loan_ID from categorical features
# errors='ignore' keeps this cell idempotent: running it again after Loan_ID has
# already been removed is a no-op instead of raising KeyError.
categorical_columns = categorical_columns.drop('Loan_ID', errors='ignore')

# Bi-Variate Analysis: Continuous vs. Continuous 

In [None]:
def srt_reg(df):
    """Draw a regression scatter plot for every ordered pair of distinct numerical features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns named in the module-level ``numerical_columns``.

    Notes
    -----
    Each pair is plotted in both orientations (x vs. y and y vs. x). A new figure is
    opened before every plot so each chart owns its figure and no empty figure is
    left over after the last pair. Only ``df`` is plotted; the function no longer
    reads the global ``train_data``.
    """
    for x_col in numerical_columns:
        for y_col in numerical_columns[::-1]:
            # A feature plotted against itself carries no information.
            if x_col == y_col:
                continue
            plt.figure()
            sns.regplot(x=x_col, y=y_col, data=df, color='#e74c3c', line_kws={'color': 'black'}, scatter_kws={'alpha':0.4})

srt_reg(train_data)

In [None]:
## LoanAmount & ApplicantIncome follow a linear relationship
## Most of the CoapplicantIncome is 0, which must be handled as it makes our data biased, otherwise it is linearly related to LoanAmount

In [None]:
# Count applicants without any co-applicant income. Comparing against 0 directly avoids
# the label lookup value_counts()[0], which raises KeyError when no zero is present,
# and the count is computed once instead of twice.
zero_coapplicant_count = (train_data.CoapplicantIncome == 0).sum()
print("Number of rows with 0 CoapplicantIncome: ", zero_coapplicant_count)
# The percentage is relative to the non-null CoapplicantIncome values (count()).
print("Percentage:", (zero_coapplicant_count/train_data.CoapplicantIncome.count()*100))

In [None]:
# A new feature Has_CoapplicantIncome can be useful

In [None]:
# Segregating Nominal and Ordinal Columns
# nominal_columns lists the features treated as unordered categories (it also contains
# the Loan_Status target); ordinal_columns lists the features treated as ordered.
nominal_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'Credit_History']
ordinal_columns = ['Dependents', 'Loan_Amount_Term']

# Bi-Variate Analysis: Categorical vs. Categorical

In [None]:
# Cross Tabulation between Loan_Status and other categorical features
def cross_tab(col, df):
  """Print a row-normalised cross-tabulation of every object-dtype column of ``df`` against ``col``.

  Parameters
  ----------
  col : str
      Name of the column used as the crosstab's columns (e.g. the target).
  df : pandas.DataFrame
      Frame to analyse. Both the column selection and the data come from this
      frame; the function does not read any global frame.

  Each table shows, per level of the other column, the share of rows that fall in
  each level of ``col`` (``normalize='index'``), followed by a separator line.
  """
  object_columns = df.dtypes[df.dtypes == 'object'].index
  for other in object_columns:
    # Skip the column being compared against itself.
    if other == col:
      continue
    print(pd.crosstab(index = df[other], columns = df[col], normalize='index'))
    print("----"*30)

# Print the cross-tabulations against Loan_Status for the training data.
cross_tab('Loan_Status', train_data)

In [None]:
## Married applicants have greater chances for loan approval
## Graduated applicants have greater approval chances
## Applicants with credit history have greater approval chances
## SemiUrban property holders have greater approval chances
## Applicants with 2 Dependents have high chances for approval

In [None]:
# Performing Chi-Square Test between Loan_Status and other features
def chi_sq(col, df, alpha=0.05):
  """Run a chi-square test of independence between ``col`` and every other object-dtype column of ``df``.

  Parameters
  ----------
  col : str
      Name of the column every other categorical column is tested against.
  df : pandas.DataFrame
      Frame to analyse. Both the column selection and the data come from this
      frame; the function does not read any global frame.
  alpha : float, default 0.05
      Significance level. When the p-value exceeds it, the pair is reported
      as not correlated.

  For each pair the p-value is printed, followed by the "not correlated" message
  when applicable and a separator line.
  """
  object_columns = df.dtypes[df.dtypes == 'object'].index
  for other in object_columns:
    # Skip the column being tested against itself.
    if other == col:
      continue
    contingency = pd.crosstab(index = df[other], columns = df[col])
    # chi2_contingency returns (statistic, p-value, dof, expected); only the p-value is used.
    p_value = chi2_contingency(contingency)[1]
    print("p - value for test between ", col, " and ", other, " is:", p_value)
    if p_value > alpha:
      print(col, " and ", other, " are not correlated.")
    print("----"*30)

# Run the chi-square tests against Loan_Status for the training data.
chi_sq('Loan_Status', train_data)

In [None]:
## Loan_Status is not correlated with features such as Self_Employed, Loan_Amount_Term, Dependents and Gender

# Bi-Variate Analysis: Continuous vs. Categorical

In [None]:
# Box Plots
def srt_box(df):
    """Draw a box plot of every numerical feature grouped by every categorical feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns named in the module-level
        ``numerical_columns`` and ``categorical_columns``.

    Notes
    -----
    Exactly one 10x8 figure is opened per plot. The extra ``plt.figure()`` that
    used to follow each plot is gone, so no empty figures are produced.
    """
    for num_col in numerical_columns:
        for cat_col in categorical_columns:
            plt.figure(figsize=(10,8))
            sns.boxplot(x=cat_col, y=num_col, data=df)


srt_box(train_data)

In [None]:
## There are some male applicants which have high ApplicantIncome, which may be classified as outliers
## Graduate Applicants have high ApplicantIncome
## Self Employed applicants also have higher income
## Some Female & Male co-applicants have exceptionally high CoapplicantIncome, therefore might be outliers
## Male applicants have greater LoanAmount
## Married applicants also have greater LoanAmount than Unmarried applicants
## LoanAmount has a roughly linear relationship with the number of Dependents: as the number of dependents increases, the loan amount increases
## Graduate Applicants have high LoanAmount

# Handling Missing Values: Categorical Features


In [None]:
# Null-value count and percentage for each categorical feature, worst columns first.
null_value_train = (
    train_data[list(categorical_columns)]
    .isnull()
    .sum()
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_value_train['Percentage of Null Values'] = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train.sort_values(by = 'Percentage of Null Values', ascending = False).head(10)

In [None]:
# Categorical features whose missing values are replaced by the most frequent value.
columns_fillnone = ['Credit_History', 'Self_Employed', 'Dependents', 'Loan_Amount_Term', 'Gender', 'Married']

for col in columns_fillnone:
    # mode() returns the most frequent value(s); the first entry is used as the fill value.
    most_frequent = train_data[col].mode().iloc[0]
    train_data[col] = train_data[col].fillna(most_frequent)

In [None]:
# Re-check the categorical features: null-value count and percentage, worst columns first.
null_value_train = (
    train_data[list(categorical_columns)]
    .isnull()
    .sum()
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_value_train['Percentage of Null Values'] = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train.sort_values(by = 'Percentage of Null Values', ascending = False).head()

# Handling Missing Values: Numerical Features

In [None]:
# Null-value count and percentage for each numerical feature, worst columns first.
null_value_train = (
    train_data[list(numerical_columns)]
    .isnull()
    .sum()
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_value_train['Percentage of Null Values'] = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train.sort_values(by = 'Percentage of Null Values', ascending = False).head()

In [None]:
# Null Values in LoanAmount
## Observations from the bivariate analysis:
## - LoanAmount increases with ApplicantIncome
## - LoanAmount increases with the number of dependents
## - Married applicants have higher LoanAmount
## - Male applicants have higher LoanAmount
## - Graduate applicants have higher LoanAmount
# Keep the rows whose LoanAmount is missing for inspection.
null_loanamt = train_data.loc[train_data['LoanAmount'].isna()]

In [None]:
# ChiSquared for LoanAmount
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categorical series.

    Parameters
    ----------
    x, y : array-like (e.g. pandas.Series)
        Observations to cross-tabulate; every distinct value is one category.

    Returns
    -------
    float
        Association strength in [0, 1]. ``0.0`` is returned when the statistic
        is undefined: fewer than two observations, a variable with a single
        level, or as many levels as observations (corrected denominator <= 0).
        Previously these cases produced nan/inf through a division by zero.

    Notes
    -----
    Uses the module-level ``scipy.stats`` alias ``ss``. ``chi2_contingency`` is
    called with its default settings.
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    # Degenerate tables carry no measurable association.
    if n < 2 or min(r, k) < 2:
        return 0.0
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr/denominator)



# Print the Cramér's V score of LoanAmount against every categorical feature and then
# against every numerical feature.
# NOTE(review): the score treats each distinct value as a category, so results for
# continuous columns (including LoanAmount itself) should be read with caution.
loan_amount = train_data['LoanAmount']

for i in categorical_columns:
  print(i, ":")
  print(cramers_v(loan_amount, train_data[i]))
  print("---"*15)

print("***"*30)

for i in numerical_columns:
  print(i, ":")
  print(cramers_v(loan_amount, train_data[i]))
  print("---"*15)

In [None]:
## LoanAmount is correlated strongly with Credit_History, Dependents, Applicant Income and Property_Area

In [None]:
# Impute missing LoanAmount values with the median LoanAmount of the applicants that
# share the same Dependents value.
# The group medians are computed as a row-aligned series and used only to fill the
# gaps, so a row whose Dependents key is missing keeps its recorded LoanAmount
# instead of being overwritten by the NaN that a group-wise transform yields for it.
dependents_median = train_data.groupby('Dependents')['LoanAmount'].transform('median')
train_data['LoanAmount'] = train_data['LoanAmount'].fillna(dependents_median)

In [None]:
# Re-check the numerical features: null-value count and percentage, worst columns first.
null_value_train = (
    train_data[list(numerical_columns)]
    .isnull()
    .sum()
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_value_train['Percentage of Null Values'] = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train.sort_values(by = 'Percentage of Null Values', ascending = False).head()