In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loan Defaulter Analysis

In this notebook I analyze the data and build a model to predict whether a loan application will be accepted. The central question is: **"How can we know whether a person's loan application will be accepted or not?"**

In [None]:
## visualization 
import matplotlib.pyplot as plt
import seaborn as sns

## preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

## testing 
from scipy import stats

## model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

##pipeline
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Both CSV files sit in the same Kaggle dataset folder, so build each path from that prefix.
dataset_dir = '../input/loan-prediction-based-on-customer-behavior'
training_path = dataset_dir + '/Training Data.csv'
test_path = dataset_dir + '/Test Data.csv'

## Data Loading

In [None]:
# Read the training CSV, using its 'Id' column as the row index, and preview the first rows.
trainData = pd.read_csv(training_path, index_col='Id')
trainData.head()

In [None]:
# Read the test CSV; note that its identifier column is spelled 'ID' (upper case), unlike the training file.
testData = pd.read_csv(test_path, index_col='ID')
testData.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
trainData.info()

In [None]:
# Summary statistics of the training frame.
trainData.describe()

In [None]:
# Number of missing values per column.
trainData.isnull().sum()

In [None]:
# 'Married/Single' contains a slash and cannot be used with attribute access; give it a plain name.
trainData = trainData.rename(columns={'Married/Single': 'MaritalStatus'})
trainData.head()

## Exploratory Data Analysis

In [None]:
trainData.STATE.value_counts()

In [None]:
label = trainData.STATE.value_counts().index
count = trainData.STATE.value_counts().values

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(count, labels=label, autopct='%1.1f%%', shadow=True)
ax.axis('equal')
plt.show()


In [None]:
# Count how many rows fall under each CITY value.
trainData.CITY.value_counts()

In [None]:
# Count how many rows fall under each Profession value.
trainData.Profession.value_counts()

In [None]:
ax = sns.countplot(x=trainData.MaritalStatus, palette='muted')
ax.bar_label(container=ax.containers[0])

In [None]:
ax=sns.countplot(x=trainData.House_Ownership, palette='Set2')
ax.bar_label(container=ax.containers[0])

In [None]:
ax = sns.countplot(x=trainData.Car_Ownership, palette='Set2')
ax.bar_label(container=ax.containers[0], padding=-15)

In [None]:
ax=sns.countplot(x=trainData.Risk_Flag, palette='Set2')
ax.bar_label(container=ax.containers[0], padding=-15)

In [None]:
sns.histplot(data=trainData.Age, color='midnightblue')

In [None]:
sns.boxplot(y=trainData.Age)

In [None]:
sns.histplot(data=trainData.Income)

In [None]:
sns.boxplot(y=trainData.Income)

In [None]:
sns.histplot(x=trainData.Experience, palette='Set2')

In [None]:
trainData.Experience.mean()

In [None]:
sns.histplot(x=trainData.CURRENT_JOB_YRS, palette='muted')

In [None]:
trainData.CURRENT_JOB_YRS.mean()

In [None]:
trainData.CURRENT_HOUSE_YRS.value_counts()

In [None]:
sns.histplot(x=trainData.CURRENT_HOUSE_YRS, palette='Set2')

In [None]:
sns.histplot(data=trainData.Income.loc[trainData.Risk_Flag == 0], color='skyblue', label='0')
sns.histplot(data=trainData.Income.loc[trainData.Risk_Flag == 1], color='gold', label='1')


In [None]:
trainData.Income.loc[trainData.Risk_Flag == 1].describe()

In [None]:
sns.histplot(data=trainData.Age.loc[trainData.Risk_Flag == 0], color='skyblue', label='0')
sns.histplot(data=trainData.Age.loc[trainData.Risk_Flag == 1], color='gold', label='1')

In [None]:
trainData.Age.loc[trainData.Risk_Flag == 1].describe()

In [None]:
sns.countplot(x=trainData.MaritalStatus, hue=trainData.Risk_Flag, palette='muted')

In [None]:
ax = sns.countplot(x=trainData.House_Ownership, hue=trainData.Risk_Flag, palette='muted')


In [None]:
sns.countplot(x=trainData.Car_Ownership, hue=trainData.Risk_Flag)

In [None]:
# Contingency table for the chi-square test: rows are Risk_Flag, columns are MaritalStatus.
# pd.crosstab aligns the cells by label, so both rows always list the categories in the same
# column order. The previous chained positional lookups ([0][0], [0][1], ...) depended on the
# per-group frequency order of value_counts and on integer-position fallback for a
# string-labelled Series.
contTable = pd.crosstab(trainData.Risk_Flag, trainData.MaritalStatus)
contTable


In [None]:
# Contingency table for the chi-square test: rows are Risk_Flag, columns are House_Ownership.
# The grouped counts hold 2 risk classes x 3 ownership categories; splitting that flat array
# into 3 pieces produced a 3x2 table whose middle row mixed both risk classes, so the test ran
# on a wrong table. pd.crosstab builds the 2x3 table directly, aligned by label.
contHouseOwner = pd.crosstab(trainData.Risk_Flag, trainData.House_Ownership)
contHouseOwner

In [None]:
# Contingency table for the chi-square test: rows are Risk_Flag, columns are Car_Ownership.
# pd.crosstab aligns cells by label; splitting the flat value_counts array only gave a valid
# table when both risk classes happened to rank the categories in the same frequency order.
contCar = pd.crosstab(trainData.Risk_Flag, trainData.Car_Ownership)
contCar

In [None]:
from scipy.stats import chi2_contingency

def hipotestchi(table, alpha=0.05):
    """Run a chi-square test of independence on a contingency table.

    Parameters
    ----------
    table : array_like
        Observed frequencies; passed unchanged to ``chi2_contingency``.
    alpha : float, default 0.05
        Significance level. Must lie strictly between 0 and 1.

    Returns
    -------
    str
        ``'Reject H0, Dependent'`` when the p-value is below ``alpha``,
        otherwise ``'Fail to reject H0, Independent'``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1, got {!r}'.format(alpha))

    # Only the p-value is needed; the statistic, dof and expected table are discarded.
    _, p, _, _ = chi2_contingency(table)
    if p < alpha:
        return 'Reject H0, Dependent'
    return 'Fail to reject H0, Independent'

In [None]:
# Test each categorical feature against Risk_Flag, in order: marital status, house ownership, car ownership.
for table in (contTable, contHouseOwner, contCar):
    print(hipotestchi(table))

In [None]:
# Numeric columns (plus the target) whose pairwise correlations are of interest.
corr_col = [
    'Income',
    'Age',
    'Experience',
    'CURRENT_JOB_YRS',
    'CURRENT_HOUSE_YRS',
    'Risk_Flag',
]
corrmat = trainData.loc[:, corr_col].corr()

# Open a larger figure first so the annotated cells stay readable.
plt.figure(figsize=(10, 8))
sns.heatmap(data=corrmat, annot=True, linewidth=0.5)

## Data Preparation

In [None]:
# Features kept for modelling, plus the target column.
cols = ['Income', 'Age', 'Experience', 'CURRENT_HOUSE_YRS', 'Car_Ownership', 'House_Ownership', 'MaritalStatus', 'Risk_Flag']

# Take an explicit copy: the following cells assign new values into dataTrain's columns, and
# doing that on a bare column selection of trainData is the chained-assignment pattern that
# pandas reports as SettingWithCopyWarning (hidden here by the blanket warnings filter).
dataTrain = trainData[cols].copy()
dataTrain.head()

In [None]:
dataTrain.Car_Ownership = dataTrain.Car_Ownership.replace({'yes': 1, 'no':0})
dataTrain.head()

In [None]:
dataTrain.MaritalStatus=dataTrain.MaritalStatus.replace({'single':0, 'married':1})

In [None]:
dataTrain.House_Ownership.unique()

In [None]:
dataTrain.House_Ownership = dataTrain.House_Ownership.replace({'rented':0, 'norent_noown':1, 'owned':2})
dataTrain = dataTrain.rename(columns={'CURRENT_HOUSE_YRS':'Chouse_Years'})
dataTrain.head()

In [None]:
y = dataTrain['Risk_Flag']
X = dataTrain.drop('Risk_Flag', axis=1)

In [None]:
from imblearn.over_sampling import SMOTE

oversampler = SMOTE()
X, y = oversampler.fit_resample(X, y)

ax=sns.countplot(x = y, palette = 'muted')
ax.set_title('Dataset after oversampling')

In [None]:
X.head()

In [None]:
temp = X[['Income', 'Age', 'Experience', 'Chouse_Years']]
scaler = MinMaxScaler()
X_std = pd.DataFrame(scaler.fit_transform(temp), columns=['Income', 'Age', 'Experience', 'Chouse_Years'])
X_std.head()

In [None]:
temp2 = X[['Car_Ownership', 'House_Ownership', 'MaritalStatus']]
temp2.index=X_std.index
X_fix = pd.concat([X_std, temp2], axis=1)
X_fix.head()

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(X_fix, y, test_size=0.2, random_state=0)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

## Model Building

In [None]:
logModel = LogisticRegression()
logModel.fit(x_train, y_train)
logPred = logModel.predict(x_valid)
print(classification_report(y_valid, logPred))

In [None]:
params = {'penalty':['l1', 'l2'],
         'solver':['liblinear'],
          'C':np.logspace(-4, 4, 20)
         }

CV = GridSearchCV(logModel, params, scoring='accuracy', cv=5)
CV.fit(x_train, y_train)

print(CV.best_score_)
print(CV.best_params_)

In [None]:
# Random forest with default hyper-parameters. The fixed seed (same value as the train/valid
# split) makes the fitted trees, and therefore every reported metric, reproducible across runs.
rfModel = RandomForestClassifier(random_state=0)
rfModel.fit(x_train, y_train)
rfPred = rfModel.predict(x_valid)
print(classification_report(y_valid, rfPred))

In [None]:
cm = confusion_matrix(y_valid, rfPred)
cmDisplay = ConfusionMatrixDisplay(cm).plot()

In [None]:
print('The accuracy of the trained model {}'.format(accuracy_score(y_valid, rfPred)*100))
print('The ROC AUC score of the trained model {}'.format(roc_auc_score(y_valid, rfPred)*100))

In [None]:
datatest = pd.read_csv(test_path, index_col='ID')
datatest.head()

In [None]:
datatest.rename(columns={'Married/Single':'MaritalStatus', 'CURRENT_HOUSE_YRS':'Chouse_Years'}, inplace=True)
cols = ['Income', 'Age', 'Experience', 'MaritalStatus', 'House_Ownership', 'Car_Ownership', 'Chouse_Years']
dataTest = datatest[cols]
dataTest.head()

In [None]:
dataTest.House_Ownership.unique()

In [None]:
dataTest.MaritalStatus = dataTest.MaritalStatus.replace({'single':0, 'married':1})
dataTest.Car_Ownership = dataTest.Car_Ownership.replace({'no':0, 'yes':1})
dataTest.House_Ownership = dataTest.House_Ownership.replace({'rented':0, 'norent_noown':1, 'owned':2})
dataTest.head()

In [None]:
temp = dataTest[['Income', 'Age', 'Experience', 'Chouse_Years']]
# Reuse the scaler that was fitted on the training data. Fitting a new MinMaxScaler on the
# test set maps the test rows with different min/max values than the model was trained on,
# so the same raw income or age would reach the model as a different number.
# The original row index is kept so predictions can be matched back to their ID.
temp_std = pd.DataFrame(scaler.transform(temp), columns=temp.columns, index=temp.index)

temp_std.head()

In [None]:
temp2 = dataTest[['MaritalStatus', 'House_Ownership', 'Car_Ownership']]

temp2.index = temp_std.index
x_test = pd.concat([temp_std, temp2], axis=1)
x_test.head()

In [None]:
rfTestPred = rfModel.predict(x_test)