In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set_theme()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

![](https://www.economist.com/img/b/1280/720/90/sites/default/files/images/print-edition/20190706_IRD001_0.jpg)

Many people struggle to get loans due to insufficient or non-existent credit histories. And, unfortunately, this population is often taken advantage of by untrustworthy lenders.

So can we use advanced data analysis techniques, together with modern machine learning algorithms, to predict clients' repayment abilities?

I think this is an application of the concept of "AI for good": with a good model for predicting clients' repayment abilities, we could reduce the number of people who take a loan they cannot repay (and who may end up in jail as a result), and we could also direct loan money to the people who deserve it.

# Development Cycle

We will follow the agile data science development life cycle throughout our problem.

![](https://i.pinimg.com/originals/8c/d7/5f/8cd75ffb7c2524686bb9c342ff8490c7.png)

# 1- Business understanding

Cost of risk is one of the biggest components in banks’ cost structure. Thus, even a slight improvement in credit risk modelling can translate in huge savings. That’s why Machine Learning is often implemented in this area

# 2- Data Mining

In a typical industry scenario, data mining includes, for example, fetching data from customer databases, web scraping, etc.
Fortunately, in our problem the data has already been collected, so let's move on to the next step.

# 3- Data Cleaning and Transformation

In [None]:
# Load the application training table from the competition input folder
train_csv = '../input/home-credit-default-risk/application_train.csv'
dftrain = pd.read_csv(train_csv)
print('Training data shape: ', dftrain.shape)
# Show the first rows as the cell output
dftrain.head()

**Tech Insight**

We think that the number of columns (122 features) is very large, which may slow down the EDA process and may also lead to model overfitting during training. So, in the feature engineering stage, we may need to lower the dimensionality of the data before doing EDA and training.

## 3-1 Searching for missing values

In [None]:
# Heatmap of the boolean missing-value mask: one cell per (row, column) entry of dftrain
missing_mask = dftrain.isna()
fig, ax = plt.subplots(figsize=(30, 10))
sns.heatmap(missing_mask, cbar=False, ax=ax)

#### Well! There are too many NaN values in our data
#### How to handle missing data?
#### 1 - Naive methods, like dropping rows with missing values, would shrink our data corpus a lot, and we do not want to lose data
#### 2 - Dropping features with many NaN values still introduces issues, as the dropped features may have had a high impact on model performance
#### 3 - Our approach is to fit an advanced tree-based classifier like xgboost, which has its own strategy for handling missing values. We just want to see how important the features with a high NaN rate are; if they are very important, we will try to use advanced imputation techniques

## 3-2 Focus on important features

In [None]:
# One-hot encode the categorical features of the training frame
dftrainenc = pd.get_dummies(data=dftrain)
# Inspect a random handful of encoded rows
dftrainenc.sample(n=10)

In [None]:
# Fit XGBoost on the encoded training data to rank features by importance
from xgboost import XGBClassifier

# Features are every column after the first two.
# NOTE(review): this presumes the first two columns are the row id and TARGET — confirm against the CSV header.
X = dftrainenc.iloc[:, 2:]
y = dftrainenc['TARGET']
# Guard against label leakage: the positional slice above is the only thing
# keeping the label out of the feature matrix, so fail loudly if it did not.
assert 'TARGET' not in X.columns, "TARGET leaked into the feature matrix"

# Fit the model on the training data
model = XGBClassifier()
model.fit(X, y)

# Raw importance scores, one per column of X
print(model.feature_importances_)

In [None]:
# Label every importance score with its feature name, then chart the 50 largest
feat_imp = pd.Series(data=model.feature_importances_, index=X.columns)
top50 = feat_imp.nlargest(n=50)
top50.plot.barh(figsize=(8, 10))

#### Fortunately, the features with the most NaNs are not among the most important features, except for the external source features. So our approach (in the feature engineering section) is to drop the heavily NaN-filled features, except the external source features, because xgboost will handle their missing values automatically during training

## 4- Data Exploration (EDA)

### 4-1 what is the relation between target and the applicant's number of children

In [None]:
import plotly.graph_objects as go

# Count the non-null TARGET entries (i.e. applications) for each number of children
chtarget = dftrainenc.groupby(['CNT_CHILDREN'])['TARGET'].count().to_frame()
children_bar = go.Bar(x=chtarget.index, y=chtarget['TARGET'])
fig = go.Figure([children_bar])
fig.update_layout(
    title_text='relation between target and applicant number of childrens',
    xaxis_title="Applicant children",
    yaxis_title="Number of applications",
)
fig.show()

##### insight : applicants with no children (single people, or married couples without children) dominate the applications pool. This insight may help marketing teams, who may need to target this segment of customers

### 4-2 what is the relation between target and applicant Income

In [None]:
# Mean total income of the applicants within each TARGET class
inctarget = dftrainenc.groupby(['TARGET'])['AMT_INCOME_TOTAL'].mean().to_frame()
income_bar = go.Bar(x=inctarget.index, y=inctarget['AMT_INCOME_TOTAL'])
fig = go.Figure([income_bar])
fig.update_layout(
    title_text='relation between target and applicant income',
    xaxis_title="Target",
    yaxis_title="applicants mean income per category in target",
)
fig.show()

##### insight : this graph shows how critical our problem is. Note that the ratio between target 1 and target 0 is nearly 1/11, so for every applicant who has difficulties repaying the loan, about 11 applicants repaid it. The graph also shows that there is hardly any income gap between the two groups: the average income of the minority class (target 1) nearly reaches the average income of the majority class (target 0). The graph therefore clearly guides admins not to put too much trust in applicants just because they have a high income

### 4-3 what is the relation between target and applicant Income type

In [None]:
# Cross-tabulate income type (rows) against TARGET (columns)
inctype = pd.crosstab(index=dftrain['NAME_INCOME_TYPE'], columns=dftrain['TARGET'])

# One bar trace per target class, added in the same order: class 1 first, then class 0
fig = go.Figure()
fig.add_trace(go.Bar(name='1 : has issues', x=inctype.index, y=inctype[1]))
fig.add_trace(go.Bar(name='0 : no issues', x=inctype.index, y=inctype[0]))

# Stack the two traces on top of each other
fig.update_layout(
    barmode='stack',
    title_text='relation between target and applicant Income type',
)
fig.show()

note : the unemployed, student and maternity leave categories are not zero; you need to zoom into the graph to see their counts

### 4-4 what is the relation between target and applicant occupation

In [None]:
# Cross-tabulate occupation type (rows) against TARGET (columns)
occtype = pd.crosstab(dftrain.OCCUPATION_TYPE, dftrain.TARGET)
fig = go.Figure(data=[
    go.Bar(name='1 : has issues', x=occtype.index, y=occtype[1]),
    go.Bar(name='0 : no issues', x=occtype.index, y=occtype[0])
])
# Stack the bars; the title now names occupation (it previously said
# "Income type", copied over from the income-type chart).
fig.update_layout(barmode='stack', title_text='relation between target and applicant occupation')
fig.show()

#### insight : it is logical that Laborers have the highest number of applications, possibly due to their unstable working environment; a good insight, however, is that they have a very high potential of repaying the loan

## 5- Feature Engineering

### 5-1 feature selection

In the previous section we introduced the concept of feature importance to rank the important features. In this section, as an experiment, we will keep the top 50 most important features, then try to remove noisy samples or outliers.

In [None]:
# Keep only the 50 highest-ranked features.
# The ranking is computed once up front instead of once per column inside the loop.
top_features = set(feat_imp.nlargest(50).index)

# Every column that is not one of the top-50 feature names is dropped;
# column order of the survivors is unchanged.
unwanted = [col for col in dftrainenc.columns if col not in top_features]

# Reassign instead of mutating in place so the step reads as an explicit transform
dftrainenc = dftrainenc.drop(columns=unwanted)
dftrainenc.sample(15)

In [None]:
# Re-attach the label column to the reduced feature frame (column-wise concat on the row index)
frames = [dftrainenc, dftrain['TARGET']]
finaldf = pd.concat(objs=frames, axis=1)

### 5-2 trying to tackle the imbalanced classes problem

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Select features by name rather than by position, so the label can never
# slip into X if the number or order of columns changes.
X = finaldf.drop(columns='TARGET')
y = finaldf['TARGET']

# Hold out 30% for testing. Stratify so both splits keep the same class ratio,
# and fix the seed so the split is reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Mean-impute missing values. The imputer is fitted on the training split only
# (no information from the test split is used) and then applied to BOTH splits,
# so the model is evaluated on data prepared the same way it was trained on.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
xtrain = pd.DataFrame(imputer.fit_transform(xtrain), columns=X.columns, index=xtrain.index)
xtest = pd.DataFrame(imputer.transform(xtest), columns=X.columns, index=xtest.index)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Both resamplers are stochastic; a fixed seed makes the resampled training
# set (and everything trained on it) reproducible across runs.
RESAMPLE_SEED = 42

# Define the pipeline:
# 1) SMOTE synthetically oversamples the minority class up to a 0.1 minority/majority ratio
# 2) the majority class is then randomly undersampled until the ratio reaches 0.5
over = SMOTE(sampling_strategy=0.1, random_state=RESAMPLE_SEED)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=RESAMPLE_SEED)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Transform the training data only
xtrain, ytrain = pipeline.fit_resample(xtrain, ytrain)

## 6- Predictive Modeling

In [None]:
from xgboost import XGBClassifier

# Train the final classifier with 400 estimators on the current training split
xgb_params = {'n_estimators': 400}
model = XGBClassifier(**xgb_params)
model.fit(xtrain, ytrain)

## 7- Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

predictions = model.predict(xtest)  # test model against test set
preds_train = model.predict(xtrain)  # predictions on the data the model was fitted on

# Report training accuracy next to test accuracy so over/underfitting can actually be judged
# (the training predictions were previously computed but never reported).
print("Model accuracy in training = {}".format(accuracy_score(ytrain, preds_train)))
print("Model Acurracy in testing = {}".format(accuracy_score(ytest, predictions))) # print test accuracy

# Accuracy can look high on imbalanced labels even when the minority class is
# rarely predicted, so also report ROC AUC from the predicted probability of class 1.
test_scores = model.predict_proba(xtest)[:, 1]
print("Model ROC AUC in testing = {}".format(roc_auc_score(ytest, test_scores)))

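#### The accuracy score shows good performance on the training and test sets, so we think the model is a good fit with no overfitting or underfitting. We think an automated hyperparameter search may help boost accuracy further. Note, however, that with roughly 11 repaid loans for every loan with repayment difficulties, accuracy alone can look high even for a model that rarely predicts the minority class, so a metric such as ROC AUC should be checked as well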