In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'darkgrid' style to the plots drawn after this point.
sns.set_style('darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier

Many people struggle to get loans due to insufficient or non-existent credit histories. And, unfortunately, this population is often taken advantage of by untrustworthy lenders.

Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.

Let's load the dataset

In [None]:
# Read the training application table and report its dimensions.
train_df = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
train_rows, train_cols = train_df.shape
print('Training data contains {} rows and {} columns '.format(train_rows, train_cols))

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Read the test application table and report its dimensions.
test_df = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
test_rows, test_cols = test_df.shape
print('Test data contains {} rows and {} columns '.format(test_rows, test_cols))

In [None]:
# Class balance of the TARGET column, one bar colour per class.
px.histogram(
    data_frame=train_df,
    x='TARGET',
    color='TARGET',
    title='Repaid vs. No Repaid',
)

0 --> Individuals who paid their loan

1 --> Individuals who defaulted on their loan
 
There are far more loans that were repaid on time than loans that were not repaid



In [None]:
def missing_values(df):
    """Summarize the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name and sorted by missing percentage (descending). Columns are
        'Missing Values' (count) and '%(Percentage) Missing Values' (rounded
        to one decimal place).

    Also prints how many columns have missing values.
    """
    n_miss_val = df.isnull().sum()
    n_miss_per = 100 * n_miss_val / len(df)
    miss_tbl = pd.concat([n_miss_val, n_miss_per], axis=1).sort_values(1, ascending=False).round(1)
    # Filter on the raw count (column 0), not the rounded percentage: a column
    # with only a handful of missing rows rounds to 0.0% and would otherwise be
    # silently left out of the table and of the printed count.
    miss_tbl = miss_tbl[miss_tbl[0] != 0]

    miss_tbl = miss_tbl.rename(columns={0: 'Missing Values', 1: '%(Percentage) Missing Values'})
    print("{} columns that have missing values.".format(miss_tbl.shape[0]))

    return miss_tbl
    
    

In [None]:
# Build the missing-value summary for the training table and display it.
missing_values_table = missing_values(train_df)
missing_values_table

Most machine learning models can't work with missing numerical data, so we will have to fill in these missing values, a process known as imputation. The data contains many missing values. Before imputing, let's examine our categorical and numerical columns.

In [None]:
def categorical_features(df):
    """Tabulate the object-dtype columns of ``df``.

    Prints how many object-dtype columns were found and returns a DataFrame
    indexed by column name with 'Unique Values' (distinct non-null values)
    and 'Data Types', sorted by 'Unique Values' in descending order.
    """
    object_frame = df.select_dtypes(include=["object"])
    summary = pd.concat([object_frame.nunique(), object_frame.dtypes], axis=1)
    summary = summary.sort_values(0, ascending=False)
    summary = summary.rename(columns={0: 'Unique Values', 1: 'Data Types'})
    print("The dataset contains {} categorical values".format(object_frame.shape[1]))

    return summary

In [None]:
# Display the summary of object-dtype columns in the training table.
categorical_features(train_df)

Let's visualize categorical columns

In [None]:
# Draw one count plot per object-dtype column, split by TARGET, on a
# two-column grid of axes.
categorical_cols = train_df.select_dtypes(include=["object"]).columns.tolist()
n_grid_cols = 2
# Derive the row count from the number of columns (ceiling division, at least
# one row) so the loop never indexes past the available axes. The height
# scales at 6.25 per row, which gives the previous 30x50 figure for 8 rows.
n_grid_rows = max(1, -(-len(categorical_cols) // n_grid_cols))
fig, axs = plt.subplots(ncols=n_grid_cols, nrows=n_grid_rows, figsize=(30, 6.25 * n_grid_rows))
axs = axs.flatten()
for index, cols in enumerate(categorical_cols):
    g = sns.countplot(x=cols, hue='TARGET', data=train_df, ax=axs[index], palette="spring")

In [None]:
def numerical_features(df):
    """Tabulate the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by numeric column name, with a single 'Data Types' column.

    Also prints the number of numeric columns found.
    """
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    # Read the dtypes from the frame that was passed in, not from a global,
    # so the function works for any DataFrame.
    numerical_tbl = pd.DataFrame(df[numerical_cols].dtypes).rename(columns={0: 'Data Types'})
    print("The dataset contains {} numerical values.(included target value)".format(len(numerical_cols)))

    return numerical_tbl

In [None]:
# Display the summary of numeric columns in the training table.
numerical_features(train_df)

Let's convert the categorical columns to numeric values. We will use `Label Encoding` for any categorical variable with only 2 categories and `One-Hot Encoding` for any categorical variable with more than 2 categories. For `One-Hot Encoding` we will use pandas `get_dummies`, which converts categorical variables into dummy/indicator variables.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode, in both tables, every object column that has at most two
# distinct values in the training data.
le = LabelEncoder()

categorical_cols = train_df.select_dtypes(include=["object"])
for col in categorical_cols:
    # unique() counts NaN as a value, so a column with two labels plus missing
    # entries is skipped here and left for one-hot encoding.
    if len(categorical_cols[col].unique()) <= 2:
        # Fit once, on the training column only. Fitting a second time on the
        # test column would discard the first fit and make the mapping depend
        # on the test data.
        le.fit(train_df[col])
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

One-Hot Encoding for other categorical variables with [get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

In [None]:
# One-hot encode the remaining object columns. Each table is encoded on its
# own, so the resulting column sets are not guaranteed to match.
train_df, test_df = pd.get_dummies(train_df), pd.get_dummies(test_df)

for caption, frame_shape in (('Train data : ', train_df.shape), ('Test data : ', test_df.shape)):
    print(caption, frame_shape)

 
#### Correlations
One way to try and understand the data is by looking for `correlations` between the features and the target.

Let's examine the columns that have a high correlation with the target value.

In [None]:
# Twenty most positive correlations with TARGET (the first row is TARGET itself).
corrs = train_df.corr()['TARGET'].sort_values(ascending=False)
corr_top_df = corrs.head(20).to_frame()
corr_top_df

In [None]:
# Twenty most negative correlations with TARGET: the last 20 entries of the
# descending sort.
# NOTE(review): sort_values places NaN last, so columns with an undefined
# correlation would show up here - confirm none exist.
corrs = train_df.corr()['TARGET'].sort_values(ascending=False)
corr_top_df = corrs.iloc[-20:].to_frame()
corr_top_df

`DAYS_BIRTH` has the most positive correlation with `TARGET`.

`EXT_SOURCE_3`, `EXT_SOURCE_2` and `EXT_SOURCE_1` have the most negative correlations with `TARGET`.

We can examine these values in more detail. Before examining them, let's take a look at collinear variables.

A [collinearity](https://medium.com/future-vision/collinearity-what-it-means-why-its-bad-and-how-does-it-affect-other-models-94e1db984168) is a special case when two or more variables are exactly correlated.
This means the regression coefficients are not uniquely determined. In turn it hurts the interpretability of the model as then the regression coefficients are not unique and have influences from other features.
Since we don't want this to happen, we will remove one of the highly correlated values.

In [None]:
    # Build a long table of feature pairs with their absolute correlation and
    # keep the pairs whose correlation exceeds 0.75.
    corr_matrx = train_df.corr()
    # Keep only the strict upper triangle so each pair appears once and the
    # diagonal is excluded. The builtin `bool` is used because the `np.bool`
    # alias was removed from NumPy.
    upper_mask = np.triu(np.ones(corr_matrx.shape, dtype=bool), k=1)
    corr_df = corr_matrx.where(upper_mask)
    corr_df = corr_df.unstack().reset_index()
    corr_df.columns = ['Feature_1', 'Feature_2', 'Correlation']
    corr_df = corr_df.dropna(subset=['Correlation'])
    corr_df['Correlation'] = corr_df['Correlation'].round(3).abs()
    mtrx = corr_df.sort_values(by='Correlation', ascending=False)
    mx_corr = mtrx[mtrx['Correlation'] > 0.75]
    print('{} highly correlated features were found.'.format(mx_corr.shape[0]))
    # Cap the sample size so the preview also works with fewer than 10 pairs,
    # and seed it so the displayed rows are reproducible.
    mx_corr.sample(n=min(10, len(mx_corr)), random_state=0)


We can visualize the features that are highly correlated with each other in a simple way with a heatmap.

In [None]:
# Heatmap of the correlation matrix showing only cells >= 0.75; every other
# cell is replaced by NaN before plotting.
corr = train_df.corr()
kot = corr.where(corr >= .75)
plt.figure(figsize=(50, 50))
sns.heatmap(kot, cmap="Greens");

For each of these pairs of highly correlated variables, we will remove one of the variables. We found 127 highly correlated feature pairs above. So let's remove the unique `Feature_1` columns of those pairs from the train and test datasets.

In [None]:
# Distinct 'Feature_1' names from the high-correlation pair table computed
# earlier, in order of first appearance.
colls = mx_corr['Feature_1'].drop_duplicates().to_numpy()

In [None]:
# Drop the columns listed in `colls` from both tables.
# Labels are passed directly through `columns=`. errors='ignore' skips labels
# a table does not have (e.g. a dummy column present only in train) and lets
# the cell be re-run without a KeyError.
test_df = test_df.drop(columns=colls, errors='ignore')
train_df = train_df.drop(columns=colls, errors='ignore')
print('Training Features shape:', train_df.shape)
print('Testing Features shape:', test_df.shape)

After visualizing the features that have high correlations, let's move on to training the model.

In [None]:
# Distribution of each EXT_SOURCE feature, one panel per feature.
f, ax = plt.subplots(ncols=3, figsize=(10, 5))

# `distplot` is deprecated in seaborn. `histplot` with stat="density" and
# kde=True draws the equivalent normalized histogram with a density curve.
for panel, (column, colour) in zip(ax, (('EXT_SOURCE_1', "b"), ('EXT_SOURCE_2', "g"), ('EXT_SOURCE_3', "r"))):
    sns.histplot(train_df[column], kde=True, stat="density", color=colour, ax=panel)

`DAYS_BIRTH` is expressed in days; dividing it by 365 (and taking the absolute value) converts it to years, so we can analyze repayment by client age.

In [None]:

# Histogram of client age in years (|DAYS_BIRTH| / 365), split by TARGET.
age_years = (train_df['DAYS_BIRTH'] / 365).abs()
age_ax = sns.histplot(data=train_df, x=age_years, hue="TARGET", color="g", kde=True)
age_ax.set_title('Age of Client')
age_ax.set_xlabel('Ages');

In [None]:
# Interactive version of the age histogram, coloured by TARGET.
age_in_years = train_df['DAYS_BIRTH'].div(365).abs()
fig = px.histogram(train_df, x=age_in_years, color="TARGET")
fig.show()

Clients aged roughly 25-30 appear to be the group most likely to miss payments. As age increases, the non-payment problem diminishes.

Let's model the dataset after examining the correlations with the target. Choosing features that are highly correlated with the target also helps our model make better predictions.

In [None]:
# Correlation of every column with TARGET.
target_corr = train_df.corr()['TARGET']
# Despite its name, `top_features` holds the columns to discard: those whose
# correlation lies strictly between -0.05 and 0.05. Columns with a NaN
# correlation fail the comparison and are therefore kept.
top_features = target_corr[target_corr.abs() < 0.05]
train_df.drop(columns=top_features.index, inplace=True)
print('Training Features shape:', train_df.shape)

In [None]:
# Correlation of each remaining column with TARGET, sorted ascending.
corrs = train_df.corr()['TARGET'].sort_values()
# Create a new 20x15 figure; the pyplot calls below act on it as the current figure.
plt.subplots(figsize=(20,15))
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
# Line plot with column names on the x axis; the trailing ';' hides the repr.
plt.plot(corrs);

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix once and reuse it for both the mask and the
# heatmap instead of recomputing it.
corr_matrix = train_df.corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is
# drawn only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 10))

# Generate a custom diverging colormap
cmap = sns.color_palette("Spectral", as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True);

Before we train our model, let's handle the `NaN`/null values. We will impute them with `SimpleImputer`.

The SimpleImputer class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the missing values are located. This class also allows for different missing values encodings.[(more detail)](https://scikit-learn.org/stable/modules/impute.html#impute)

In [None]:
# Names of the columns that contain at least one missing value.
# `isnull()` returns a frame with the same columns as the input, so its
# `.columns` would list every column; `.any()` reduces it to one flag per column.
nan_cols = train_df.columns[train_df.isnull().any()].tolist()
nan_cols

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing values in the selected columns with each column's mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Skip the step when there is nothing to impute: fitting on an empty column
# selection raises an error.
if len(nan_cols) > 0:
    train_df[nan_cols] = imp.fit_transform(train_df[nan_cols])

In [None]:
# Separate the label column from the feature columns as NumPy arrays.
y = train_df['TARGET'].to_numpy()
X = train_df.drop(columns='TARGET').to_numpy()
print('Training Features shape:', X.shape)

In [None]:
# Hold out 33% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
for caption, part in (
    ('X_train Features shape:', X_train),
    ('X_test Features shape:', X_test),
    ('y_train Features shape:', y_train),
    ('y_test Features shape:', y_test),
):
    print(caption, part.shape)

In [None]:

def predict(chosed_model,name="Model"):
    """Fit a classifier on the notebook's training split and print its scores.

    The model is fitted on the module-level ``X_train``/``y_train`` and scored
    on ``X_test``/``y_test``. Prints the ROC AUC (from the second column of
    ``predict_proba``) and the weighted-average recall (from ``predict``).

    Parameters
    ----------
    chosed_model : estimator exposing ``fit``, ``predict_proba`` and ``predict``
    name : str
        Label used in the printed header.

    Returns
    -------
    None
    """
    fitted = chosed_model.fit(X_train, y_train)
    positive_scores = fitted.predict_proba(X_test)[:, 1]
    hard_labels = fitted.predict(X_test)

    print("Performances with {}".format(name))
    auc_test = roc_auc_score(y_test, positive_scores)
    print("AUC Performance: ", auc_test)
    recall_test = recall_score(y_test, hard_labels, average='weighted')
    print("Recall Performance: ", recall_test)

In [None]:
# Seed the tree so its scores are reproducible across runs, matching the
# seeded split and the seeded random forest.
predict(DecisionTreeClassifier(criterion='gini', random_state=0), 'DecisionTreeClassifier')

In [None]:
# Shallow random forest (depth 4) with a fixed seed.
rf_model = RandomForestClassifier(max_depth=4, random_state=0)
predict(rf_model, name='RandomForestClassifier')