In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Libraries

In [None]:
# for data visualization
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
import missingno as msno

# for pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# NOTE: the next import rebinds the name `Pipeline`, shadowing the sklearn class imported above;
# every `Pipeline(...)` call later in this notebook therefore builds an imblearn pipeline.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# for machine learning modelling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix


# for ignoring warnings
import warnings
warnings.filterwarnings("ignore")

### Load Datasets

- After a careful study of all the given files and the goal of the problem, we notice that most files contain information about `previous loans`. If we depend on this information, our model will `struggle to generalize` to new clients with `no previous loan experience or info`, so we will try to make the best use of `only the application info` to build a model that generalizes well.

In [None]:
# training dataset, indexed by the SK_ID_CURR column
train_df = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/application_train.csv',
    index_col='SK_ID_CURR',
)
train_df

In [None]:
# testing dataset, indexed by the SK_ID_CURR column
test_df = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/application_test.csv',
    index_col='SK_ID_CURR',
)
test_df

In [None]:
# report the number of rows and columns of each dataset
for label, frame in (('Training', train_df), ('Testing', test_df)):
    n_rows, n_cols = frame.shape
    print(f'{label} dataset contains {n_rows} records and {n_cols} columns.')

### Exploratory Data Analysis (EDA)

#### Check Missing Values (NaNs) 

In [None]:
# total number of missing cells in each dataset
for label, frame in (('training', train_df), ('Testing', test_df)):
    print(f'Total {label} NaNs = {frame.isnull().sum().sum()}')

In [None]:
# Only columns with NaNs: count and percentage.
# The per-column NaN count is computed once and reused (the frame scan is the expensive part).
nan_counts = train_df.isnull().sum()
nan_counts = nan_counts[nan_counts != 0]

columns = nan_counts.keys()
nans_count = nan_counts.values
nans_percentage = nans_count/train_df.shape[0]

# create a dataframe from the extracted info, sorted from most to least missing
nans_df = pd.DataFrame({'Column':columns, 'No. of NaNs':nans_count, '% of NaNs in Column':nans_percentage*100})
nans_df = nans_df.sort_values(by='% of NaNs in Column', ascending=False)
nans_df

In [None]:
# missingness matrix of the NaN-containing columns plus TARGET, with rows ordered by TARGET
nan_cols_with_target = [*columns, 'TARGET']
msno.matrix(train_df[nan_cols_with_target].sort_values(by='TARGET'))

- The NaNs appear to be randomly distributed with respect to the target, so imputation won't be biased towards a certain target class.
- Columns with a large percentage of NaNs (> 40%) will be dropped.
- Other columns will be imputed according to the column dtype.

#### Check Duplicates

In [None]:
# number of rows that repeat an earlier row exactly
duplicate_mask = train_df.duplicated()
duplicate_mask.sum()

- No duplicates in our dataset

#### Check Target Column

In [None]:
# how many records fall in each TARGET class
target_counts = train_df['TARGET'].value_counts()
print('Count of Each Class\n' + '-'*20)
print(target_counts)

In [None]:
# bar chart of the TARGET class counts
target_ax = train_df['TARGET'].value_counts().plot(kind='bar')
target_ax.set_title('Target Classes Value Counts')
target_ax.set_xlabel('Target')
target_ax.set_ylabel('Count')
plt.show()

- Most clients can repay their loans and only a few have difficulties, so the data is `imbalanced`. In this case we must keep 2 things in mind:
    - the data should `be balanced`
    - `accuracy is not a proper` evaluation metric because it will be misleading, but the `F1-score or ROC AUC score is proper` in this case.

#### Check Columns dtypes

In [None]:
# number of columns per dtype
dtype_counts = train_df.dtypes.value_counts()
dtype_counts

In [None]:
# summarise the object-dtype columns: number of distinct values and the values themselves
## one row per object column with its distinct-value count
object_frame = train_df.select_dtypes('object')
unique_df = object_frame.nunique().reset_index()
unique_df.columns = ['Column','No. of Unique Values']

## attach the array of distinct values found in each column
unique_df['Unique Values'] = unique_df['Column'].map(lambda name: train_df[name].unique())
unique_df.sort_values(by='No. of Unique Values')

In [None]:
# compare the CODE_GENDER value counts of the training and testing datasets
for heading, frame in (('Train Dataset', train_df), ('\nTest Dataset', test_df)):
    print(heading)
    print(frame['CODE_GENDER'].value_counts())

- Records with CODE_GENDER equal to `XNA will be dropped`, as this value exists in only 4 records and does not exist in the testing dataset.
- All object columns are `categorical and nominal`, so the best encoding technique is `OneHotEncoding`, but it will increase our dimensions, so we will `drop the first column` or may `use LabelEncoding`.

### Columns Correlation

In [None]:
# Correlation of every numeric column with TARGET, computed once.
# Object columns are excluded explicitly (recent pandas raises if they reach corr()),
# and TARGET itself is dropped so it does not occupy a slot in the "top 5" list.
target_corr = (
    train_df.select_dtypes(exclude='object')
    .corr()['TARGET']
    .drop('TARGET')
    .sort_values()
)

# Top 5 Columns with Positive Correlation with our TARGET
print('Top 5 Columns with Positive Correlation with TARGET\n', '-'*50)
print(target_corr.tail(5))

# Negative ones
print('\nTop 5 Columns with Negative Correlation with TARGET\n', '-'*50)
print(target_corr.head(5))

- There's no high correlation between our target and any feature.
- The external data source features are among the most important for our target.

#### Detect Outliers

In [None]:
# all non-object columns are treated as numerical
all_numerical_cols = train_df.select_dtypes(exclude='object').columns.tolist()

# "continuous" columns: numerical columns other than TARGET and the FLAG_* indicators
cont_cols = [
    col for col in all_numerical_cols
    if not (col == "TARGET" or col.startswith('FLAG_'))
]
print(f'No. of continuous features = {len(cont_cols)}')

In [None]:
# draw boxplots for each continuous column
# the grid height is derived from the number of columns instead of being hard-coded
n_plot_cols = 5
n_plot_rows = -(-len(cont_cols) // n_plot_cols)   # ceiling division
plt.figure(figsize=(25, 25))
for i, col in enumerate(cont_cols):
    plt.subplot(n_plot_rows, n_plot_cols, i+1)
    sns.boxplot(data=train_df, x=col)
    plt.title(col)

In [None]:
# let's go deeper in these columns
## show only 20 columns at a time; the last chunk simply holds whatever is left,
## so the loop no longer depends on the exact number of continuous columns
chunk_size = 20
for start in range(0, len(cont_cols), chunk_size):
    display(train_df[cont_cols[start:start+chunk_size]].describe())

- Suspicious data:
  - The maximum income of the clients is about 30 times the maximum amount of the loans
  - All days features have negative values
  - The maximum age of a client is 69 years
  - The maximum value in days employed is positive, not negative (a typo?) <br> + it's about 1000 years, how come! + the minimum value is about 49 years! A client worked in the same job for 49 years!
  - A client owns a car that is 91 years old! It was manufactured in 1927! "with respect to the competition year 2018"
  - The minimum number of days before the application since a client changed phone is zero!


- As shown, almost all continuous features have outliers, so we will either `normalize these features` if we use models sensitive to outliers like the Logistic Regression model, or use models that are less impacted by and more robust to outliers like `Tree-Based Models` or a `deep neural network`.

- Features with suspicious data will be analyzed carefully to clean the wrong data.

#### Going Deeper with Suspicious data
- Some points will need a new dataframe to analyze the data better, so the `susp_df` dataframes (`susp_df1`, `susp_df2`, `susp_df3`) are helper dataframes for this purpose.

In [None]:
# 1- The maximum income of the clients is about 30 times the maximum amount of the loans

## clients with total income > 1M, highest income first
high_income_mask = train_df['AMT_INCOME_TOTAL'] > 1e+6
susp_cols = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','CNT_CHILDREN', 'TARGET']
susp_df1 = train_df.loc[high_income_mask, susp_cols].sort_values(by='AMT_INCOME_TOTAL', ascending=False)

## add the Credit/Income and Annuity/Income ratios
susp_df1 = susp_df1.assign(**{
    'Credit/Income': susp_df1['AMT_CREDIT']/susp_df1['AMT_INCOME_TOTAL'],
    'Annuity/Income': susp_df1['AMT_ANNUITY']/susp_df1['AMT_INCOME_TOTAL'],
})

## show only clients with difficulties, smallest Credit/Income ratio first
susp_df1[susp_df1['TARGET']==1].sort_values(by='Credit/Income', ascending=True)

- The 1st record, with Credit/Income < 0.005 and an income of 117 million, is not logical. It must be wrong.
- The other records are still suspicious, but most likely the other features play a vital role in the prediction.

In [None]:
# 2- All days features are in negative values
## we will just take the absolute value for these features or keep it, it won't affect our models performance

In [None]:
# 3- The maximum age of a client is 69 years

## extract dataframe with DAYS_BIRTH and TARGET only
## (.copy() so that adding a column below writes to an independent frame)
susp_df2 = train_df[['DAYS_BIRTH','TARGET']].copy()

## create a column representing the age in years
susp_df2['YEARS_BIRTH'] = np.abs(susp_df2['DAYS_BIRTH']) / 365.25

## show dataframe, oldest clients first
display(susp_df2.sort_values(by='YEARS_BIRTH', ascending=False))

## show the value counts of those who are aged > 65 with respect to target
display(susp_df2[(susp_df2['YEARS_BIRTH']>65)]['TARGET'].value_counts())

- There are many clients of this age and most of them can repay, so it's not wrong data.

In [None]:
# 4- The maximum value in days employed is positive not negative + it's about 1000 years

## bar chart of the value counts of the non-negative DAYS_EMPLOYED entries
nonneg_days_employed = train_df.loc[train_df['DAYS_EMPLOYED'] >= 0, 'DAYS_EMPLOYED']
nonneg_days_employed.value_counts().plot(kind='bar');
plt.title('Specific Days of Employment Value Counts')
plt.xlabel('Days')
plt.ylabel('Count')
plt.show()

In [None]:
# share of records whose DAYS_EMPLOYED equals 365243, as a percentage of all records
sentinel_count = len(train_df[train_df['DAYS_EMPLOYED']==365243])
value_perctage = sentinel_count/len(train_df) * 100
print('Records with this value represent {:.2f}% of all data.'.format(value_perctage))

- 18% of our dataset has this value; it may be an error, so it must be `replaced with the mean or median value`.

In [None]:
# 5- A client owns a car aged 91! It was manufactured in 1927!
car_age = train_df['OWN_CAR_AGE']

## TARGET value counts of clients whose car is older than 60
display(train_df.loc[car_age > 60, 'TARGET'].value_counts())

## clients whose car is older than 70, with their target class
display(train_df.loc[car_age > 70, ['OWN_CAR_AGE','TARGET']])

- Not a typo, and it doesn't affect our target column.

In [None]:
# 6- The minimum days before application did a client change phone is zero!

## extract dataframe with DAYS_LAST_PHONE_CHANGE = 0
## (.copy() because columns are added to this frame in the next cell)
susp_df3 = train_df[train_df['DAYS_LAST_PHONE_CHANGE']==0].copy()
print('There\'re {} records with 0 value in DAYS_LAST_PHONE_CHANGE column'.format(len(susp_df3)))
print('These records represent {:.2f}% of all data.'.format(len(susp_df3)/len(train_df) * 100))

- These records may refer to older clients who didn't have phones or work before, so we will check the distributions of these features.

In [None]:
# go deeper with the distributions

## convert birth and employment days into years
## (assign builds a new frame, so no column is written into a slice of train_df)
susp_df3 = susp_df3.assign(
    YEARS_BIRTH=susp_df3['DAYS_BIRTH']/-365.25,
    YEARS_EMPLOYED=susp_df3['DAYS_EMPLOYED']/-365.25,
)

plt.figure(figsize=(12,5))
## age histogram for clients with DAYS_LAST_PHONE_CHANGE = 0
plt.subplot(1, 2, 1)
susp_df3['YEARS_BIRTH'].hist(bins=25)
plt.title('Distribution of Clients\' Age')
plt.xlabel('Age in Years')
plt.ylabel('Frequency')

## employment histogram for clients with DAYS_LAST_PHONE_CHANGE = 0 without the wrong value '365243'
## (filter on the original integer sentinel rather than on an exact float comparison)
plt.subplot(1, 2, 2)
susp_df3.loc[susp_df3['DAYS_EMPLOYED'] != 365243, 'YEARS_EMPLOYED'].hist(bins=25)
plt.title('Distribution of Clients\' Employment Years')
plt.xlabel('Employment Years')
plt.ylabel('Frequency')
plt.show()

- As shown in the figures, 12% of our dataset has this value. It's an error, as most of these clients are between 27 and 65 years old and most of them already work and can earn enough money to have a phone, so it must be replaced with the mean or median value.

#### Continuous Columns Distribution

In [None]:
# create a distribution plot for each continuous feature, overlaying train and test
# each curve is labelled so the two datasets can be told apart; grid height follows len(cont_cols)
n_plot_cols = 5
n_plot_rows = -(-len(cont_cols) // n_plot_cols)   # ceiling division
plt.figure(figsize=(25, 50))
for i, col in enumerate(cont_cols):
    plt.subplot(n_plot_rows, n_plot_cols, i+1)
    sns.distplot(train_df[col], label='Train')
    sns.distplot(test_df[col], label='Test')
    plt.legend()

- Great! The train and test datasets have the same distributions, so the trained model will mostly generalize and predict well.

#### Insights
- Although the best stage to analyze the data and extract insights is after the cleaning stage,<br> in our case, with no dtype errors or errors that are hard to handle, we can do it now.
- Some points will need a clean dataframe without the wrong data, so the `proper_df` dataframes (named `proper_..._df` in the code) are helper dataframes for this purpose.

In [None]:
# Which gender applies more for loans?
# Is there a relation between the gender and the ability to repay?

# records with a known gender (the 'XNA' placeholder is excluded)
known_gender_df = train_df[train_df['CODE_GENDER']!='XNA']

plt.figure(figsize=(12,5))

# left panel: number of applications per gender
plt.subplot(1, 2, 1)
known_gender_df['CODE_GENDER'].value_counts().plot(kind='bar', title='Males VS Females Apply for Loans');
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(rotation=0)

# right panel: the same counts split by TARGET
plt.subplot(1, 2, 2)
sns.countplot(data=known_gender_df, x='CODE_GENDER', hue='TARGET');
plt.title('Males VS Females with Respect to Target')
plt.xlabel('Gender')
plt.show()

- Females apply for loans more than males.
- Gender does not affect our target.

In [None]:
# Which type of loan contract do clients apply for more?

plt.figure(figsize=(12,5))

# left panel: number of applications per contract type
plt.subplot(1, 2, 1)
contract_counts = train_df['NAME_CONTRACT_TYPE'].value_counts()
contract_counts.plot(kind='bar', title='Cash VS Revolving Loans');
plt.xlabel('Contract Type')
plt.ylabel('Count')
plt.xticks(rotation=0)

# right panel: the same counts split by TARGET
plt.subplot(1, 2, 2)
sns.countplot(x='NAME_CONTRACT_TYPE', hue='TARGET', data=train_df)
plt.title('Cash VS Revolving Loans with Respect to Target')
plt.xlabel('Contract Type')
plt.show()

- Most clients tend to take cash loans rather than revolving loans.
- This feature won't affect our target.

In [None]:
# Is there a relation between the age and the ability to repay?

plt.figure(figsize=(15,5))
plt.subplot(1, 2, 1)
(train_df['DAYS_BIRTH']/-365.25).plot(kind='hist', bins=50, title='Distribution of Clients\' Age');
plt.xlabel('Age')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
# assign returns a new frame, so the conversion never writes into a slice of train_df;
# note that DAYS_BIRTH holds the age in years in this helper frame
proper_day_birth_df = train_df[['DAYS_BIRTH', 'TARGET']].assign(
    DAYS_BIRTH=lambda frame: frame['DAYS_BIRTH']/-365.25
)
sns.distplot(proper_day_birth_df[proper_day_birth_df['TARGET']==0]['DAYS_BIRTH'], hist=False, label='Can Repay');
sns.distplot(proper_day_birth_df[proper_day_birth_df['TARGET']==1]['DAYS_BIRTH'], hist=False, label='Can\'t Repay');
plt.title('Distribution of Clients\' Age with Respect to Target')
plt.xlabel('Age')
plt.legend();
plt.show()

- Clients aged about 30 years are more likely to have difficulties with repayment, whereas those aged about 40 can repay well.
- This feature will be important for the model.

In [None]:
# Does the client's No. of children affect the ability to repay?

plt.figure(figsize=(15,5))

# left panel: histogram of the number of children
plt.subplot(1, 2, 1)
train_df['CNT_CHILDREN'].plot(kind='hist', bins=19, title='Distribution of Clients\' No. of Children');
plt.xlabel('No. of Children')
plt.ylabel('Frequency')

# right panel: one density curve per TARGET class
plt.subplot(1, 2, 2)
for target_value, curve_label in ((0, 'Can Repay'), (1, 'Can\'t Repay')):
    sns.distplot(train_df[train_df['TARGET']==target_value]['CNT_CHILDREN'], hist=False, label=curve_label);
plt.title('Clients\' No. of Children with Respect to Target')
plt.xlabel('No. of Children')
plt.legend()
plt.show()

- Clients without any children apply for loans more than others, and as the number of children increases, clients are less inclined to take loans.

In [None]:
# Is there a relation between client income and the amount of loan applied for?
# Do income and credit affect the ability to repay?

plt.figure(figsize=(15,5))
plt.subplot(1, 2, 1)
# drop the implausible 117M income record; .copy() so the rescaling below
# modifies an independent frame rather than a slice of train_df
proper_income_df = train_df[train_df['AMT_INCOME_TOTAL']!=117000000.0].copy()
# express income in units of 10,000 to keep the axis readable
proper_income_df['AMT_INCOME_TOTAL'] = proper_income_df['AMT_INCOME_TOTAL']/10000
proper_income_df['AMT_INCOME_TOTAL'].plot(kind='hist', bins=1000, title='Distribution of Clients\' Income');
plt.xlabel('Total Income')
plt.xlim([0,100])

plt.subplot(1, 2, 2)
sns.distplot(proper_income_df[proper_income_df['TARGET']==0]['AMT_INCOME_TOTAL'], hist=False, bins=1000, label='Can Repay');
sns.distplot(proper_income_df[proper_income_df['TARGET']==1]['AMT_INCOME_TOTAL'], hist=False, bins=1000, label='Can\'t Repay');
plt.title('Distribution of Clients\' Income with Respect to Target')
plt.xlabel('Total Income')
plt.xlim([0,100])
plt.legend();
plt.show()

In [None]:
# let's use log10 scales to see the plot properly
# (income is multiplied back by 10000 because it was rescaled in the previous cell)
proper_income_df = proper_income_df.assign(
    log_AMT_INCOME_TOTAL=np.log10(proper_income_df['AMT_INCOME_TOTAL']*10000),
    log_AMT_CREDIT=np.log10(proper_income_df['AMT_CREDIT']),
)

# lmplot is a figure-level function that creates its own figure, so the size is
# passed through `height` instead of a preceding plt.figure() (which only left an empty figure)
sns.lmplot(x='log_AMT_INCOME_TOTAL', y='log_AMT_CREDIT', data=proper_income_df, hue='TARGET', height=8);
plt.title('Relation Between Total Income & Loan Credit')
plt.xlabel('Total Income')
plt.ylabel('Loan Credit')
plt.show()

sns.distplot(proper_income_df[proper_income_df['TARGET']==0]['log_AMT_CREDIT'], hist=False, label='Can Repay');
sns.distplot(proper_income_df[proper_income_df['TARGET']==1]['log_AMT_CREDIT'], hist=False, label='Can\'t Repay');
plt.title('Distribution of Loan Credit with Respect to Target');
plt.xlabel('Loan Credit')
plt.legend();

- Clients with low income tend to apply for loans more than those with high income.
- The higher the client's income, the larger the loan amount applied for.
- Clients with an income of more than 3M tend to always repay, so this feature may help with our target.
- Clients with income between 10 and 18 (in units of 10,000, as plotted) are less likely to repay, and vice versa.


In [None]:
# what is the most common income type of clients?

# count of clients per income type, split by TARGET
ax = sns.countplot(x='NAME_INCOME_TYPE', hue='TARGET', data=train_df);
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90);
plt.title('Clients\' Income Type Value Counts with Respect to Target')
plt.xlabel('Income Type')
plt.show()

# the same counts as a table
print('\n'*2)
income_type_groups = train_df.groupby('NAME_INCOME_TYPE')
income_type_groups['TARGET'].value_counts()

- Working clients are more willing to apply for loans than others.
- Although only a few businessmen and students apply for loans, they always repay.

In [None]:
# what is the most common highest education level of clients?

# count of clients per education level, split by TARGET
ax = sns.countplot(x='NAME_EDUCATION_TYPE', hue='TARGET', data=train_df);
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90);
plt.title('Clients\' High Education Level Value Counts with Respect to Target')
plt.xlabel('High Education Level')
plt.show()

# the same counts as a table
print('\n'*2)
education_groups = train_df.groupby('NAME_EDUCATION_TYPE')
education_groups['TARGET'].value_counts()

- Clients with a secondary education level are more willing to apply for loans than others.
- Almost 98% of clients with an academic degree can repay their loans.


In [None]:
# Is there a relation between owning a car or realty and applying for loans or the ability to repay?

plt.figure(figsize=(13,5))
ownership_panels = (
    ('FLAG_OWN_CAR', 'No. of Clients Owning Cars with Resect to Target', 'Owns Car?'),
    ('FLAG_OWN_REALTY', 'No. of Clients Owning Realty with Resect to Target', 'Owns Realty?'),
)
# one count plot per ownership flag, split by TARGET
for panel_no, (flag_col, panel_title, panel_xlabel) in enumerate(ownership_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.countplot(data=train_df, x=flag_col, hue='TARGET')
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
plt.show()


# side-by-side bars: number of clients per (car, realty) combination for each TARGET class
plt.figure(figsize=(8,6))
ownership_keys = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
bar_specs = (
    (0, '#4984B8', 1, 'TARGET 0'),
    (1, '#9F1D35', 0, 'TARGET 1'),
)
for target_value, bar_color, bar_position, bar_label in bar_specs:
    class_counts = train_df[train_df['TARGET']==target_value].groupby(ownership_keys)['TARGET'].count()
    class_counts.plot(kind='bar', color=bar_color, width=0.3,  position=bar_position, label=bar_label)
plt.title('Count of Clients Owning Car and Realty with Respect to Target');
plt.ylabel('Count')
plt.xlabel('Owns Car? Owns Realty?')
plt.xticks(rotation=0)
plt.legend()

- Clients who don't own a car but own realty tend to apply for loans more than others, and vice versa.
- Most of both groups can repay well, but those who own both can repay more than those who don't.

In [None]:
# Is there a relation between employment years and the ability to repay?

# exclude the 365243 placeholder; .copy() so the new column is added to an
# independent frame rather than to a slice of train_df
proper_days_empolyed_df = train_df[train_df['DAYS_EMPLOYED']!=365243].copy()
proper_days_empolyed_df['YEARS_EMPLOYED'] = proper_days_empolyed_df['DAYS_EMPLOYED']/-365.25

plt.figure(figsize=(15,5))
plt.subplot(1, 2, 1)
sns.distplot(proper_days_empolyed_df['YEARS_EMPLOYED'])
plt.title('Distribution of Clients\' Employment Years')
plt.xlabel('Employment Years')

plt.subplot(1, 2, 2)
sns.distplot(proper_days_empolyed_df[proper_days_empolyed_df['TARGET']==0]['YEARS_EMPLOYED'], hist=False, label='Can Repay');
sns.distplot(proper_days_empolyed_df[proper_days_empolyed_df['TARGET']==1]['YEARS_EMPLOYED'], hist=False, label='Can\'t Repay');
plt.title('Distribution of Clients\' Employment Years with Respect to Target');
plt.xlim([-5,20])
plt.xlabel('Employment Years')
plt.legend();

- Clients with less than 5 years of employment tend to apply for loans more than others, and they are less likely to repay, especially those with less than 2 years, and vice versa.

### Data Cleaning & Preprocessing

In [None]:
# snapshot both datasets before cleaning so the originals can be restored later
train_copy, test_copy = train_df.copy(), test_df.copy()

#### Drop Columns with >40% NaNs

In [None]:
# columns whose NaN percentage exceeds 40, taken from nans_df
high_nan_mask = nans_df['% of NaNs in Column'] > 40
drop_cols = nans_df.loc[high_nan_mask, 'Column'].tolist()
keep_cols = [col for col in train_df.columns if col not in drop_cols]

# training frame restricted to the kept columns
train_df = train_df[keep_cols]

# the test frame gets the same columns minus TARGET
keep_cols.remove('TARGET')
test_df = test_df[keep_cols]

In [None]:
# shapes after dropping the high-NaN columns
for frame in (train_df, test_df):
    print(frame.shape)

#### Drop XNA records from CODE_GENDER column

In [None]:
# keep only the records whose CODE_GENDER is not 'XNA'
known_gender_mask = train_df['CODE_GENDER'] != 'XNA'
train_df = train_df[known_gender_mask]

# check
train_df['CODE_GENDER'].value_counts()

#### Drop the wrong value in AMT_INCOME_TOTAL column

In [None]:
# remove the record(s) whose income equals the implausible 117,000,000 value
plausible_income_mask = train_df['AMT_INCOME_TOTAL'] != 117000000.0
train_df = train_df[plausible_income_mask]

#### Change the wrong value in DAYS_EMPLOYED and DAYS_LAST_PHONE_CHANGE columns

In [None]:
# DAYS_EMPLOYED column: turn the 365243 placeholder into NaN so it is imputed later.
# Vectorized replace instead of a per-row lambda; assign returns a new frame, so
# nothing is written into a slice of the original data.
train_df = train_df.assign(DAYS_EMPLOYED=train_df['DAYS_EMPLOYED'].replace(365243, np.nan))
test_df = test_df.assign(DAYS_EMPLOYED=test_df['DAYS_EMPLOYED'].replace(365243, np.nan))

In [None]:
# check: the placeholder should no longer be the maximum
for frame in (train_df, test_df):
    print(frame['DAYS_EMPLOYED'].max())

In [None]:
# DAYS_LAST_PHONE_CHANGE column: treat 0 as missing so it is imputed later.
# Vectorized replace instead of a per-row lambda; assign returns a new frame, so
# nothing is written into a slice of the original data.
train_df = train_df.assign(DAYS_LAST_PHONE_CHANGE=train_df['DAYS_LAST_PHONE_CHANGE'].replace(0.0, np.nan))
test_df = test_df.assign(DAYS_LAST_PHONE_CHANGE=test_df['DAYS_LAST_PHONE_CHANGE'].replace(0.0, np.nan))

In [None]:
# check: maximum value after the replacement
for frame in (train_df, test_df):
    print(frame['DAYS_LAST_PHONE_CHANGE'].max())

### NaNs Imputation 
### Categorical Features Encoding
- Instead of doing both label encoding for features with 2 unique categories and one-hot encoding for the rest, we can do one-hot encoding for all features and drop the first outcome column, as:
    - it will do it for us in one step
    - it decreases the number of features, which limits the growth in dimensions and helps prevent overfitting

### MinMax Scaling
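- rescale the numerical features to a common [0, 1] range, to reduce the effect of the very different scales and extreme values (note that min-max scaling rescales values; it does not remove outliers)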

#### Create one pipeline that imputes NaNs with respect to the column dtype and then applies One-Hot Encoding and Normalization
#### We will use this pipeline in the `modelling stage`

In [None]:
# pipeline for the numerical features
## 1- impute NaNs with the median, as most of the features contain outliers
## 2- rescale with Min-Max scaling (this rescales the values; it does not remove outliers)
numeric_steps = [
    ("num_imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# pipeline for the categorical features
## 1- impute NaNs with the most frequent class (the mode)
## 2- one-hot encode, dropping the first category of each feature
categorical_steps = [
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore', drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# column transformer: non-object columns go through the numeric pipeline,
# object columns through the categorical one
numeric_selector = make_column_selector(dtype_exclude="object")
categorical_selector = make_column_selector(dtype_include="object")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_selector),
        ("cat", categorical_transformer, categorical_selector),
    ]
)

#### Data Splitting


In [None]:
# split the frame into the target vector and the feature matrix ("predictors")
y = train_df['TARGET']
X = train_df.drop(columns='TARGET')

In [None]:
# stratified 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

### Modelling

In [None]:
# create a function for trained models evaluation
def evaluate_model(model_pipeline, X_tr=None, y_tr=None, X_va=None, y_va=None):
    """Print ROC AUC scores and confusion matrices for a fitted model.

    Parameters
    ----------
    model_pipeline : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_tr, y_tr : training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used (looked up at call time).
    X_va, y_va : validation features and labels. When omitted, the notebook-level
        ``X_val`` / ``y_val`` are used (looked up at call time).

    Returns
    -------
    None. All results are printed.
    """
    # fall back to the notebook globals so existing one-argument calls keep working
    if X_tr is None:
        X_tr = X_train
    if y_tr is None:
        y_tr = y_train
    if X_va is None:
        X_va = X_val
    if y_va is None:
        y_va = y_val

    # hard class predictions (for the confusion matrices)
    train_pred = model_pipeline.predict(X_tr)
    val_pred = model_pipeline.predict(X_va)

    # class probabilities; column 1 is used as the score for ROC AUC
    train_pred_proba = model_pipeline.predict_proba(X_tr)
    val_pred_proba = model_pipeline.predict_proba(X_va)

    # evaluations
    print('Training & Validation ROC AUC Scores:\n', '-'*40)
    print('Training   roc auc score= {:.4f}'.format(roc_auc_score(y_tr, train_pred_proba[:, 1])))
    print('Validation roc auc score= {:.4f}'.format(roc_auc_score(y_va, val_pred_proba[:, 1])))
    print('')
    print('Training & Validation Confusion Metrices:')
    print('Training   confusion matrix:\n', confusion_matrix(y_tr, train_pred))
    print('Validation confusion matrix:\n', confusion_matrix(y_va, val_pred))

#### Without Target Classes Balancing 

- Random Forest

In [None]:
# random forest wrapped with the shared preprocessor (no class balancing)
rf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42)
rf_pipe1 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", rf),
    ]
)

# fit on the training split
rf_pipe1.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(rf_pipe1)

- Ada Boosting

In [None]:
# AdaBoost wrapped with the shared preprocessor (no class balancing)
adaboost = AdaBoostClassifier(n_estimators=200, random_state=42)
ada_pipe1 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", adaboost),
    ]
)

# fit on the training split
ada_pipe1.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(ada_pipe1)

- Light Gradient Boosting

In [None]:
# LightGBM wrapped with the shared preprocessor (no class balancing)
lgbm = LGBMClassifier(n_estimators=1000, num_leaves=36, random_state=42)
lgbm_pipe1 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", lgbm),
    ]
)

# fit on the training split
lgbm_pipe1.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(lgbm_pipe1)

- The models' performance is very bad; they can barely predict the minority target class, in either training or testing.

#### Target Classes Balancing
- As the majority class of our target is 91% and the minority class is 9%, we `can't use oversampling only`, as the minority class is very small, or `downsampling only`, as we would lose a lot of our data, so we will `first oversample the minority class, then downsample the majority one`.

In [None]:
# create oversampler and downsampler instances
# random_state is fixed (like every model in this notebook) so that resampling,
# and therefore the reported scores, are reproducible across runs
oversampler = SMOTE(sampling_strategy=0.25, random_state=42)                     # minor/major = 1/4
undersampler = RandomUnderSampler(sampling_strategy=0.75, random_state=42)       # minor/major = 3/4

- Random Forest

In [None]:
# random forest with preprocessing, then oversampling and undersampling
rf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', rf),
]
rf_pipeline2 = Pipeline(steps=steps)

# fit on the training split
rf_pipeline2.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(rf_pipeline2)

- Ada Boosting

In [None]:
# AdaBoost with preprocessing, then oversampling and undersampling
adaboost = AdaBoostClassifier(n_estimators=200, random_state=42)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', adaboost),
]
ada_pipeline2 = Pipeline(steps=steps)

# fit on the training split
ada_pipeline2.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(ada_pipeline2)

- Light Gradient Boosting

In [None]:
# LightGBM with preprocessing, then oversampling and undersampling
lgbm = LGBMClassifier(n_estimators=500,num_leaves=36, random_state=42)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', lgbm),
]
lgbm_pipeline2 = Pipeline(steps=steps)

# fit on the training split
lgbm_pipeline2.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(lgbm_pipeline2)

- With target balancing, the models' `predictions become better`, especially AdaBoost and LightGBM.
- The models are underfitting, so we will try to enhance our dataset by `doing feature engineering` to create new features that the models can rely on.

### Feature Engineering

In [None]:
# credit-to-income ratio, added to each of the three feature frames
for frame in (X_train, X_val, test_df):
    frame['Credit/Income'] = frame['AMT_CREDIT']/frame['AMT_INCOME_TOTAL']

In [None]:
# annuity-to-income ratio, added to each of the three feature frames
for frame in (X_train, X_val, test_df):
    frame['Annuity/Income'] = frame['AMT_ANNUITY']/frame['AMT_INCOME_TOTAL']

In [None]:
# ratio of days employed to days since birth, added to each of the three feature frames
for frame in (X_train, X_val, test_df):
    frame['Employed/Birth'] = frame['DAYS_EMPLOYED']/frame['DAYS_BIRTH']

In [None]:
# flag (1/0): is the client older than 32 years?
# vectorized comparison instead of a per-row lambda; the result is the same 0/1 integers
for frame in (X_train, X_val, test_df):
    frame['Flag_Greater_32'] = (frame['DAYS_BIRTH']/-365.25).gt(32).astype(int)

In [None]:
# flag (1/0): has the client been employed for more than 5 years?
# vectorized comparison instead of a per-row lambda; a NaN DAYS_EMPLOYED compares
# as False and so yields 0, exactly as the lambda did
for frame in (X_train, X_val, test_df):
    frame['Flag_Employment_Greater_5'] = (frame['DAYS_EMPLOYED']/-365.25).gt(5).astype(int)

In [None]:
# flag (1/0): is the client's income greater than the loan credit?
# cast to int so this flag has the same 0/1 integer form as the other engineered flags
for frame in (X_train, X_val, test_df):
    frame['Flag_Income_Greater_Credit'] = (frame['AMT_INCOME_TOTAL'] > frame['AMT_CREDIT']).astype(int)

In [None]:
# squared and cubed versions of the top 3 positively and top 3 negatively correlated features
cols = ['DAYS_BIRTH', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT',
       'EXT_SOURCE_3', 'EXT_SOURCE_2', 'DAYS_EMPLOYED']

# within each frame the columns are still appended feature by feature, power 2 before power 3
for frame in (X_train, X_val, test_df):
    for col in cols:
        for i in (2, 3):
            frame[f'{col}_power_{i}'] = frame[col] ** i

### Modelling 2

- Random Forest

In [None]:
# random forest on the engineered features, with over/under-sampling
rf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', rf),
]
rf_pipeline3 = Pipeline(steps=steps)

# fit on the training split
rf_pipeline3.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(rf_pipeline3)

- Ada Boosting

In [None]:
# AdaBoost on the engineered features, with over/under-sampling
adaboost = AdaBoostClassifier(n_estimators=200, random_state=42)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', adaboost),
]
ada_pipeline3 = Pipeline(steps=steps)

# fit on the training split
ada_pipeline3.fit(X_train, y_train)

# report ROC AUC and confusion matrices
evaluate_model(ada_pipeline3)

- Light GBM

In [None]:
# LightGBM on the engineered features: preprocessing followed directly by
# the classifier (this pipeline has no resampling steps).
lgbm = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=36)
lgbm_pipe3 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", lgbm),
    ]
)

# fit on the training split
lgbm_pipe3.fit(X_train, y_train)

# report performance with the notebook's evaluation helper
evaluate_model(lgbm_pipe3)

- The models' predictions become slightly better, but LightGBM becomes worse
- let's try to `keep all the original features` and train the models again

### Keep All + Feature Engineering

In [None]:
# Start over from the saved copies so train_df / test_df are back to the
# state they had when train_copy / test_copy were taken.
train_df, test_df = train_copy.copy(), test_copy.copy()

- Same cleaning as before

In [None]:
# doing the same cleaning as before

# Drop rows with CODE_GENDER == 'XNA' and the single extreme income value.
# .copy() makes train_df an independent frame, so the column assignments
# below do not write into a slice of the previous frame
# (chained-assignment / SettingWithCopyWarning hazard).
keep_rows = (train_df['CODE_GENDER'] != 'XNA') & (train_df['AMT_INCOME_TOTAL'] != 117000000.0)
train_df = train_df[keep_rows].copy()

# DAYS_EMPLOYED == 365243 is treated as missing in both train and test
train_df['DAYS_EMPLOYED'] = train_df['DAYS_EMPLOYED'].replace(365243, np.nan)
test_df['DAYS_EMPLOYED'] = test_df['DAYS_EMPLOYED'].replace(365243, np.nan)

# DAYS_LAST_PHONE_CHANGE == 0.0 is treated as missing in both train and test
train_df['DAYS_LAST_PHONE_CHANGE'] = train_df['DAYS_LAST_PHONE_CHANGE'].replace(0.0, np.nan)
test_df['DAYS_LAST_PHONE_CHANGE'] = test_df['DAYS_LAST_PHONE_CHANGE'].replace(0.0, np.nan)

In [None]:
# Separate the features from the TARGET label, then hold out 20% of the rows
# for validation. The split is stratified on the label and seeded.
y = train_df['TARGET']
X = train_df.drop(columns='TARGET')

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

- Same feature engineering as before

In [None]:
# Ratio of the credit amount to the applicant's total income,
# added to the train, validation and test frames alike.
for frame in (X_train, X_val, test_df):
    frame['Credit/Income'] = frame['AMT_CREDIT'] / frame['AMT_INCOME_TOTAL']

In [None]:
# Ratio of the loan annuity to the applicant's total income
X_train['Annuity/Income'] = X_train['AMT_ANNUITY'].div(X_train['AMT_INCOME_TOTAL'])
X_val['Annuity/Income'] = X_val['AMT_ANNUITY'].div(X_val['AMT_INCOME_TOTAL'])
test_df['Annuity/Income'] = test_df['AMT_ANNUITY'].div(test_df['AMT_INCOME_TOTAL'])

In [None]:
# Days employed expressed as a fraction of days since birth,
# added to the train, validation and test frames alike.
for frame in (X_train, X_val, test_df):
    frame['Employed/Birth'] = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH']

In [None]:
# 1/0 flag: is the applicant older than 32 years?
# DAYS_BIRTH is divided by -365.25 to obtain the age in years.
X_train['Flag_Greater_32'] = (X_train['DAYS_BIRTH'] / -365.25).gt(32).astype('int64')
X_val['Flag_Greater_32'] = (X_val['DAYS_BIRTH'] / -365.25).gt(32).astype('int64')
test_df['Flag_Greater_32'] = (test_df['DAYS_BIRTH'] / -365.25).gt(32).astype('int64')

In [None]:
# 1/0 flag: has the applicant been employed for more than 5 years?
# A missing DAYS_EMPLOYED compares as False and therefore yields 0.
for frame in (X_train, X_val, test_df):
    years_employed = frame['DAYS_EMPLOYED'] / -365.25
    frame['Flag_Employment_Greater_5'] = years_employed.gt(5).astype('int64')

In [None]:
# 1/0 flag: is the applicant's total income greater than the loan amount?
# Cast to int so the column has the same 0/1 integer encoding as the other
# Flag_* features. A raw boolean column is not a numeric dtype for dtype-based
# column selection, so it could be left out of the numeric branch.
# NOTE(review): confirm how `preprocessor` selects its columns.
X_train['Flag_Income_Greater_Credit'] = (X_train['AMT_INCOME_TOTAL'] > X_train['AMT_CREDIT']).astype(int)
X_val['Flag_Income_Greater_Credit'] = (X_val['AMT_INCOME_TOTAL'] > X_val['AMT_CREDIT']).astype(int)
test_df['Flag_Income_Greater_Credit'] = (test_df['AMT_INCOME_TOTAL'] > test_df['AMT_CREDIT']).astype(int)

In [None]:
# Add squared and cubed versions of the columns picked as the top 3
# positive & negative features with respect to the target.
cols = [
    'DAYS_BIRTH',
    'REGION_RATING_CLIENT_W_CITY',
    'REGION_RATING_CLIENT',
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'EXT_SOURCE_1',
]

for col in cols:
    for i in (2, 3):
        power_col = f'{col}_power_{i}'
        # the same transformation is applied to train, validation and test
        for frame in (X_train, X_val, test_df):
            frame[power_col] = frame[col].pow(i)

### Modelling 3

- Random Forest

In [None]:
# Random forest on all original features plus the engineered ones.
# `Pipeline` is the imblearn one (imported last), which accepts sampler steps.
rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=25)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', rf),
]
rf_pipeline4 = Pipeline(steps)

# fit on the training split
rf_pipeline4.fit(X_train, y_train)

# report performance with the notebook's evaluation helper
evaluate_model(rf_pipeline4)

- Ada Boosting

In [None]:
# AdaBoost on all original features plus the engineered ones, with the same
# preprocessing and resampling steps as the random forest pipeline.
adaboost = AdaBoostClassifier(random_state=42, n_estimators=200)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', adaboost),
]
ada_pipeline4 = Pipeline(steps)

# fit on the training split
ada_pipeline4.fit(X_train, y_train)

# report performance with the notebook's evaluation helper
evaluate_model(ada_pipeline4)

- Light GBM

In [None]:
# LightGBM on all original features plus the engineered ones, this time with
# the oversampling and undersampling steps included in the pipeline.
lgbm = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=36)
steps = [
    ('preprocessor', preprocessor),
    ('oversampler', oversampler),
    ('undersampler', undersampler),
    ('model', lgbm),
]
lgbm_pipeline4 = Pipeline(steps)

# fit on the training split
lgbm_pipeline4.fit(X_train, y_train)

# report performance with the notebook's evaluation helper
evaluate_model(lgbm_pipeline4)

### Prediction

In [None]:
# Score the test set with the last LightGBM pipeline and write the submission.
# Column 1 of predict_proba is used as TARGET (presumably the positive class).
# NOTE(review): this takes SK_ID_CURR from test_df's index — confirm the
# frame was loaded with the ids as its index.
submission = pd.DataFrame(
    {
        'SK_ID_CURR': test_df.index,
        'TARGET': lgbm_pipeline4.predict_proba(test_df)[:, 1],
    }
)
submission.to_csv('submission.csv', index=False)

### Done!