In [None]:
# Basic libraries
import numpy as np # linear algebra
import pandas as pd # data processing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Split imbalanced dataset into train and test sets with stratification: #Only used in imbalanced classification probmlems
# This will make sure that in train and test there will be about 8% defaulters. # Remaining structure of original data.
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Show floats with three fixed decimals instead of scientific notation (e+8 for example).
pd.set_option('display.float_format', '{:.3f}'.format)

# Display options for convenient inspection of wide frames.
# 'display.max_columns' was previously set twice (None, then 500); only the last
# value ever took effect, so it is set once here with that effective value.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Loading data
# Current loan applications; this frame carries the TARGET column analysed below.
app_df = pd.read_csv("../input/loan-defaulter/application_data.csv")
app_df.head()

In [None]:
# Loading previous applications data
# Joined onto app_df later in the notebook through the SK_ID_CURR column.
previous = pd.read_csv("../input/loan-defaulter/previous_application.csv")
previous.head()

In [None]:
# Report the (rows, columns) shape of both frames on one line, followed by a blank line.
shape_report = "app_df shape: {} , previous_df shape: {}".format(app_df.shape, previous.shape)
print(shape_report)
print()

In [None]:
# Basic statistics about the TARGET column:
target_values = app_df['TARGET']
print(target_values.value_counts())

# Rows with TARGET == 1 as a share of all rows that have a non-null TARGET.
defaulter_count = target_values[target_values == 1].count()
labelled_count = target_values[target_values != 2].count()
print("Defaulters percantage: ", (defaulter_count / labelled_count)*100,"%")

# Notice we're working with imbalanced data as just 8.07% are defaulters

In [None]:
# Helper used throughout the notebook to relate one feature to the TARGET column.

def groupby_target(column, df=None):
    """Return the mean TARGET for every distinct value of ``column``.

    Rows with TARGET == 1 are treated as defaulters elsewhere in this notebook, so
    for a 0/1 TARGET the mean is the observed default rate of each group.

    Parameters
    ----------
    column : str
        Name of the feature to group by.
    df : pandas.DataFrame, optional
        Frame holding ``column`` and ``'TARGET'``. Defaults to the notebook-level
        ``app_df`` (looked up at call time), which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column`` and ``'TARGET'``, sorted by ascending TARGET mean.
    """
    frame = app_df if df is None else df
    columnXtarget = frame[[column, 'TARGET']].groupby(column, as_index=False).mean()
    return columnXtarget.sort_values('TARGET')

# Dealing with nulls:
1. Saving all columns with more than 25% nulls
2. Handling categorical columns with a high null % 
3. Examining the correlation between these columns and TARGET, and deleting the uncorrelated ones.

In [None]:
# Let's examine the features in the data (column dtypes and non-null counts)
## 65 floats, 41 ints, 16 objects
app_df.info()

In [None]:
# Percentage of missing values in each column, rounded to two decimals.
null_share = app_df.isnull().sum() / app_df.shape[0] * 100.00
null_share.round(2)

In [None]:
# Visualization of the null % per column in app_df:

# One row per app_df column: its name and the percentage of missing values.
null_app_df = (
    (app_df.isnull().sum() * 100 / app_df.shape[0])
    .rename_axis('Column Name')
    .reset_index(name='Null Values Percentage')
)

fig = plt.figure(figsize=(18,6))
ax = sns.pointplot(data=null_app_df, x="Column Name", y="Null Values Percentage", color='b', alpha=1)
plt.xticks(rotation =90,fontsize =7)
# Red dashed line marks the 25% threshold used to pick columns for deletion.
ax.axhline(25, ls='--',color='red')
ax.set_title("Percentage of Missing values in app data")
ax.set_ylabel("Null Values PERCENTAGE")
ax.set_xlabel("COLUMNS")
ax.set_facecolor("k")
fig.set_facecolor("lightgrey")
plt.show()

In [None]:
# Keep only the rows (i.e. app_df column names) whose null percentage exceeds 25.
over_threshold = null_app_df['Null Values Percentage'].gt(25)
high_null = null_app_df.loc[over_threshold]
high_null

In [None]:
# Row-wise sum over the block of apartment/building columns, which contain many nulls.
## The sum is used to decide whether the whole block of columns can be deleted.
building_features = app_df.loc[:, 'APARTMENTS_AVG':'NONLIVINGAREA_MEDI']
app_df['BUILDING_TOTAL_AVG'] = building_features.sum(axis=1)
app_df['BUILDING_TOTAL_AVG'].describe() # Statistics about the new column

In [None]:
# Number of rows whose building total is exactly 0.
no_building_info = app_df['BUILDING_TOTAL_AVG'].eq(0)
len(app_df.loc[no_building_info])

## Almost half of application dont have info about building. We'll delete these columns due to too many missing values.

## Examining the "OCCUPATION_TYPE" feature: the only categorical feature with a high null %

In [None]:
# Frequency of each occupation type (value_counts leaves out missing values by default).
app_df['OCCUPATION_TYPE'].value_counts()

## We might want to save this feature since it has many values that can be predictive

## Let's compare the occupation types
First, let's check which occupation types have a higher probability of default

In [None]:
# Bar chart of the helper's output: the y-axis is the mean TARGET (default rate) per occupation type.
occupation_default_rate = groupby_target('OCCUPATION_TYPE')
occupation_default_rate.plot.bar(x='OCCUPATION_TYPE', color='g')

### Insights: Big difference between the highest and lowest probabilities
### Decision: Keep the occupation type column and fill its missing values with "Other"

In [None]:
# Fill the (roughly 30%) missing occupation types with the placeholder category "Other".
# Assigning the result back, instead of calling an inplace method on the column selection,
# guarantees app_df is actually updated (chained inplace calls are a no-op under pandas
# Copy-on-Write); fillna also states the intent directly, without a regex replace on NaN.
app_df['OCCUPATION_TYPE'] = app_df['OCCUPATION_TYPE'].fillna('Other')

# We're done examining the columns with a high null %. Let's check some statistics about the flag documents
### Sum all flag documents together into one column



In [None]:
# Row-wise total over the FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 column slice.
flag_document_block = app_df.loc[:, 'FLAG_DOCUMENT_2':'FLAG_DOCUMENT_21']
app_df['FLAG_DOC_TOTAL'] = flag_document_block.sum(axis=1)
app_df['FLAG_DOC_TOTAL'].describe()

### It looks like all the examples have only one variable marked "1"  
#### Flag document has binary value. Lets check the distribution inside these binary features

In [None]:
# Per-column mean over the FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 slice.
# Presumably these are 0/1 flags, in which case the mean is the share of rows with the flag set — TODO confirm.
flags_df = app_df.loc[:, 'FLAG_DOCUMENT_2': 'FLAG_DOCUMENT_21']
flags_df.mean() 

### All features besides flag document 3 can be deleted due to all values are 0.

In [None]:
# Collect every column whose name starts with 'FLAG_DOC' as a deletion candidate.
# The prefix also matches the derived FLAG_DOC_TOTAL column when it exists.
flag_doc_mask = app_df.columns.str.startswith('FLAG_DOC')
FlagDocument_list = app_df.columns[flag_doc_mask].tolist()

In [None]:
# Display the collected FLAG_DOC* column names.
FlagDocument_list

# Deleting features with high % nulls or unnecessary from app_df

In [None]:
# Deletion candidates: high-null columns plus the FLAG_DOC* columns, minus the two
# columns we decided to keep. Filtering against a set (instead of list.remove) does not
# raise when a kept name is absent and drops every occurrence, so the cell is safe to re-run.
columns_to_keep = {'FLAG_DOCUMENT_3', 'OCCUPATION_TYPE'}
deletion_candidates = high_null['Column Name'].tolist() + FlagDocument_list
delete_col_app = [name for name in deletion_candidates if name not in columns_to_keep]
len(delete_col_app)

#### 69 features to be deleted

In [None]:
# Drop the selected columns and report the frame shape before and after.
print("app_df shape before deleting:", app_df.shape)
app_df.drop(columns=delete_col_app, inplace=True)
print("app_df shape after deleting:", app_df.shape)

In [None]:
# EXT_SOURCE_2 and EXT_SOURCE_3 are removed because their meaning is unclear to us.
ext_source_columns = ['EXT_SOURCE_2', 'EXT_SOURCE_3']
app_df = app_df.drop(columns=ext_source_columns)

In [None]:
# Correlation of the social-circle counters with TARGET, shown as an annotated heatmap,
## to decide whether these unclear features can be deleted.
social_circle_columns = [
    '{}_{}_CNT_SOCIAL_CIRCLE'.format(kind, window)
    for window in (30, 60)
    for kind in ('OBS', 'DEF')
]
obs_before_app = social_circle_columns + ['TARGET']
doc_corr = app_df[obs_before_app].corr()
fig = plt.figure(figsize=(15,10))
heatmap_options = dict(
    xticklabels=doc_corr.columns,
    yticklabels=doc_corr.columns,
    annot=True,
    cmap="RdYlGn",
    linewidth=1,
)
ax = sns.heatmap(doc_corr, **heatmap_options)

## Insights: No correlation between features to TARGET
## Decision: Deleting these columns

In [None]:
# Remove the four social-circle counters and show the resulting shape.
social_circle_to_drop = [
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
]
app_df = app_df.drop(columns=social_circle_to_drop)
app_df.shape

In [None]:
# Mean TARGET per value of AMT_REQ_CREDIT_BUREAU_YEAR. These features had 13.5% missing
## values, are hard to complete, and might not be predictive, so we may want to delete them.
bureau_year_default_rate = groupby_target('AMT_REQ_CREDIT_BUREAU_YEAR')
plt.xticks(rotation=-45)
sns.barplot(data=bureau_year_default_rate, x='AMT_REQ_CREDIT_BUREAU_YEAR', y='TARGET')

### Insights: After examining AMT_REQ_CREDIT_BUREAU_YEAR feature (that has more unique values than the others), it seems like there is no much of a correlation between this feature to TARGET feature
### Decision: Due to graph result and the fact that these features has 13.5% missing values we'll delete these features

In [None]:
# Remove all six AMT_REQ_CREDIT_BUREAU_* columns and show the resulting shape.
bureau_periods = ('YEAR', 'QRT', 'MON', 'WEEK', 'DAY', 'HOUR')
bureau_columns = ['AMT_REQ_CREDIT_BUREAU_' + period for period in bureau_periods]
app_df = app_df.drop(columns=bureau_columns)
app_df.shape

# We've done dealing with missing values. Just 42 features left as we had to delete 3/4 of features due to high null %

# Statistics about loan amount for repayers and defaulters

In [None]:
# Split the applications by outcome: TARGET 0 -> repayers, TARGET 1 -> defaulters.
target_flag = app_df['TARGET']
repayers = app_df[target_flag.eq(0)]
defaulters = app_df[target_flag.eq(1)]

In [None]:
# Plotting the AMT_ANNUITY density for repayers and defaulters.
# The colour key must match the colours used below: repayers are drawn in blue and
# defaulters in red (the key used to state the opposite).
txt = ' Blue - Repayers\n Red - Defaulters'
fig = plt.figure(figsize=(12,7))
# kdeplot + rugplot draw the same density curve and rug marks as
# distplot(hist=False, rug=True) without relying on the deprecated distplot API.
for subset, colour in ((repayers, 'b'), (defaulters, 'r')):
    annuity = subset['AMT_ANNUITY'].dropna()
    yx = sns.kdeplot(annuity, color=colour)
    sns.rugplot(annuity, color=colour, ax=yx)
plt.title('Distribution Of Loan Amount', fontdict={'fontsize':26} )
plt.xlim(0,200000)
fig.text(.01,.01,txt)
yx.set_facecolor("k")
# plt.show() works with the inline backend; Figure.show() warns on non-GUI backends.
plt.show()

# Lets Examine basic statistics about numeric features in order to get more information about their distribution

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
app_df.describe()

# Observations:
1. The max children value is much higher than the 3rd quartile value. Maybe we should band the children feature 
2. Total income and annuity amount have huge maximum values. We should check whether they are outliers
3. The DAYS features (BIRTH, EMPLOYED, REGISTRATION, ID_PUBLISH, LAST_PHONE_CHANGE) have negative values. We'll convert them with the "abs" function
4. Days birth and days employed will be changed to Age and Years Employed
5. Days employed has a maximum value of 365243, which is about 1000 years. We should check what those outliers mean, as they might be unemployed people / pensioners
6. The max family members value is much higher than the 3rd quartile value. Consider banding this feature  

## First lets create repayers and defaulters df's for ongoing statistics

In [None]:
# Build the per-outcome frames used by the statistics below (TARGET 0 = repayers, 1 = defaulters).
repayers = app_df.loc[app_df['TARGET'] == 0]
defaulters = app_df.loc[app_df['TARGET'] == 1]

In [None]:
# Statistics about the children count: mean TARGET per count, then how often each count occurs.

children_default_rate = app_df.groupby('CNT_CHILDREN', as_index=False)['TARGET'].mean()
print(children_default_rate)
print()
print('# It seems like some children count are unique and thats why their target value mean is 0 or 1. Lets check it out! #')
print()
print("CHILDREN COUNT VALUES:")
print(app_df['CNT_CHILDREN'].value_counts())

### As we thought, just 555 examples have more than 3 children, and there are some rare values, like 11, that appear in just 1 example. We're going to have to band this feature appropriately.

### Let's try to rank it. We'll use the ranks 0, 1-2, 3-4, 5-6 and 7+. Hopefully this is a good ranking; we might change it later on 

In [None]:
# Band the children count into five ranks:
#   rank 1: 0 children, rank 2: 1-2, rank 3: 3-4, rank 4: 5-6, rank 5: 7 or more.
# pd.cut uses right-closed intervals, matching the original "> low and <= high" rules.
children_bin_edges = [-np.inf, 0, 2, 4, 6, np.inf]
children_rank = pd.cut(app_df['CNT_CHILDREN'], bins=children_bin_edges, labels=[1, 2, 3, 4, 5])
app_df['CNT_CHILDREN_RANK'] = children_rank.astype(float)

groupby_target('CNT_CHILDREN_RANK')

### it seems to be more reasonable now, as we might think that more children make it harder to be a repayer.

## Lets do the same with family members

In [None]:
# Statistics about FAM_MEMBERS: mean TARGET per family size, then how often each size occurs.
family_default_rate = app_df.groupby('CNT_FAM_MEMBERS', as_index=False)['TARGET'].mean()
print(family_default_rate)
print()

print("FAM_MEMBERS COUNT VALUES:")
print(app_df['CNT_FAM_MEMBERS'].value_counts())

### We're going to do the same with fam_members. rank it base on values

In [None]:
# Band the family size into five ranks:
#   rank 1: up to 1 member, rank 2: 2, rank 3: 3-4, rank 4: 5-6, rank 5: 7 or more.
# Rows with a missing family size keep a missing rank.
family_bin_edges = [-np.inf, 1, 2, 4, 6, np.inf]
family_rank = pd.cut(app_df['CNT_FAM_MEMBERS'], bins=family_bin_edges, labels=[1, 2, 3, 4, 5])
app_df['CNT_FAM_MEMBERS_RANK'] = family_rank.astype(float)

app_df.groupby('CNT_FAM_MEMBERS_RANK', as_index=False)['TARGET'].mean()

#### It does seems like more fam members = higher chance being a deaulter. But, its not as obvious as it was with "CNT_CHILDREN".

## Next, we're going to change all features that count by 'DAYS', which counts by negative values to positive values

In [None]:
# Names of all columns that start with 'DAYS'; these are the ones converted to positive values next.
minus_col = list(filter(lambda name: name.startswith('DAYS'), app_df.columns))
minus_col

In [None]:
# Replace every value in the DAYS* columns with its absolute value.
app_df[minus_col] = app_df[minus_col].abs()

In [None]:
# Express the two day-counted features we analyse next in years (365 days per year).
DAYS_IN_YEAR = 365
app_df['Age'] = app_df['DAYS_BIRTH'].div(DAYS_IN_YEAR)
app_df['Years Employed'] = app_df['DAYS_EMPLOYED'].div(DAYS_IN_YEAR)

In [None]:
# Checking for outliers in Years Employed, as we observed before:
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot are
# the documented replacements) — confirm the installed seaborn version before migrating.
plt.figure(figsize=(8,4))
ax = sns.distplot(app_df['Years Employed'], color="y")
plt.title('Distribution of Years Employed', fontdict={'fontsize':20} )
ax.set_facecolor("k")  # black plot background
plt.show()

## Insights: A lot of examples have a value of about 1000 years employed. We should treat these
## Let's check who those examples are

In [None]:
# Income types of the rows whose employment length exceeds 900 years.
implausible_tenure = app_df['Years Employed'] > 900
app_df.loc[implausible_tenure, 'NAME_INCOME_TYPE'].unique()

# Decision: We'll change the pensioners' years employed value to 25 and the unemployed's years employed value to 0

In [None]:
# Overwrite the implausible (> 900 years) employment lengths by income type:
## pensioners get 25 years, unemployed applicants get 0 years.
implausible_tenure = app_df['Years Employed'] > 900
is_pensioner = app_df['NAME_INCOME_TYPE'] == 'Pensioner'
is_unemployed = app_df['NAME_INCOME_TYPE'] == 'Unemployed'

app_df.loc[implausible_tenure & is_pensioner, 'Years Employed'] = 25
app_df.loc[implausible_tenure & is_unemployed, 'Years Employed'] = 0

len(app_df[app_df['Years Employed'] > 900]) # 0 when no outliers remain

# Lets deal with data about cell phone, work phone and so on

In [None]:
# Mean TARGET per value of each contact flag, with a dashed separator between the tables.
contact_flags = ['FLAG_EMP_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'FLAG_MOBIL']
for position, flag_name in enumerate(contact_flags):
    if position > 0:
        print("------------------------------")
    print(groupby_target(flag_name))

In [None]:
# How many rows carry each FLAG_MOBIL value.
print(app_df['FLAG_MOBIL'].value_counts())

### At this point we'll only drop the flag_mobil column since only 1 example did not provide his phone (We can observe distribution around target in the last section of above code). No reason to delete other columns since they has good spread and may contribute to the model

In [None]:
# FLAG_MOBIL is dropped because it is (almost) constant.
app_df = app_df.drop(columns='FLAG_MOBIL')

# Checking some statistics about clients address validity:

In [None]:
# Value counts of every address-mismatch flag, with a dashed separator between the tables.
address_flags = [
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
]
for position, flag_name in enumerate(address_flags):
    if position > 0:
        print('-'*50)
    print(app_df[flag_name].value_counts())

### It seems like we can work with this columns
##### We can also think about synthethic feature that sums all these together.

In [None]:
# Let's look at the percentage of missing values per column again (two decimals).
remaining_null_share = app_df.isnull().sum() / app_df.shape[0] * 100.00
remaining_null_share.round(2)

# Let's clean the last remaining nulls in:
- AMT_ANNUITY
- CNT_FAM_MEMBERS_RANK
- AMT_GOODS_PRICE
- DAYS_LAST_PHONE_CHANGE

In [None]:
# Basic fill of nulls by mean or median:
app_df['AMT_ANNUITY'] = app_df['AMT_ANNUITY'].fillna(app_df['AMT_ANNUITY'].mean())
app_df['CNT_FAM_MEMBERS_RANK'] = app_df['CNT_FAM_MEMBERS_RANK'].fillna(app_df['CNT_FAM_MEMBERS_RANK'].median())
app_df['DAYS_LAST_PHONE_CHANGE'] = app_df['DAYS_LAST_PHONE_CHANGE'].fillna(app_df['DAYS_LAST_PHONE_CHANGE'].mean())

# Sophisticated fill of nulls:
# For each example with a null AMT_GOODS_PRICE, use the mean goods price of the examples
## that share its Age value.
goods_price_by_age = app_df.groupby('Age')['AMT_GOODS_PRICE'].transform('mean')
# An Age group in which every price is missing (or a missing Age) yields NaN from the
# group mean, which would leave the null in place; fall back to the overall mean so that
# no null survives this cell.
overall_goods_price = app_df['AMT_GOODS_PRICE'].mean()
app_df['AMT_GOODS_PRICE'] = (
    app_df['AMT_GOODS_PRICE']
    .fillna(goods_price_by_age)
    .fillna(overall_goods_price)
)

# At this point we'll delete some more numeric features: 
## Days birth and employed changed to years. Family members count changed to ranks. 

In [None]:
# Drop the columns that were replaced by derived features, printing the shape before and after.
print(app_df.shape)
superseded_columns = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'BUILDING_TOTAL_AVG', 'CNT_FAM_MEMBERS']
app_df = app_df.drop(columns=superseded_columns)
print(app_df.shape)

# All other continuous values (and some others) will be normalized later # 

# --------------------------------------------------------------------------------------

# Finished Numeric features. Lets start dealing with some categorical features

In [None]:
# Basic statistics about the object-dtype (categorical) columns: count, unique, top, freq.
app_df.describe(include=['O']) 

In [None]:
# Map the few 'XNA' gender values to 'F', the more common gender in the data.
# The result is assigned back rather than replaced in place on the column selection,
# which is not guaranteed to reach app_df (it is a no-op under pandas Copy-on-Write).
app_df['CODE_GENDER'] = app_df['CODE_GENDER'].replace({'XNA': "F"})

# Pie plots for statistics about genders

In [None]:
# Three pies: gender split in all data, among repayers, and among defaulters.
# The per-outcome subsets are taken from the current app_df here. The notebook-level
# `repayers` / `defaulters` frames were created before CODE_GENDER was cleaned, so
# plotting them would still show the stale 'XNA' category.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 8))

gender_panels = [
    (ax1, app_df, 'Gender Distribution in data', 'bright'),
    (ax2, app_df[app_df['TARGET'] == 0], 'Distribution of repayers per gender', 'deep'),
    (ax3, app_df[app_df['TARGET'] == 1], 'Distribution of defaulters per gender', 'deep'),
]
for panel_ax, frame, title, palette in gender_panels:
    panel_ax.set_title(title, fontsize=15)
    frame["CODE_GENDER"].value_counts().plot.pie(
        ax=panel_ax, autopct="%1.0f%%", fontsize=15,
        colors=sns.color_palette(palette),
        wedgeprops={"linewidth":2,"edgecolor":"white"},shadow=False
    )

# Insights: The distribution within repayers preserves almost the same gender gap as the original data.
# The distribution of defaulters is different: the percentage gap between genders is smaller. The 1% difference from the original distribution makes a big change within defaulters

In [None]:
# Mean total income per TARGET value, one line per gender.
fig, ax = plt.subplots(figsize=(10,6))
sns.pointplot(data=app_df, x='TARGET', y='AMT_INCOME_TOTAL', hue='CODE_GENDER', ax=ax)
ax.set_facecolor("#f2f2f2")
plt.show()

## Insights:
1. Men: In the transition from repayers to defaulters, the average income decreases.
2. Women: In the transition from repayers to defaulters, the average income stays the same.

In [None]:
# Encode the two-valued categorical columns as 0/1.
# Each result is assigned back to app_df: calling replace(..., inplace=True) on a column
# selection is not guaranteed to modify the frame (it is a no-op under Copy-on-Write).
# infer_objects() turns the resulting object column of ints into a numeric dtype on
# pandas versions that no longer downcast silently inside replace().
# NOTE(review): FLAG_OWN_REALTY maps 'Y' -> 0 and 'N' -> 1, the opposite direction of
# FLAG_OWN_CAR — kept as in the original; confirm this is intended.
binary_encodings = {
    'CODE_GENDER': {'F': 0, 'M': 1},
    'NAME_CONTRACT_TYPE': {'Cash loans': 0, 'Revolving loans': 1},
    'FLAG_OWN_CAR': {'N': 0, 'Y': 1},
    'FLAG_OWN_REALTY': {'Y': 0, 'N': 1},
}
for column_name, encoding in binary_encodings.items():
    app_df[column_name] = app_df[column_name].replace(encoding).infer_objects()

In [None]:
app_df.describe(include=['O']) # Checking that the encoded columns are no longer reported as categorical (object dtype).

## Dealing with "NAME_TYPE_SUITE" feature

In [None]:
# Number of missing values in NAME_TYPE_SUITE
app_df['NAME_TYPE_SUITE'].isnull().sum()

In [None]:
# Frequency of each NAME_TYPE_SUITE value (missing values are not counted by default)
app_df['NAME_TYPE_SUITE'].value_counts()

In [None]:
# Fill missing NAME_TYPE_SUITE values with 'Unaccompanied' (the most common value).
# fillna + assignment replaces the chained inplace regex replace on NaN, which is not
# guaranteed to update app_df (it is a no-op under pandas Copy-on-Write).
app_df['NAME_TYPE_SUITE'] = app_df['NAME_TYPE_SUITE'].fillna('Unaccompanied')

# Dealing with "NAME_INCOME_TYPE" feature. There are 8 different types there. Lets check values count, and avg target per type.

In [None]:
# Frequency of each income type.
app_df['NAME_INCOME_TYPE'].value_counts()

### 4 Types have low values amount. We will change these to "WORKING" since its the most common one.

In [None]:
# Fold the four rare income types into 'Working', the most common type.
# Assigned back instead of replaced in place on the column selection, which is not
# guaranteed to update app_df (it is a no-op under pandas Copy-on-Write).
rare_income_types = ['Businessman', 'Student', 'Unemployed', 'Maternity leave']
app_df['NAME_INCOME_TYPE'] = app_df['NAME_INCOME_TYPE'].replace(rare_income_types, 'Working')

In [None]:
# Bar chart of the mean TARGET per income type, after merging the rare types.
income_type_default_rate = groupby_target('NAME_INCOME_TYPE')
income_type_default_rate.plot.bar(x='NAME_INCOME_TYPE', color='#ff5522')

## Insights: Difference between probabilities to default are small

## Decision: We will use one hot encoding method later on

# Dealing with "EDUCATION_TYPE feature

In [None]:
# Pie chart of how the applications are distributed over the education types.
fig, ax = plt.subplots(figsize=(12.5,8.5))
education_counts = app_df["NAME_EDUCATION_TYPE"].value_counts()
education_counts.plot.pie(
    ax=ax,
    autopct="%1.0f%%",
    fontsize=10,
    colors=sns.color_palette("cubehelix"),
    wedgeprops={"linewidth":2,"edgecolor":"white"},
    shadow=False,
)

ax.set_title("Distribution of education type",color="g", fontsize=9)

## Insights: Amount of examples that has academic degree and applies for loan in app_df is lower than 1%

In [None]:
# Bar chart of the mean TARGET per education type.
ax = sns.barplot(data=app_df, x='NAME_EDUCATION_TYPE', y='TARGET', color='y')
plt.xticks(rotation=-45)
ax.set_facecolor("k")
ax.set_title("Odds of being a defaulter per education type", fontsize=15)

## Insights: Examples with academic degree or higher educataion tend to default less

## Decision: We will use one hot encoding method later on

# Dealing with family status feature

In [None]:
# Frequency of each family status.
app_df['NAME_FAMILY_STATUS'].value_counts()

In [None]:
# Map the 'Unknown' family status to 'Married', the most common value.
# Exact-value replacement assigned back to the frame: the previous chained inplace call
# is not guaranteed to update app_df, and regex=True would also have rewritten any
# value that merely contains the text 'Unknown'.
app_df['NAME_FAMILY_STATUS'] = app_df['NAME_FAMILY_STATUS'].replace('Unknown', 'Married')
# Later on we'll one-hot encode this feature

# Dealing with "NAME_HOUSING_TYPE" value

In [None]:
# Frequency of each housing type.
app_df['NAME_HOUSING_TYPE'].value_counts()

### Spread in values is okay. We'll use one-hot encoding later on

# Dealing with "ORGANIZATION_TYPE" feature

In [None]:
# Bar chart of the mean TARGET (default rate) for each organization type.
organization_default_rate = groupby_target('ORGANIZATION_TYPE')
organization_default_rate.plot.bar(x='ORGANIZATION_TYPE', y='TARGET', figsize=(12.5,6), color='#000000', alpha=1)
plt.xticks(rotation=-90)

# Insights:
1. 12% difference between the highest and lowest organization types.
2. This should be a good feature for our model since it has 58 different values. 
# Decision:
1. We will use one-hot encoding later on.

# We're going to use features from the previous df in our model:
1. Creating small df's that contain 2 columns: "SK_ID_CURR", "COUNT"
##### Each one measures how many times the applicant of the current application requested a loan in the previous data. We will also divide it into 4 different types of counts based on the previous loan decision (approved/canceled...)
2. We'll use the pandas join function on the id. This will add new columns to app_df based on our counts.
##### Since some examples didn't apply for a loan in the past (i.e. don't have a row in the previous df), their value will be "nan" in the loan count columns
3. Replacing the nans in the new columns with 0, since nan is the default for examples who didn't request a loan in the previous data

In [None]:
def _count_per_client(applications, count_name):
    """Count the rows of ``applications`` per SK_ID_CURR.

    Returns a two-column DataFrame: 'SK_ID_CURR' and ``count_name`` (the number of rows
    in ``applications`` carrying that id), ready to be joined onto app_df by id.
    """
    id_counts = applications['SK_ID_CURR'].value_counts()
    return id_counts.rename_axis('SK_ID_CURR').reset_index(name=count_name)


# Total number of previous applications per client.
loan_counter = _count_per_client(previous, 'Total Loan Count')

# The same count, restricted to each outcome of the previous application.
contract_status = previous['NAME_CONTRACT_STATUS']

refused_df = previous[contract_status == 'Refused']
refused_counter = _count_per_client(refused_df, 'Refused Count')

approved_df = previous[contract_status == 'Approved']
approved_counter = _count_per_client(approved_df, 'Approved Count')

canceled_df = previous[contract_status == 'Canceled']
canceled_counter = _count_per_client(canceled_df, 'Canceled Count')

unused_df = previous[contract_status == 'Unused offer']
unused_counter = _count_per_client(unused_df, 'Unused Count')

In [None]:
# Left-join every per-client counter onto app_df by SK_ID_CURR.
# Clients without a matching row in a counter get NaN in that counter's column.
counters_to_join = (loan_counter, refused_counter, approved_counter, canceled_counter, unused_counter)
for counter in counters_to_join:
    app_df = app_df.join(counter.set_index('SK_ID_CURR'), on='SK_ID_CURR')

## Adding app_df a column that describes if client requested insurance in his last application.
### We'll take only the last application using min "Days_Decision" value 

In [None]:
# Insurance flag of each client's most recent previous application.
# The earlier `.groupby(...).min('DAYS_DECISION')` passed the column name as the
# `numeric_only` argument, so it returned the per-client minimum of every numeric
# column independently — i.e. the smallest insurance flag over all previous
# applications, not the flag of the latest one.
# The most recent application is the one whose DAYS_DECISION is closest to zero; using
# the absolute value makes this correct whether or not the sign was already flipped.
decision_age = previous['DAYS_DECISION'].abs().dropna()
latest_application_rows = decision_age.groupby(previous['SK_ID_CURR']).idxmin()
prev_insurance = previous.loc[latest_application_rows.values, ['SK_ID_CURR', 'NFLAG_INSURED_ON_APPROVAL']]
app_df = app_df.join(prev_insurance.set_index('SK_ID_CURR'), on='SK_ID_CURR')

## In previous_df, we have the "DAYS_DECISION" feature, which measures how many days have passed between the application from previous_df and the application request in app_df 
### Days are relative to the current application
#### Since some examples have more than one loan request in previous_df, we have to make sure we take only the days since the last application

In [None]:
# Make DAYS_DECISION positive, then keep the smallest value per client, i.e. the days
# since the most recent previous decision.
previous['DAYS_DECISION'] = previous['DAYS_DECISION'].abs()
# Aggregate only the column we need. `.min('DAYS_DECISION')` passed the column name as
# the `numeric_only` flag and took the minimum of every numeric column.
min_days_decision = previous.groupby('SK_ID_CURR', as_index=False)['DAYS_DECISION'].min()
app_df = app_df.join(min_days_decision.set_index('SK_ID_CURR'), on='SK_ID_CURR')

In [None]:
# Summary statistics again, now including the columns joined from the previous applications.
app_df.describe()

# Synthetic features:
## 3 synthetic features to be added to try to make the model more predictive

In [None]:
# Ratio features: total income and credit amount, each divided by the annuity amount.
annuity = app_df['AMT_ANNUITY']
app_df['INC_ANNUITY'] = app_df['AMT_INCOME_TOTAL'].div(annuity)
app_df['CRED_ANNUITY'] = app_df['AMT_CREDIT'].div(annuity)

# age / (age + children): one number combining the client's age and number of children.
age = app_df['Age']
app_df['ageXchildren'] = age.div(age.add(app_df['CNT_CHILDREN']))

In [None]:
# The raw children count is no longer needed: it has been banded into CNT_CHILDREN_RANK.
app_df = app_df.drop(columns='CNT_CHILDREN')

In [None]:
# Number of missing values per column.
app_df.isnull().sum()

# We have to deal with more null values added from previous_df
## We want to keep these new features since they might be predictive
### But we will fill the nulls only after standard scaling these features, so that the filled value of 0 corresponds to the column mean. 

In [None]:
# Current (rows, columns) of app_df.
app_df.shape

In [None]:
# Fold the listed organization types into a single 'Other' category.
# Assigned back instead of replaced in place on the column selection, which is not
# guaranteed to update app_df (it is a no-op under pandas Copy-on-Write).
organization_types_to_merge = [
    'Industry: type 8', 'Trade: type 5', 'Trade: type 4', 'Industry: type 13',
    'Religion', 'Industry: type 10', 'Industry: type 6',
]
app_df['ORGANIZATION_TYPE'] = app_df['ORGANIZATION_TYPE'].replace(organization_types_to_merge, 'Other')

In [None]:
# Summary statistics of the numeric columns before one-hot encoding.
app_df.describe()

# 51 Features for model to use, before one-hot encoding

In [None]:
# One-hot encode the categorical / ranked features; get_dummies replaces the listed
# columns with their indicator columns in the returned frame.
one_hot_columns = [
    'OCCUPATION_TYPE',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_HOUSING_TYPE',
    'ORGANIZATION_TYPE',
    'NAME_FAMILY_STATUS',
    'WEEKDAY_APPR_PROCESS_START',
    'CNT_FAM_MEMBERS_RANK',
    'CNT_CHILDREN_RANK',
]
app_df = pd.get_dummies(app_df, columns=one_hot_columns)

In [None]:
# (rows, columns) after one-hot encoding.
app_df.shape

In [None]:
# Standardise the continuous features with sklearn's StandardScaler.
# The column list is defined once so the selection and the assignment cannot drift apart.
scaled_columns = [
    'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
    'Age', 'Years Employed', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'Total Loan Count', 'Refused Count', 'Approved Count', 'Canceled Count', 'Unused Count',
    'DAYS_DECISION', 'INC_ANNUITY', 'CRED_ANNUITY', 'ageXchildren', 'NFLAG_INSURED_ON_APPROVAL',
]
app_df[scaled_columns] = StandardScaler().fit_transform(app_df[scaled_columns])

# Fill the nulls that came from the previous-application join with 0 after scaling, so
# they sit at the scaled column mean and do not shift the distribution.
# The former 'SQR DecisionEmployed' key was removed: no such column is ever created, so
# fillna silently ignored it.
# NOTE(review): for the count columns a missing value means the client has no matching
# previous application; filling with a count of 0 *before* scaling may be the better
# representation — confirm the intended semantics.
previous_application_columns = [
    'Total Loan Count', 'Refused Count', 'Approved Count', 'Unused Count',
    'Canceled Count', 'DAYS_DECISION', 'NFLAG_INSURED_ON_APPROVAL',
]
app_df = app_df.fillna({column_name: 0 for column_name in previous_application_columns})

# Making sure that there is no more missing values

In [None]:
# Number of missing values per column after the fills above.
app_df.isnull().sum()

## Deleting final outliers, and completing missing values for new added features

In [None]:
# Keep only the rows whose value is below 35 in every listed column. The columns are
# standardised at this point, so the cutoff is in standard deviations. Rows with a
# missing value in any listed column are dropped as well.
delete_outliers_columns = ['AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'Total Loan Count', 'Refused Count', 'Approved Count', 'Canceled Count', 'Unused Count']
within_cutoff = (app_df[delete_outliers_columns] < 35).all(axis=1)
app_df = app_df[within_cutoff]

In [None]:
# Final (rows, columns) after removing the outliers.
app_df.shape

# We're ready for predictions!
# That's it for now. Modeling and predictions will be written in a separate notebook