In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import gc
import warnings
import time
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Directory holding the competition files; defined once so both reads stay in sync
# and the notebook can be pointed at another location by editing a single line.
INPUT_DIR = '/kaggle/input/home-credit-default-risk'

# Load the application tables for the train and test splits.
app_train = pd.read_csv(f'{INPUT_DIR}/application_train.csv')
app_test = pd.read_csv(f'{INPUT_DIR}/application_test.csv')

In [None]:
# (rows, columns) of the training table.
app_train.shape

In [None]:
# (rows, columns) of the test table.
app_test.shape

In [None]:
# Total number of rows across both tables.
app_train.shape[0] + app_test.shape[0]

### Creating combined dataframe from train and test file
- The purpose of combining the train and test files is to apply every data modification to both at the same time
- Once data pre-processing is done we can easily split them again with the logic below
- If TARGET is NaN the row comes from the test file; otherwise it comes from the train file

In [None]:
# Stack train and test rows into one frame.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# ignore_index=True renumbers rows 0..n-1 without the extra 'index' column
# that a plain reset_index() would add.
df = pd.concat([app_train, app_test], ignore_index=True)

In [None]:
del app_train
del app_test
gc.collect()

In [None]:
df.shape

### Considering basic numeric features

In [None]:
# Basic numeric columns kept from the application table (order is significant:
# it determines the column order of the working frame).
app_num_basic_col = [
    # row identifier and label
    'SK_ID_CURR', 'TARGET',
    # counts and amounts
    'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'REGION_POPULATION_RELATIVE',
    # day offsets
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
    'CNT_FAM_MEMBERS',
    # region ratings
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    # region / city mismatch flags
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]

In [None]:
# Basic categorical columns kept from the application table.
app_cat_basic_col = [
    'NAME_CONTRACT_TYPE',
    # two-valued flags
    'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER',
    # multi-valued descriptors
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
]

In [None]:
# Number of basic numeric columns.
len(app_num_basic_col)

In [None]:
# Number of basic categorical columns.
len(app_cat_basic_col)

- Creating dataframe with required columns only

In [None]:
df = df[app_num_basic_col + app_cat_basic_col]

In [None]:
df.shape

## EDA

In [None]:
def find_missing(data):
    """Summarise the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one null, indexed by column name,
        with columns 'Total' (row count of ``data``), 'Missing Count' and
        'Percentage' (missing count as a percentage of 'Total'), sorted by
        'Missing Count' in descending order.
    """
    n_rows = data.shape[0]
    null_counts = data.isnull().sum().values

    summary = pd.DataFrame(
        data={
            'Total': n_rows,
            'Missing Count': null_counts,
            'Percentage': null_counts / n_rows * 100,
        },
        index=data.columns.values,
    )

    # Only columns that actually have gaps are reported.
    has_missing = summary['Missing Count'] > 0
    return summary[has_missing].sort_values('Missing Count', ascending=False)

In [None]:
# Missing-value summary restricted to the basic numeric columns.
find_missing(df[app_num_basic_col])

### Handling missing values

In [None]:
df['AMT_GOODS_PRICE']=df['AMT_GOODS_PRICE'].fillna(df['AMT_GOODS_PRICE'].median())
df['AMT_ANNUITY']=df['AMT_ANNUITY'].fillna(df['AMT_ANNUITY'].median())
df['CNT_FAM_MEMBERS']=df['CNT_FAM_MEMBERS'].fillna(df['CNT_FAM_MEMBERS'].median())

In [None]:
find_missing(df[app_num_basic_col])

In [None]:
find_missing(df[app_cat_basic_col])

In [None]:
# Heatmap of missing cells in the categorical columns (one row per application).
# `df` is the combined train+test frame, so the title no longer claims
# that only the training set is shown.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(df[app_cat_basic_col].isnull(), yticklabels=False, cbar=False, cmap='tab20c_r', ax=ax)
ax.set_title('Missing Data: Train + Test (categorical features)')
plt.show()

- NAME_TYPE_SUITE and OCCUPATION_TYPE have missing values
- OCCUPATION_TYPE has a lot of missing values, so this column is dropped for now
- For NAME_TYPE_SUITE, missing values are replaced with a dummy category NTS_XNA for now

In [None]:
app_cat_basic_col.remove('OCCUPATION_TYPE')

In [None]:
df.drop('OCCUPATION_TYPE',inplace=True, axis=1)

In [None]:
df.shape

In [None]:
df['NAME_TYPE_SUITE']=df['NAME_TYPE_SUITE'].fillna('NTS_XNA')

In [None]:
def plot_categorical_bar(data, column):
    """Show a plotly bar chart of the value counts of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column whose distinct values are counted.
    """
    counts = data[column].value_counts()
    bars = go.Bar(x=counts.index, y=counts.values)
    go.Figure(data=[bars]).show()

In [None]:
def plot_categorical_pie(data, column, title, hole=.3):
    """Show a plotly pie/donut chart of the value counts of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column whose distinct values are counted.
    title : str
        Figure title.
    hole : float, default 0.3
        Passed to ``go.Pie`` as ``hole``.
    """
    counts = data[column].value_counts()
    wedges = go.Pie(labels=counts.index, values=counts.values, hole=hole)

    fig = go.Figure(data=[wedges])
    fig.update_layout(title_text=title)
    fig.show()

In [None]:
plot_categorical_pie(df, 'TARGET', 'Label Target ', .6)

In [None]:
plot_categorical_pie(df, 'NAME_INCOME_TYPE', 'House Type', .3)

In [None]:
plot_categorical_bar(df, 'NAME_HOUSING_TYPE')

- Draw distribution of numeric features

In [None]:
def plot_distributions(data, title, figsize, num_cols):
    """Draw one 25-bin histogram per column of ``data`` on a grid and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to plot; missing values are dropped per column before plotting.
    title : str
        Figure super-title. It is also used as the output file name
        (``'<title>.png'``), so it must be a valid file name.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    num_cols : int
        Number of subplot columns; the row count is derived from it.
    """
    column_names = list(data.columns)
    num_rows = int(np.ceil(len(column_names) / num_cols))
    fig = plt.figure(dpi=300, figsize=figsize)

    for position, feature in enumerate(column_names, start=1):
        # dropna() also works for integer/object columns, where np.isnan can fail.
        values = data[feature].dropna()
        ax = fig.add_subplot(num_rows, num_cols, position)
        ax.hist(values, bins=25)
        ax.set_title("'%s' Distribution" % (feature), fontsize=12)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Borrowers")

    fig.suptitle(title, fontsize=16, y=1.03)
    fig.tight_layout()

    # The super-title sits above the axes area (y > 1); bbox_inches='tight'
    # keeps it from being cropped out of the saved image.
    fig.savefig('{}.png'.format(title), bbox_inches='tight')

    # Save first, then display: plt.show() works with the inline backend,
    # whereas Figure.show() expects a GUI backend.
    plt.show()

In [None]:
# Every basic numeric column except the row id and the label, in the same order.
num_column = [name for name in app_num_basic_col if name not in ('SK_ID_CURR', 'TARGET')]
plot_distributions(
    df[num_column],
    title="Distributions of Main Data Table's Normalized Features",
    figsize=(14, 60),
    num_cols=3,
)

- Handling Outlier

In [None]:
sns.boxplot(data=df['DAYS_EMPLOYED'])

In [None]:
df['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
round(df[df['DAYS_EMPLOYED'] == 365243]['DAYS_EMPLOYED'].count() / len(df) * 100, 2)

In [None]:
round(df[df['DAYS_EMPLOYED'] != 365243]['DAYS_EMPLOYED'].count() / len(df) * 100 ,2)

- Found that DAYS_EMPLOYED has some anomalies
- Around 18% of all rows have the value '365243' in this field
- This value does not make sense for this data, so we need to handle it somehow
- So I am replacing this value with np.nan
- Creating a new column called DAYS_EMPLOYED_ANOM, an anomaly flag that is True or False depending on whether this field held the anomalous value

In [None]:
# Create an anomalous flag column (True where DAYS_EMPLOYED held 365243)
df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan.
# The result is assigned back: calling replace(..., inplace=True) on the
# selected column is chained mutation, which pandas warns about and which
# does not update `df` once Copy-on-Write is enabled.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

# Fill the gaps just created with the median of the remaining values.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].fillna(df['DAYS_EMPLOYED'].median())

# The plotted quantity is in years, so it is labelled as such.
# NOTE(review): the remaining values appear to be non-positive day offsets
# (other cells divide by -365); dividing by -365 yields positive years.
(df['DAYS_EMPLOYED'] / -365).plot.hist(title='Years Employed Histogram');
plt.xlabel('Years Employed');

After removing the anomalies, the histogram above shows that DAYS_EMPLOYED has a maximum of 49 years and a minimum of 0 years, as described below

- Now add the newly created column DAYS_EMPLOYED_ANOM to the basic features list

### creating combined basic features from numerical and categorical

In [None]:
basic_features = app_num_basic_col + app_cat_basic_col 

In [None]:
len(basic_features)

In [None]:
find_missing(df[basic_features])

In [None]:
len(basic_features)

In [None]:
basic_features.append('DAYS_EMPLOYED_ANOM')

In [None]:
len(basic_features)

In [None]:
df[df['DAYS_EMPLOYED'] / -365 > 8]['DAYS_EMPLOYED'].count()

In [None]:
(df['DAYS_BIRTH'] / -365).describe()

In [None]:
df[df['CODE_GENDER'] == 'XNA']

In [None]:
df = df[df['CODE_GENDER'] != 'XNA']

In [None]:
df.shape

### Label encoding for categorical features whose values are binary like Y/N, Yes/No, True/False, M/F etc.

In [None]:
df[['SK_ID_CURR','CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']].head(10)

In [None]:
# Categorical features with Binary encode (0 or 1; two categories)
for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']:
    df[bin_feature], uniques = pd.factorize(df[bin_feature])

In [None]:
df[['SK_ID_CURR','CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']].head(10)

Out of the basic categorical features above, we have already binary-encoded:
- FLAG_OWN_CAR
- FLAG_OWN_REALTY
- CODE_GENDER
- DAYS_EMPLOYED_ANOM

Now doing one hot encoding for remaining features
- NAME_CONTRACT_TYPE
- NAME_TYPE_SUITE
- NAME_INCOME_TYPE
- NAME_EDUCATION_TYPE
- NAME_FAMILY_STATUS
- NAME_HOUSING_TYPE
- ORGANIZATION_TYPE

In [None]:
# Multi-valued categorical columns that will be one-hot encoded.
one_hot_encode_col = [
    'NAME_CONTRACT_TYPE',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'ORGANIZATION_TYPE',
]

In [None]:
dummy_df = pd.get_dummies(df[one_hot_encode_col], dummy_na=False, drop_first=True)

In [None]:
len(dummy_df.columns)

In [None]:
df.shape

In [None]:
len(basic_features)

In [None]:
df.drop(one_hot_encode_col, axis=1,inplace=True)

In [None]:
for f in one_hot_encode_col:
    basic_features.remove(f)

In [None]:
len(basic_features)

In [None]:
df.shape

### creating final dataframe with required features

In [None]:
len(df[basic_features].columns)

In [None]:
len(dummy_df.columns)

In [None]:
df = pd.concat([df[basic_features], dummy_df], axis=1)

In [None]:
del dummy_df
gc.collect()

In [None]:
df.shape

## Creating baseline model using LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier; C is the regularization parameter passed to the model.
log_reg = LogisticRegression(C=1e-4)

In [None]:
df.loc[df.TARGET.isnull()].shape

In [None]:
df.loc[df.TARGET.notnull()].shape

In [None]:
X_train = df.loc[df.TARGET.notnull()].drop('TARGET',axis=1)

In [None]:
find_missing(X_train)

In [None]:
y_train =  df.loc[df.TARGET.notnull()]['TARGET']

In [None]:
y_train.shape

In [None]:
X_test = df.loc[df.TARGET.isnull()].drop('TARGET', axis=1)

In [None]:
X_test.shape

In [None]:
# Train on the training data
log_reg.fit(X_train, y_train)

In [None]:
# Make predictions
# Make sure to select the second column only
log_reg_pred = log_reg.predict_proba(X_test)[:,1]

In [None]:
len(log_reg_pred)

In [None]:
log_reg_pred

In [None]:
# Submission frame: one row per test application with its predicted probability.
# .copy() gives an independent frame, so adding TARGET does not assign into a
# slice of X_test (the chained-assignment pattern pandas warns about).
submit = X_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.to_csv('logistic_regression_baseline.csv', index = False)

### Random Forest

In [None]:
final_features_list = df.columns.tolist()

In [None]:
final_features_list.remove('TARGET')

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

In [None]:
# SK_ID_CURR is a row identifier, not a predictor, so it is excluded from the
# model inputs; X_train / X_test themselves are left untouched.
rf_X_train = X_train.drop(columns='SK_ID_CURR')
random_forest.fit(rf_X_train, y_train)

# Pair each importance with the name of the column it was computed for. The
# names come from the matrix that was actually fitted, so the two sequences
# always have the same length and order.
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': rf_X_train.columns.tolist(), 'importance': feature_importance_values})

# Probability of class 1 for every test row (variable name kept as-is because
# the submission cell reads it).
radon_forest_pred = random_forest.predict_proba(X_test.drop(columns='SK_ID_CURR'))[:, 1]

In [None]:
def plot_feature_importances(df, top_n=15):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'feature' (name) and 'importance' (score).
    top_n : int, default 15
        How many of the highest-importance features to show. If ``df`` has
        fewer rows, all of them are shown.
    """
    top = df.sort_values('importance', ascending=False).head(top_n)

    # Descending bar positions put the most important feature at the top.
    positions = list(range(len(top) - 1, -1, -1))

    plt.figure(figsize=(10, 6))
    ax = plt.subplot()

    ax.barh(positions, top['importance'], align='center', edgecolor='k')

    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.show()

In [None]:
plot_feature_importances(feature_importances)

In [None]:
submit1 = X_test[['SK_ID_CURR']]
submit1['TARGET'] = radon_forest_pred

submit1.to_csv('random_forest_baseline.csv', index = False)