In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import gc
import warnings
import time
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Directory holding the Home Credit Default Risk competition files.
# Kept in one place so the notebook can be pointed at another location with a single edit.
DATA_DIR = '/kaggle/input/home-credit-default-risk'

# Application tables: train and test are combined into one frame further down.
app_train = pd.read_csv(DATA_DIR + '/application_train.csv')
app_test = pd.read_csv(DATA_DIR + '/application_test.csv')

# Credit-bureau tables (loaded here; not referenced by the cells that follow in this section).
bureau = pd.read_csv(DATA_DIR + '/bureau.csv')
bureau_bal = pd.read_csv(DATA_DIR + '/bureau_balance.csv')

In [None]:
app_train.shape

In [None]:
app_test.shape

In [None]:
app_train.shape[0] + app_test.shape[0]

### Creating combined dataframe from train and test file
- The purpose of combining the train and test files is to apply the same data modifications to both at once
- Once pre-processing is done, we can easily split them again with the logic below
- If TARGET is NaN the row comes from the test file; otherwise it comes from the train file

In [None]:
# Stack the train and test rows into one frame so preprocessing is applied to both.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True yields a fresh 0..n-1 index without leaving the old index behind
# as an extra 'index' column (which reset_index() without drop=True would do).
df = pd.concat([app_train, app_test], ignore_index=True)

In [None]:
del app_train
del app_test
gc.collect()

In [None]:
df.shape

### Considering basic numeric features

In [None]:
# Numeric application columns kept for the baseline model.
# The list also carries SK_ID_CURR and TARGET, so both travel with the feature
# columns through every later selection that is built from this list.
app_num_basic_col = [
'SK_ID_CURR',
'TARGET',
'CNT_CHILDREN',
'AMT_INCOME_TOTAL',
'AMT_CREDIT',
'AMT_ANNUITY',
'AMT_GOODS_PRICE',
'REGION_POPULATION_RELATIVE',
'DAYS_BIRTH',
'DAYS_EMPLOYED',
'DAYS_REGISTRATION',
'DAYS_ID_PUBLISH',
'CNT_FAM_MEMBERS',
'EXT_SOURCE_1',
'EXT_SOURCE_2',
'EXT_SOURCE_3',]

In [None]:
# Categorical application columns kept for the baseline model.
# OCCUPATION_TYPE is removed from this list later in the notebook, once its
# missing values have been inspected.
app_cat_basic_col = ['NAME_CONTRACT_TYPE',
'FLAG_OWN_CAR',
'FLAG_OWN_REALTY',
'CODE_GENDER',
'NAME_TYPE_SUITE',
'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS',
'NAME_HOUSING_TYPE',
'OCCUPATION_TYPE',
'ORGANIZATION_TYPE']

In [None]:
len(app_num_basic_col)

In [None]:
len(app_cat_basic_col)

- Creating dataframe with required columns only

In [None]:
# Keep only the selected numeric and categorical columns.
# .copy() makes the result an independent frame, so the column assignments made by
# later cells (fillna, anomaly flag, encoding) do not write into a slice of the
# wider frame and trigger SettingWithCopyWarning.
df = df[app_num_basic_col + app_cat_basic_col].copy()

In [None]:
df.shape

## EDA And Pre-Processing 

In [None]:
def find_missing(data):
    """Summarise the columns of ``data`` that contain at least one null value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` that has missing values, indexed by column
        name, with the columns ``Total`` (row count of ``data``), ``Missing Count``
        and ``Percentage`` (missing count as a percentage of the row count).
        Rows are ordered by ``Missing Count``, largest first.
    """
    row_total = data.shape[0]
    null_counts = data.isnull().sum().values
    summary = pd.DataFrame(
        data={
            'Total': row_total,
            'Missing Count': null_counts,
            'Percentage': null_counts / row_total * 100,
        },
        index=data.columns.values,
    )
    # Columns without any nulls are not reported.
    has_missing = summary['Missing Count'] > 0
    return summary[has_missing].sort_values('Missing Count', ascending=False)

In [None]:
find_missing(df[app_num_basic_col])

### Handling missing values

In [None]:
df[app_num_basic_col].describe().transpose()

In [None]:
df[app_cat_basic_col].describe().transpose()

In [None]:
def describe_df(columns):
    """Print the mean and the median of each named column of the notebook-level ``df``.

    Parameters
    ----------
    columns : iterable of str
        Names of columns of ``df`` to report, one printed line per column.
    """
    template = '{column} mean value={mean}, median value={median}'
    for column in columns:
        series = df[column]
        print(template.format(column=column, mean=series.mean(), median=series.median()))
describe_df(['AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1'])

In [None]:
df['AMT_ANNUITY'].mean()

In [None]:
df['NAME_INCOME_TYPE'].unique()

In [None]:
df[df['DAYS_EMPLOYED'] == 365243][['DAYS_EMPLOYED', 'NAME_INCOME_TYPE']].groupby('NAME_INCOME_TYPE').count()

In [None]:
# Fill the gaps in each of these numeric columns with that column's own median.
# The median is taken over every row of df, i.e. over train and test rows together.
for median_fill_col in ['AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS',
                        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    column_median = df[median_fill_col].median()
    df[median_fill_col] = df[median_fill_col].fillna(column_median)

In [None]:
find_missing(df[app_num_basic_col])

In [None]:
find_missing(df[app_cat_basic_col])

In [None]:
# Heatmap of null cells in the categorical columns (one row of pixels per record).
plt.subplots(figsize=(20,12))
sns.heatmap(df[app_cat_basic_col].isnull(), yticklabels = False, cbar = False,cmap = 'tab20c_r')
# df holds the train and test rows combined, so the title must not claim "Training Set".
plt.title('Missing Data: Categorical Features (Train + Test)')
plt.show()

- NAME_TYPE_SUITE and OCCUPATION_TYPE have missing values
- OCCUPATION_TYPE has a lot of missing values, so this column is dropped for now
- For NAME_TYPE_SUITE, missing values are filled with a placeholder category, NTS_XNA, for now

In [None]:
# Drop OCCUPATION_TYPE from the categorical column list.
# Rebuilding the list (instead of list.remove) keeps the cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
app_cat_basic_col = [col for col in app_cat_basic_col if col != 'OCCUPATION_TYPE']

In [None]:
# Drop the OCCUPATION_TYPE column from the working frame.
# errors='ignore' makes the cell idempotent: re-running it after the column is gone
# is a no-op instead of a KeyError.
df = df.drop(columns='OCCUPATION_TYPE', errors='ignore')

In [None]:
df.shape

In [None]:
df['NAME_TYPE_SUITE']=df['NAME_TYPE_SUITE'].fillna('NTS_XNA')

In [None]:
def plot_categorical_pie(data, column, title, hole=.3):
    """Show a plotly donut chart of the value counts of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Column whose distinct values become the slices.
    title : str
        Figure title.
    hole : float, default .3
        Passed to ``go.Pie`` as ``hole`` (size of the central cut-out).
    """
    counts = data[column].value_counts()
    pie = go.Pie(labels=counts.index, values=counts.values, hole=hole)
    fig = go.Figure(data=[pie])
    fig.update_layout(title_text=title)
    fig.show()

In [None]:
def plot_categorical(data, column, size=(8, 4), xlabel_angle=0, title=''):
    """Show a seaborn bar chart of the value counts of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Column whose distinct values are placed on the x axis.
    size : sequence of two numbers, default (8, 4)
        Figure size passed to ``plt.figure(figsize=...)``. The default is a tuple
        rather than a list so that it is not a shared mutable default argument;
        callers may still pass a list.
    xlabel_angle : float, default 0
        Rotation applied to the x tick labels.
    title : str, default ''
        Figure title.
    """
    plotdata = data[column].value_counts()
    plt.figure(figsize=size)
    sns.barplot(x=plotdata.index, y=plotdata.values)
    plt.title(title)
    plt.xticks(rotation = xlabel_angle)
    plt.show()

In [None]:
plot_categorical_pie(df, 'TARGET', 'Label Target ', .6)

In [None]:
df['NAME_INCOME_TYPE'].unique()

In [None]:
plot_categorical_pie(df, 'NAME_INCOME_TYPE', 'Income Type', .7)

In [None]:
plot_categorical(df,'NAME_EDUCATION_TYPE', size=[10,6], xlabel_angle=70,title='Education Type')

In [None]:
# The chart shows NAME_FAMILY_STATUS, so title it accordingly
# (the previous title 'Income Type' was copied from the income-type chart).
plot_categorical_pie(df, 'NAME_FAMILY_STATUS', 'Family Status', .4)

In [None]:
# Correlate numeric columns only. df still contains string-valued categorical columns
# here, and DataFrame.corr() raises on them in pandas >= 2.0 (older versions dropped
# them silently). Selecting the numeric dtypes first behaves the same on both.
corr_matrix = df.select_dtypes(include='number').corr()

In [None]:
plt.subplots(figsize=(20,12))
sns.heatmap(corr_matrix, cmap = plt.cm.RdYlBu_r, annot = True, vmin = -0.25, vmax=0.6)
plt.title('Correlation Heatmap');

- Draw distribution of numeric features

In [None]:
df['CNT_FAM_MEMBERS'].unique()

In [None]:
df['CNT_FAM_MEMBERS'].plot.hist(title = 'Count of Family members Histogram');
plt.xlabel('Count of family members');

In [None]:
# Plot age in years. The division uses -365, as the later age cells do, so the ages come
# out positive; dividing by +365 put the whole histogram on the negative axis.
(df['DAYS_BIRTH'] / -365).plot.hist(title = 'Age Histogram');
plt.xlabel('Age (years)');

In [None]:
# Age information into a separate dataframe
# Age information into a separate dataframe.
# .copy() makes age_data independent of df, so the two column assignments below do not
# write into a slice of df (which triggers SettingWithCopyWarning).
age_data = df[['TARGET', 'DAYS_BIRTH']].copy()
# Dividing by -365 flips the sign so the age in years is positive.
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / -365

# Bin the age data: 11 edges from 20 to 70 give ten 5-year bins
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)

In [None]:
age_groups  = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
plt.figure(figsize = (8, 8))

# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');

- Handling Outlier

In [None]:
sns.boxplot(data=df['DAYS_EMPLOYED'])

In [None]:
df['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
round(df[df['DAYS_EMPLOYED'] == 365243]['DAYS_EMPLOYED'].count() / len(df) * 100, 2)

In [None]:
round(df[df['DAYS_EMPLOYED'] != 365243]['DAYS_EMPLOYED'].count() / len(df) * 100 ,2)

- DAYS_EMPLOYED contains anomalous values
- Around 18% of all rows have the value 365243 in this field
- This value does not make sense for this data (it would correspond to roughly 1,000 years), so it needs to be handled
- It is replaced with np.nan, which is then filled with the column median
- A new flag column, DAYS_EMPLOYED_ANOM, records True/False for whether the original value was anomalous

In [None]:
# Create an anomalous flag column
# Create an anomalous flag column (must run before the sentinel value is replaced below)
df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan.
# Assign the result back instead of calling replace(..., inplace=True) on df['DAYS_EMPLOYED']:
# the in-place form acts on the selected Series and is not guaranteed to update df itself
# (it is a no-op on df under pandas copy-on-write).
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

# Fill the resulting gaps with the median of the remaining values
df['DAYS_EMPLOYED']=df['DAYS_EMPLOYED'].fillna(df['DAYS_EMPLOYED'].median())

# Plot employment length in years; dividing by -365 flips the sign so the axis is positive
(df['DAYS_EMPLOYED'] / -365).plot.hist(title = 'Years Employed Histogram');
plt.xlabel('Years Employed');

After removing the anomalies, the histogram above shows that employment length ranges from 0 to about 49 years in magnitude, as described below.

### creating combined basic features from numerical and categorical

In [None]:
basic_features = app_num_basic_col + app_cat_basic_col 

In [None]:
len(basic_features)

In [None]:
find_missing(df[basic_features])

In [None]:
len(basic_features)

In [None]:
basic_features.append('DAYS_EMPLOYED_ANOM')

In [None]:
len(basic_features)

In [None]:
df[df['DAYS_EMPLOYED'] / -365 > 8]['DAYS_EMPLOYED'].count()

In [None]:
(df['DAYS_BIRTH'] / -365).describe()

In [None]:
df[df['CODE_GENDER'] == 'XNA']

In [None]:
# Drop the records whose gender is the placeholder 'XNA'.
# .copy() gives an independent frame so the in-place column encoding that follows does
# not write into a slice of the unfiltered frame.
# NOTE(review): df holds train and test rows together, so any test row with 'XNA'
# would be dropped too and would then be missing from the submission — confirm that
# only train rows are affected.
df = df[df['CODE_GENDER'] != 'XNA'].copy()

In [None]:
df.shape

### Label encoding for categorical features whose values are binary, like Y/N, Yes/No, True/False, M/F, etc.

In [None]:
df[['SK_ID_CURR','CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']].head(10)

In [None]:
# Categorical features with Binary encode (0 or 1; two categories)
# Integer-encode the two-valued columns in place.
# pd.factorize numbers the distinct values in order of first appearance, so which
# value becomes 0 depends on the row order of df.
binary_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']
for bin_feature in binary_columns:
    codes, uniques = pd.factorize(df[bin_feature])
    df[bin_feature] = codes

In [None]:
df[['SK_ID_CURR','CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'DAYS_EMPLOYED_ANOM']].head(10)

Out of the basic categorical features above, we have already binary-encoded:
- FLAG_OWN_CAR
- FLAG_OWN_REALTY
- CODE_GENDER
- DAYS_EMPLOYED_ANOM

Now doing one hot encoding for remaining features
- NAME_CONTRACT_TYPE
- NAME_TYPE_SUITE
- NAME_INCOME_TYPE
- NAME_EDUCATION_TYPE
- NAME_FAMILY_STATUS
- NAME_HOUSING_TYPE
- ORGANIZATION_TYPE

In [None]:
one_hot_encode_col = ['NAME_CONTRACT_TYPE',
'NAME_TYPE_SUITE',
'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS',
'NAME_HOUSING_TYPE',
'ORGANIZATION_TYPE']

In [None]:
dummy_df = pd.get_dummies(df[one_hot_encode_col], dummy_na=False, drop_first=True)

In [None]:
len(dummy_df.columns)

In [None]:
df.shape

In [None]:
len(basic_features)

In [None]:
# Remove the original categorical columns now that their dummy columns exist in dummy_df.
# errors='ignore' keeps the cell re-runnable: a second run is a no-op, not a KeyError.
df = df.drop(columns=one_hot_encode_col, errors='ignore')

In [None]:
# Take the one-hot-encoded source columns out of the feature list.
# Filtering into a new list (rather than list.remove in a loop) keeps the cell
# re-runnable: remove() raises ValueError once a name has already been removed.
basic_features = [feature for feature in basic_features if feature not in one_hot_encode_col]

In [None]:
len(basic_features)

In [None]:
df.shape

### creating final dataframe with required features

In [None]:
len(df[basic_features].columns)

In [None]:
len(dummy_df.columns)

In [None]:
df = pd.concat([df[basic_features], dummy_df], axis=1)

In [None]:
del dummy_df
gc.collect()

In [None]:
df.shape

## Model 1 : Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(C = 0.0001)

In [None]:
df.loc[df.TARGET.isnull()].shape

In [None]:
df.loc[df.TARGET.notnull()].shape

In [None]:
# Training rows are those with a known TARGET; only the label column is dropped here.
# NOTE(review): SK_ID_CURR is still part of the selected columns at this point, so the
# applicant identifier is passed to the model as a feature. Removing it would have to be
# done for X_test as well (the submission cell reads SK_ID_CURR from X_test) — confirm intent.
X_train = df.loc[df.TARGET.notnull()].drop('TARGET',axis=1)

In [None]:
find_missing(X_train)

In [None]:
# Labels of the training rows (rows with a known TARGET).
# Row mask and column are selected in one .loc call instead of chained indexing
# (df.loc[mask]['TARGET']), which builds an intermediate frame first.
y_train = df.loc[df.TARGET.notnull(), 'TARGET']

In [None]:
y_train.shape

In [None]:
X_test = df.loc[df.TARGET.isnull()].drop('TARGET', axis=1)

In [None]:
X_test.shape

In [None]:
# Train on the training data
log_reg.fit(X_train, y_train)

In [None]:
# Make predictions
# Make sure to select the second column only
log_reg_pred = log_reg.predict_proba(X_test)[:,1]

In [None]:
len(log_reg_pred)

In [None]:
log_reg_pred

In [None]:
# Build the submission: one row per test applicant with the predicted default probability.
# .copy() detaches the id column from X_test so adding TARGET does not assign into a
# slice of X_test (SettingWithCopyWarning).
submit = X_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.to_csv('logistic_regression.csv', index = False)

## Dealing with Imbalance Data using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
y_train.value_counts()

In [None]:
# Oversample the minority class with SMOTE.
# - fit_resample is the current imbalanced-learn API; fit_sample was deprecated and later removed.
# - random_state pins the synthetic samples so the notebook is reproducible on Run All.
# Note: this overwrites X_train / y_train with the resampled data.
smt = SMOTE(random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
# Predict with the model that was re-fitted on the SMOTE-resampled data.
# Previously this cell wrote log_reg_pred, which was computed before the re-fit, so the
# "with_smote" file contained exactly the same predictions as the first submission.
log_reg_smote_pred = log_reg.predict_proba(X_test)[:, 1]

# .copy() detaches the id column from X_test so adding TARGET does not assign into a slice.
submit = X_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_smote_pred

submit.to_csv('logistic_regression_with_smote.csv', index = False)