In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the main application table from the Kaggle input directory and preview the first rows.
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
train.head()

In [None]:
# Remove columns that are not used in this analysis:
#   * the _MODE and _MEDI variants of the building statistics (the _AVG variants are kept),
#   * the remaining building descriptor columns listed below,
#   * the FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 indicator columns.
building_stats = ['APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
                  'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
                  'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS', 'NONLIVINGAREA']
building_descriptors = ['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
                        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

# Same names, in the same order, as the original hand-written list.
dels = ([stat + '_' + suffix for suffix in ('MODE', 'MEDI') for stat in building_stats]
        + building_descriptors
        + ['FLAG_DOCUMENT_' + str(i) for i in range(2, 22)])

# Pass the column labels directly; the original passed a DataFrame slice (train[dels])
# as the labels, which only worked because iterating a DataFrame yields its column names.
train1 = train.drop(columns = dels)
train1.shape

* The raw application dataset consists of 307511 rows and 122 columns (before the column drop above)
* Each row has a unique id (SK_ID_CURR) and an output label (TARGET)
* TARGET is either 0 (loan was repaid) or 1 (loan was not repaid)

#### Target Distribution

In [None]:
# Share of rows falling into each TARGET class
target_counts = train1['TARGET'].value_counts()
target_counts.div(len(train1)).to_frame()

#### Additional data exploration

In [None]:
# Column dtypes and non-null counts; max_cols is raised so the per-column listing is not truncated.
train1.info(max_cols = 100)

In [None]:
# Missing values per column: absolute count and percentage of rows,
# ordered from the most to the least incomplete column.
null_counts = train1.isnull().sum()
count = null_counts.sort_values(ascending = False)
percentage = null_counts.div(len(train1)).mul(100).sort_values(ascending = False)
missing = pd.concat({'Count': count, 'Percentage': percentage}, axis = 1)
missing.head(35)

Of the 34 columns with missing data, 14 features have more than 50% missing values

#### Exploratory Visualizations

In [None]:
#distribution of credit amounts
# The frame has to be passed as `data`; the original passed the keyword `x` twice,
# which Python rejects as a SyntaxError (repeated keyword argument).
sns.displot(x = 'AMT_CREDIT', data = train1, bins = 100)

Credit levels are right skewed and outliers exist

In [None]:
# Histogram of DAYS_BIRTH (the age column, still expressed in days here), 100 bins
sns.displot(
    data = train1,
    x = 'DAYS_BIRTH',
    bins = 100,
)

DAYS_BIRTH is recorded in days rather than years; it is converted to an age in years in the data preparation section below.

In [None]:
# Number of rows per occupation type, drawn as horizontal bars
occupation_counts = train1.groupby(['OCCUPATION_TYPE'])['OCCUPATION_TYPE'].count()
occupation_counts.sort_values(ascending=False).plot(kind='barh', figsize=(8,6))
plt.show()

In [None]:
# Number of rows per education level, drawn as horizontal bars
education_counts = train1.groupby(['NAME_EDUCATION_TYPE'])['NAME_EDUCATION_TYPE'].count()
education_counts.sort_values(ascending=False).plot(kind='barh')
plt.show()

In [None]:
# Number of rows per family status, drawn as horizontal bars
family_status_counts = train1.groupby(['NAME_FAMILY_STATUS'])['NAME_FAMILY_STATUS'].count()
family_status_counts.sort_values(ascending=False).plot(kind='barh')
plt.show()


In [None]:
# Income against loan amount, with points coloured by TARGET
fig, ax = plt.subplots(figsize=(10, 10))
a = sns.scatterplot(
    data = train1,
    x = 'AMT_INCOME_TOTAL',
    y = 'AMT_CREDIT',
    hue = 'TARGET',
)
# Restrict the visible income range to 0 - 1,000,000
a.set(xlim=(0, 1000000))


In [None]:
# Goods price per application weekday, split by contract type
plt.subplots(figsize=(10, 10))
b = sns.boxplot(
    data = train1,
    x = 'WEEKDAY_APPR_PROCESS_START',
    y = 'AMT_GOODS_PRICE',
    hue = 'NAME_CONTRACT_TYPE',
    palette = ['m', 'g'],
)
# Restrict the visible price range to 0 - 1,000,000
b.set(ylim=(0, 1000000))

In [None]:
# Credit amount per TARGET class, split by application weekday
plt.subplots(figsize=(10, 10))
c = sns.boxplot(
    data = train1,
    x = 'TARGET',
    y = 'AMT_CREDIT',
    hue = 'WEEKDAY_APPR_PROCESS_START',
)
# Restrict the visible credit range to 0 - 1,750,000
c.set(ylim=(0, 1750000))

In [None]:
# Credit amount per education level, split by family status
plt.subplots(figsize=(10, 10))
d = sns.boxplot(
    data = train1,
    x = 'NAME_EDUCATION_TYPE',
    y = 'AMT_CREDIT',
    hue = 'NAME_FAMILY_STATUS',
)
# Restrict the visible credit range to 0 - 2,000,000
d.set(ylim=(0, 2000000))

In [None]:
# Loan amount against income; colour encodes education, marker size encodes DAYS_BIRTH
fig, ax = plt.subplots(figsize=(10, 10))
e = sns.scatterplot(
    data = train1,
    x = 'AMT_CREDIT',
    y = "AMT_INCOME_TOTAL",
    hue = 'NAME_EDUCATION_TYPE',
    size = 'DAYS_BIRTH',
    alpha = 0.6,
)
# Restrict the visible income range to 25,000 - 600,000
e.set(ylim=(25000, 600000))

In [None]:
# Horizontal bars of AMT_CREDIT per application weekday, split by TARGET
sns.barplot(
    data = train1,
    x = 'AMT_CREDIT',
    y = 'WEEKDAY_APPR_PROCESS_START',
    hue = 'TARGET',
)

In [None]:
# Bars of DAYS_BIRTH per TARGET class (seaborn's default aggregate)
sns.barplot(
    data = train1,
    x = 'TARGET',
    y = 'DAYS_BIRTH',
)

## Feature Correlations

In [None]:
# Correlation of every numeric feature with TARGET, sorted ascending.
# Restrict to numeric/boolean columns explicitly: train1 still contains string columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently skipping them.
numeric_features = train1.select_dtypes(include = ['number', 'bool'])
corr1 = numeric_features.corr()['TARGET'].sort_values()

#strongest negative correlation
print('Features with Strongest Negative Correlation:')
corr1.head()

In [None]:
# Correlation heatmap of TARGET with the most negatively correlated features
neg_cols = ['TARGET','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_EMPLOYED']
neg = train1[neg_cols]
neg_corr = neg.corr()

sns.heatmap(data = neg_corr, annot = True)

In [None]:
#strongest positive correlation
# NOTE(review): corr1 is built from the TARGET column of the correlation matrix, so it
# also contains TARGET's correlation with itself; that entry presumably occupies one of
# the five rows shown here - confirm.
print('Features with Strongest Positive Correlation:')
corr1.tail()

In [None]:
# Correlation heatmap of TARGET with the most positively correlated features
pos_cols = ['TARGET','DAYS_LAST_PHONE_CHANGE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'DAYS_BIRTH']
pos = train1[pos_cols]
pos_corr = pos.corr()

sns.heatmap(data = pos_corr, annot = True)

In [None]:
# Pairwise correlation matrix of the numeric features.
# String columns are excluded explicitly because DataFrame.corr() raises on them in
# pandas >= 2.0; train1 is already a DataFrame, so re-wrapping it in pd.DataFrame is dropped.
feat_corr = train1.select_dtypes(include = ['number', 'bool']).corr()

In [None]:
# Heatmap of the full feature correlation matrix on a 16x16 inch figure, without cell annotations.
# NOTE(review): `label = 'small'` is not a documented heatmap option; it is presumably
# forwarded to the underlying matplotlib artist and has no visible effect - confirm or remove.
plt.subplots(figsize=(16, 16))
sns.heatmap(feat_corr, annot = False, label = 'small')

In [None]:
# List the feature pairs whose absolute correlation exceeds 0.75.

# Keep only the strict upper triangle so each pair appears once and the diagonal is excluded.
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(feat_corr.shape), k = 1).astype(bool)

# Long format: one row per (feature, feature) pair; masked cells are NaN.
corr_df = feat_corr.where(upper_mask).unstack().reset_index()
corr_df.columns = ['Feature A', 'Feature B', 'Correlation']

# Drop the masked pairs, then round to two decimals and take the magnitude
# (same order as before: round first, then abs). No in-place mutation is used.
corr_df = (
    corr_df
    .dropna(subset = ['Correlation'])
    .assign(Correlation = lambda pairs: pairs['Correlation'].round(2).abs())
)

matrix = corr_df.sort_values(by = 'Correlation', ascending = False)
max_corr = matrix[matrix['Correlation'] > 0.75]
max_corr

Consider dropping columns identified above. Potentially:
* livingapartments_avg
* livingarea_avg
* cnt_fam_members
* def_30_cnt_social_circle
* elevators_avg



### Data Preparation & Feature Engineering

In [None]:
#replace 365243 in days employed with nan
# Assign the result back rather than calling replace(inplace=True) on the selected column:
# the chained in-place call is deprecated and leaves train1 unchanged under pandas copy-on-write.
train1['DAYS_EMPLOYED'] = train1['DAYS_EMPLOYED'].replace(365243, np.nan)

#convert age to years
# Dividing by -365 flips the sign (presumably DAYS_BIRTH is stored as negative days - confirm).
train1['AGE'] = train1['DAYS_BIRTH'] / - 365

#keep only rows with income below 2.5 million
# .copy() makes the filtered frame independent, so later column assignments on train1
# do not act on a view of the unfiltered data.
train1 = train1[train1['AMT_INCOME_TOTAL'] < 2500000].copy()

In [None]:
# Features removed before modelling.
# NOTE(review): the markdown above suggests dropping def_30_cnt_social_circle, while this
# list removes both OBS_30_ and OBS_60_CNT_SOCIAL_CIRCLE - confirm which was intended.
drop_list = [
    'DAYS_BIRTH',
    'LIVINGAPARTMENTS_AVG',
    'LIVINGAREA_AVG',
    'CNT_FAM_MEMBERS',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'ELEVATORS_AVG',
]

train1 = train1.drop(columns = drop_list)
train1.info(max_cols = 100)

In [None]:
#create credit/annuity ratio feature
train1['CA_RATIO'] = train1['AMT_CREDIT'] / train1['AMT_ANNUITY']

#create credit/cost of goods ratio feature
train1['CG_RATIO'] = train1['AMT_CREDIT'] / train1['AMT_GOODS_PRICE']

#create avg of each row of EXT_SOURCE values
# Select the columns by name: the original used iloc[:, 39:42], which silently picks
# different columns whenever the set or order of preceding columns changes.
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Row-wise mean over the available scores; mean() skips NaN, so the result is NaN only
# when all three are missing (same value as sum / (3 - number of missing)).
train1['AVG_EXT'] = train1[ext_cols].mean(axis = 1)

# Fill each missing score with the row average. The result is assigned back because
# fillna(inplace=True) on a selected column does not update train1 under copy-on-write.
for ext_col in ext_cols:
    train1[ext_col] = train1[ext_col].fillna(train1['AVG_EXT'])

train1.info(max_cols = 75)

In [None]:
#convert categorical features to the pandas 'category' dtype (currently disabled)
#cat_cols = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 
           # 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
           # 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
           # 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

#train1[cat_cols] = train1[cat_cols].astype('category')

In [None]:
#one hot encoder function
#def OHE(df, nan_as_category = True):
    #columns = list(df.columns)
    #cat_cols = df.select_dtypes(['category', 'object']).columns.tolist()

#eliminating outliers for numeric variables ## 
import scipy.stats as stats

# Apply the IQR rule to numeric features only. The row identifier and the label are excluded:
# they are not features, and TARGET in particular must never decide which rows are removed.
outlier_cols = train1.select_dtypes(include = 'number').columns.difference(['SK_ID_CURR', 'TARGET'])

Q1 = train1[outlier_cols].quantile(q=.25)
Q3 = train1[outlier_cols].quantile(q=.75)
# nan_policy='omit' keeps the IQR consistent with quantile(), which also ignores NaN;
# with the default policy a single missing value turns the whole column's IQR into NaN.
IQR = train1[outlier_cols].apply(stats.iqr, nan_policy = 'omit')

# Skip columns with no spread between the quartiles (e.g. mostly-constant flags and counts):
# with IQR == 0 both bounds collapse to one value and every other value would be flagged,
# which removes entire groups of rows rather than outliers.
outlier_cols = IQR.index[IQR > 0]
lower_bound = (Q1 - 1.5*IQR)[outlier_cols]
upper_bound = (Q3 + 1.5*IQR)[outlier_cols]

#only keep rows in dataframe that have values within 1.5*IQR of Q1 and Q3
# Comparisons with NaN are False, so missing values never mark a row as an outlier.
is_outlier = (train1[outlier_cols] < lower_bound) | (train1[outlier_cols] > upper_bound)
train_clean = train1[~is_outlier.any(axis=1)]

#find how many rows are left in the dataframe 
train_clean.shape

In [None]:
#####################################

## Credit Card Balance

In [None]:
#read credit card balance dataset into the notebook and preview the first rows
cc_balance = pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')
cc_balance.head()

In [None]:
#print shape of the credit card balance dataset
print('Credit Card Balance Shape:', cc_balance.shape)


In [None]:
#create late payment feature
# 1 when SK_DPD is positive, otherwise 0 (presumably SK_DPD is days past due - confirm).
# Vectorised comparison instead of a per-row Python lambda.
cc_balance['LATE PYMT'] = (cc_balance['SK_DPD'] > 0).astype('int64')

#create card use limit feature
# A credit limit of 0 would turn the ratio into +/-inf, and a single inf makes the
# per-client max/mean aggregates inf as well; treat the ratio as missing in that case.
credit_limit = cc_balance['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
cc_balance['USE_LIMIT'] = cc_balance['AMT_BALANCE'] / credit_limit

In [None]:
#group numerical features by SK_ID_CURR
# Aggregate numeric columns only: cc_balance also holds string columns, and 'mean' on those
# raises in pandas >= 2.0. SK_ID_PREV is an identifier, so its min/max/mean carry no signal.
cc_numeric = cc_balance.select_dtypes(include = 'number').drop(columns = ['SK_ID_PREV'], errors = 'ignore')
cc_num = cc_numeric.groupby(by = ['SK_ID_CURR']).agg(['min', 'max', 'mean'])

# Flatten the (column, statistic) MultiIndex into single-level names such as
# 'CC_AMT_BALANCE_MEAN'. Merging a frame with two column levels into train1 (one level)
# on 'SK_ID_CURR' is rejected by pandas.
cc_num.columns = ['CC_' + col + '_' + stat.upper() for col, stat in cc_num.columns]
cc_num = cc_num.reset_index()
cc_num.head()

In [None]:
# Per-client share of each category level: one-hot encode the string columns,
# attach the client id, then average the indicator columns within each client.
cc_dummies = pd.get_dummies(cc_balance.select_dtypes('object'))
cc_dummies['SK_ID_CURR'] = cc_balance['SK_ID_CURR']
cc_cat = cc_dummies.groupby(by = ['SK_ID_CURR']).mean().reset_index()
cc_cat


In [None]:
# Attach the aggregated credit-card features to the training frame.
# Left joins keep every row of train1; clients without credit-card history get NaN.
train1 = (
    train1
    .merge(cc_num, on = 'SK_ID_CURR', how = 'left')
    .merge(cc_cat, on = 'SK_ID_CURR', how = 'left')
)

In [None]:
# Inspect the columns after the credit-card merge; max_cols is raised so the listing is not truncated.
train1.info(max_cols = 300)

### Bureau and Bureau Balance

In [None]:
#read bureau balance dataset into the notebook and preview the first rows
b_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
b_balance.head()

In [None]:
# Column dtypes and memory usage of the bureau balance table
b_balance.info()

In [None]:
# Row and column count of the bureau balance table
print('Bureau Balance Shape:', b_balance.shape)

In [None]:
#read bureau dataset into the notebook and preview the first rows
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau.head()

In [None]:
# Row and column count of the bureau table
print('Bureau Shape:', bureau.shape)

In [None]:
# Column dtypes and non-null counts of the bureau table
bureau.info()

In [None]:
#merge bureau and bureau balance
# Left join on SK_ID_BUREAU keeps every bureau record. The result overwrites `bureau`,
# so re-running this cell merges b_balance into the already merged frame.
# NOTE(review): presumably b_balance has several rows per SK_ID_BUREAU, in which case this
# join repeats each bureau record once per matching balance row - confirm before aggregating.
bureau = bureau.merge(b_balance, on = 'SK_ID_BUREAU', how = 'left')

In [None]:
# Preview the merged bureau / bureau balance frame
bureau.head()

#### Data Preparation and Feature Engineering

In [None]:
#create late payment feature
# This cell re-creates the same two cc_balance features as the earlier credit-card section.
# 1 when SK_DPD is positive, otherwise 0 (presumably SK_DPD is days past due - confirm).
# Vectorised comparison instead of a per-row Python lambda.
cc_balance['LATE PYMT'] = (cc_balance['SK_DPD'] > 0).astype('int64')

#create card use limit feature
# A credit limit of 0 would turn the ratio into +/-inf; treat the ratio as missing instead
# so that inf values do not leak into later aggregates.
credit_limit = cc_balance['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
cc_balance['USE_LIMIT'] = cc_balance['AMT_BALANCE'] / credit_limit

In [None]:
# Client id, month offset and the monetary (AMT_*) columns used for aggregation
cc_amount_cols = [
    'SK_ID_CURR', 'MONTHS_BALANCE',
    'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT',
    'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT',
    'AMT_INST_MIN_REGULARITY',
    'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
    'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
]
cc_bal1 = cc_balance[cc_amount_cols]

In [None]:
#Aggregate rows and group by current ID
# Computes min, max, mean and sum of every selected column per SK_ID_CURR.
# SK_ID_CURR stays in the index (no reset_index) and the resulting columns are
# (column, statistic) pairs.
cc_agg = cc_bal1.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum'])

In [None]:
# Display the per-client aggregates
cc_agg

In [None]:
# Combining numerical features
#grp = cc_balance.drop('SK_ID_PREV', axis =1).groupby(by=['SK_ID_CURR']).mean().reset_index()
#prev_columns = ['CC_'+column if column != 'SK_ID_CURR' else column for column in grp.columns ]
#grp.columns = prev_columns
#train_cc = train1.merge(grp, on =['SK_ID_CURR'], how = 'left')
#train_cc.update(train[grp.columns].fillna(0))


# Combining categorical features
#cc_cat = pd.get_dummies(cc_balance.select_dtypes('object'))
#cc_cat['SK_ID_CURR'] = cc_balance['SK_ID_CURR']
#grp = cc_cat.groupby('SK_ID_CURR').mean().reset_index()
#grp.columns = ['CC_'+column if column != 'SK_ID_CURR' else column for column in grp.columns]
#train_cc = train1.merge(grp, on=['SK_ID_CURR'], how='left')
#train1.update(train1[grp.columns].fillna(0))