In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#라이브러리
import numpy as np
import pandas as pd
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
#화면조정
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',200)

In [None]:
#디렉터리 리스트확인
print(os.listdir("../input/home-credit-default-risk"))

### Application_train과 Application_test 데이터 read_csv

In [None]:
#Application_train 데이터 feature 확인
app_train=pd.read_csv("../input/home-credit-default-risk/application_train.csv")
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
#Testing data features 확인
app_test=pd.read_csv("../input/home-credit-default-risk/application_test.csv")
print('Testing data shape: ', app_test.shape)
app_test.head()

## Target column의 분포 시각화
#### Target변수는 0(제때 대출금 상환 가능), 1(상환 어려움)을 예측하는 반응변수

In [None]:
#Target의 0과 1의 수
app_train['TARGET'].value_counts()

In [None]:
app_train['TARGET'].astype(int).plot.hist()
#불균형 데이터임을 시각화로 확인

#### app_train과 app_test를 합쳐서 한번에 데이터 preprocessing 수행
##### 기본 데이터셋 그대로 활용

In [None]:
apps=pd.concat([app_train, app_test])
apps.shape
#307511+48744=356255

In [None]:
apps['TARGET'].value_counts(dropna=False)
#test데이터는 TARGET값이 NaN

In [None]:
apps.info()

#### Object feature(범주형데이터) 처리
##### 두가지 범주만 가지는 변수->Label Encoding
##### 세가지 이상의 범주를 가지는 변수 -> Onehot Encoding

##### 여기에서는 먼저, Feature importance를 보이기 위해 Label Encoding만 하고, 추후에 Onehot Encoding까지 하기로 합니다.
##### 이때만 따로 사용할 apps_train과 apps_test데이터를 만듭니다

In [None]:
object_columns=apps.dtypes[apps.dtypes=='object'].index.tolist()
object_columns

In [None]:
def show_category_by_target(df, columns):
    """Draw one TARGET-faceted count plot per categorical column.

    For every name in ``columns`` a seaborn ``catplot`` (kind='count') is built
    from ``df`` with one panel per ``TARGET`` value, and the x tick labels are
    rotated by 65 degrees.
    """
    for category_column in columns:
        facet_grid = sns.catplot(data=df, x=category_column, col='TARGET', kind='count')
        facet_grid.set_xticklabels(rotation=65)

show_category_by_target(app_train, object_columns)

In [None]:
# Label-encode every object-dtype column of the combined frame in place.
# pd.factorize returns (codes, uniques); only the integer codes are stored.
column_dtypes = apps.dtypes
object_columns = column_dtypes[column_dtypes == 'object'].index.tolist()

for column in object_columns:
    codes, _uniques = pd.factorize(apps[column])
    apps[column] = codes

In [None]:
apps.info()

## EDA
#### train data에 대한 EDA
#### Null값 일괄 변환

In [None]:
apps.isnull().sum()

In [None]:
apps=apps.fillna(-999)

#### 학습데이터와 테스트 데이터 다시 분리

In [None]:
apps_train=apps[apps['TARGET']!= -999]
apps_test=apps[apps['TARGET']== -999]
apps_train.shape, apps_test.shape

In [None]:
apps_test=apps_test.drop('TARGET', axis=1, inplace=False)
apps_test.shape

#### train data를 학습용(train)과 검증용(validation) 데이터로 분리하고 LGBM Classifier로 학습 수행
##### ftr_app=피처용 데이터, target_app=타겟 데이터

In [None]:
from sklearn.model_selection import train_test_split

ftr_app = apps_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_train['TARGET']

train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

#### Feature importance 시각화
##### 데이터에 특별한 가공 없는 상태

In [None]:
from lightgbm import LGBMClassifier

# Baseline LightGBM classifier fitted on train_x / train_y.
# NOTE(review): `subsample` is LightGBM's bagging fraction; per the LightGBM parameter docs it
# only takes effect when `subsample_freq` (bagging_freq) is > 0, which is not set here — confirm intent.
# NOTE(review): `silent` and the `verbose` / `early_stopping_rounds` arguments of fit() look like the
# pre-4.0 LightGBM API; newer releases expect callbacks instead — confirm the installed version.
clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        silent=-1,
        verbose=-1
        )

# AUC is evaluated on both the training and the validation split (eval_set holds both).
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
        early_stopping_rounds= 50)

In [None]:
from lightgbm import plot_importance
plot_importance( clf, figsize=(16,32))

#### feature들에 대해 TARGET값 분포도 시각화

In [None]:
def show_hist_by_target(df, columns):
    """Plot the distribution of each column split by TARGET.

    For every name in ``columns`` a 1x2 figure is drawn: a violin plot of the
    column grouped by ``TARGET`` (left panel) and overlaid distribution plots
    of the TARGET == 1 rows (red) and TARGET == 0 rows (blue) (right panel).

    Args:
        df: DataFrame holding a ``TARGET`` column and every column in ``columns``.
        columns: iterable of column names to plot.
    """
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    for column in columns:
        fig, ax = plt.subplots(figsize=(12, 4), nrows=1, ncols=2, squeeze=False)
        sns.violinplot(x='TARGET', y=column, data=df, ax=ax[0][0])
        # .loc picks rows and column in a single step instead of chained indexing.
        sns.distplot(df.loc[cond_1, column], label='1', color='red', ax=ax[0][1])
        sns.distplot(df.loc[cond_0, column], label='0', color='blue', ax=ax[0][1])
        # The label= arguments are only rendered once a legend is requested.
        ax[0][1].legend(title='TARGET')

In [None]:
# Feature Importances 에서 중요하게 나왔던 상위 20개의 columns

columns = ['AMT_INCOME_TOTAL','AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
           'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'EXT_SOURCE_1', 
           'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 
           'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

show_hist_by_target(app_train, columns)

#### 상관관계가 높았던 3개의 변수 EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3의 시각화

In [None]:
# TARGET 변수와 EXT_SOURCE와의 상관관계와 EXT_SOURCE 서로간의 상관관계를 살핌

ext_data=app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
ext_data_corrs=ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize=(8,6))

sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, vmin=-0.25, annot=True, vmax=0.6)
plt.title('Correlation Heatmap');

In [None]:
plt.figure(figsize=(10,12))

# iterate through the sources
for i, source in enumerate(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']):
    plt.subplot(3,1,i+1)
    
    sns.kdeplot(app_train.loc[app_train['TARGET']==0,source],label='target==0')
    sns.kdeplot(app_train.loc[app_train['TARGET']==1,source],label='target==1')
    
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' %source);
    plt.ylabel('Density');
plt.tight_layout(h_pad=2.5)

#### 이 3가지 변수 외에 DAYS_BIRTH 의 TARGET에 대한 상관관계가 높은 편이었으므로 DAYS_BIRTH의 특성도 파악해보고자 합니다

In [None]:
# Replace DAYS_BIRTH by its absolute value (presumably the raw values are negative day offsets)
# and show its correlation with TARGET.
# NOTE: this overwrites the column on app_train itself, so every later cell that reads app_train
# (including any frame rebuilt from it) sees the absolute values rather than the raw ones.
app_train['DAYS_BIRTH']=abs(app_train['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
plt.style.use('fivethirtyeight')

# 고객 나이에 대한 히스토그램 분포 확인
plt.hist(app_train['DAYS_BIRTH']/365, edgecolor='k',bins=25)
plt.title('Age of Client');
plt.xlabel('Age (years)');
plt.ylabel('Count');

## 40대 고객이 가장 많은 것을 확인 할 수 있다.

In [None]:
plt.figure(figsize=(10,8))

# 제때 대출을 상환하는 고객의 나이 plot (TARGET=0)
sns.kdeplot(app_train.loc[app_train['TARGET']==0,'DAYS_BIRTH']/365,label='target==0')

# 제때 대출을 상환하지못하는 고객의 나이 plot (TARGET=1)
sns.kdeplot(app_train.loc[app_train['TARGET']==1,'DAYS_BIRTH']/365,label='target==1')

plt.xlabel('Age(years)');
plt.ylabel('Density');
plt.title('Distribution of Ages')

## 제때 대출을 상환하지 못하는 고객은 30대에 가장 많다는 것을 알 수 있다.

#### Feature Engineering
##### apps 전체 데이터에 대해서

##### EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3 가공
##### AMT_CREDIT 비율로 가공
##### AMT_INCOME_TOTAL 비율로 가공
##### DAYS_BIRTH, DAYS_EMPLOYED 비율로 가공

In [None]:
apps.info()

In [None]:
def get_apps_processed(apps):
    """Return a copy of ``apps`` extended with engineered application features.

    The input frame is not modified; all new ``APPS_*`` columns are added to a
    copy, which is returned.

    Added columns:
        * APPS_EXT_SOURCE_MEAN / APPS_EXT_SOURCE_STD: row-wise mean and standard
          deviation of EXT_SOURCE_1..3 (missing std values are filled with the
          column-wide mean of the std).
        * Ratios of AMT_ANNUITY and AMT_GOODS_PRICE to AMT_CREDIT.
        * Ratios of AMT_ANNUITY, AMT_CREDIT and AMT_GOODS_PRICE to
          AMT_INCOME_TOTAL, and AMT_INCOME_TOTAL per CNT_FAM_MEMBERS.
        * Ratios combining DAYS_EMPLOYED, DAYS_BIRTH, AMT_INCOME_TOTAL and
          OWN_CAR_AGE.

    Args:
        apps: DataFrame containing the source columns listed above.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    # Work on a copy so the caller's frame keeps its original columns and
    # re-running the calling cell always starts from the same input.
    apps = apps.copy()

    # EXT_SOURCE_X features
    ext_sources = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
    apps['APPS_EXT_SOURCE_MEAN'] = ext_sources.mean(axis=1)
    ext_source_std = ext_sources.std(axis=1)
    # std is NaN when fewer than two sources are present; fall back to the overall mean std.
    apps['APPS_EXT_SOURCE_STD'] = ext_source_std.fillna(ext_source_std.mean())

    # Ratios relative to AMT_CREDIT
    apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY'] / apps['AMT_CREDIT']
    apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE'] / apps['AMT_CREDIT']

    # Loan-amount ratios relative to AMT_INCOME_TOTAL
    apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['CNT_FAM_MEMBERS']

    # Ratios built from DAYS_BIRTH and DAYS_EMPLOYED
    apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED'] / apps['DAYS_BIRTH']
    apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['DAYS_EMPLOYED']
    apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']

    return apps

#### previous_application data와 JOIN
##### SK_ID_CURR로 MERGE

In [None]:
prev_app = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
prev_app.shape

#### TARGET유형에 따라 숫자형 피처들의 histogram 확인

In [None]:
# Inner-join prev_app with app_train: TARGET exists only in application_train (app_train),
# so the join is what allows previous-application features to be examined per TARGET value.
# Only the ['SK_ID_CURR', 'TARGET'] columns are taken from app_train, and SK_ID_CURR is the join key.
# how='inner' keeps only previous applications whose customer has a TARGET label; a left join
# would also keep rows without a matching app_train customer, with TARGET = NaN.
app_prev_target = prev_app.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='inner')
app_prev_target.shape

In [None]:
def show_hist_by_target(df, columns):
    """Plot the distribution of each column split by TARGET.

    NOTE: a function with this name is already defined in an earlier cell of the
    notebook; this definition replaces it from this cell onwards.

    For every name in ``columns`` a 1x2 figure is drawn: a violin plot of the
    column grouped by ``TARGET`` (left panel) and overlaid distribution plots
    of the TARGET == 0 rows (blue) and TARGET == 1 rows (red) (right panel).

    Args:
        df: DataFrame holding a ``TARGET`` column and every column in ``columns``.
        columns: iterable of column names to plot.
    """
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    for column in columns:
        fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
        sns.violinplot(x='TARGET', y=column, data=df, ax=axs[0][0])
        # .loc picks rows and column in a single step instead of chained indexing.
        sns.distplot(df.loc[cond_0, column], ax=axs[0][1], label='0', color='blue')
        sns.distplot(df.loc[cond_1, column], ax=axs[0][1], label='1', color='red')
        # The label= arguments are only rendered once a legend is requested.
        axs[0][1].legend(title='TARGET')

In [None]:
# 숫자형 dtype 갖고 있는 컬럼만 추출
num_columns = app_prev_target.dtypes[app_prev_target.dtypes != 'object'].index.tolist()
num_columns = [column for column in num_columns if column not in ['SK_ID_PREV', 'SK_ID_CURR', 'TARGET']]
num_columns

In [None]:
show_hist_by_target(app_prev_target, num_columns)

In [None]:
app_prev_target.TARGET.value_counts()

In [None]:
 # AMT_ANNUITY, AMT_CREDIT, AMT_APPLICATION, AMT_CREDIT는 TARGET=1일 경우에 소액 비율이 약간 높음(큰 차이는 아님)

# Per-TARGET mean / median / count of each amount column, printed one table at a time.
for amount_column in ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT']:
    print(app_prev_target.groupby('TARGET').agg({amount_column: ['mean', 'median', 'count']}))

#### TARGET 유형에 따라 Category 피처들의 Histogram을 비교

In [None]:
object_columns = app_prev_target.dtypes[app_prev_target.dtypes=='object'].index.tolist()
object_columns

In [None]:
show_category_by_target(app_prev_target, object_columns)


## prev_app data를 이용한 Feature Engineering

In [None]:
prev_app.groupby('SK_ID_CURR')

In [None]:
# Previous-application features: differences and ratios of the other amount columns
# relative to the requested amount (AMT_APPLICATION). Columns are added in the same
# order as before: two differences first, then the three ratios.
requested_amount = prev_app['AMT_APPLICATION']
prev_app['PREV_CREDIT_DIFF'] = requested_amount - prev_app['AMT_CREDIT']
prev_app['PREV_GOODS_DIFF'] = requested_amount - prev_app['AMT_GOODS_PRICE']
for source_column, ratio_column in [
    ('AMT_CREDIT', 'PREV_CREDIT_APPL_RATIO'),
    ('AMT_ANNUITY', 'PREV_ANNUITY_APPL_RATIO'),
    ('AMT_GOODS_PRICE', 'PREV_GOODS_APPL_RATIO'),
]:
    prev_app[ratio_column] = prev_app[source_column] / requested_amount

In [None]:
# Convert the value 365243 in the DAYS_* columns to NaN.
# The result is assigned back to prev_app instead of calling replace(..., inplace=True) on a
# selected column: that is chained assignment, which is not guaranteed to modify prev_app
# (and does not under pandas copy-on-write).
days_columns = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev_app[days_columns] = prev_app[days_columns].replace(365243, np.nan)
# Difference between the first-version last due date and the last due date.
prev_app['PREV_DAYS_LAST_DUE_DIFF'] = prev_app['DAYS_LAST_DUE_1ST_VERSION'] - prev_app['DAYS_LAST_DUE']

In [None]:
# all_pay = AMT_ANNUITY * CNT_PAYMENT (presumably the total amount repaid over the loan — confirm).
all_pay = prev_app['AMT_ANNUITY'] * prev_app['CNT_PAYMENT']
# Interest-rate proxy: (all_pay / AMT_CREDIT - 1) divided by CNT_PAYMENT.
# NOTE(review): rows where AMT_CREDIT or CNT_PAYMENT is 0 would yield inf/NaN here — verify against the data.
prev_app['PREV_INTERESTS_RATE'] = (all_pay/prev_app['AMT_CREDIT'] - 1)/prev_app['CNT_PAYMENT']

In [None]:
prev_app.iloc[:, -7:].head(10)

In [None]:
# Aggregations applied per customer (SK_ID_CURR) to the previous-application rows.
agg_dict = {
     # Original columns.
    'SK_ID_CURR':['count'],
    'AMT_CREDIT':['mean', 'max', 'sum'],
    'AMT_ANNUITY':['mean', 'max', 'sum'], 
    'AMT_APPLICATION':['mean', 'max', 'sum'],
    'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
    'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    # Engineered columns.
    'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
    'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
    'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
    'PREV_GOODS_APPL_RATIO':['mean', 'max'],
    'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
    'PREV_INTERESTS_RATE':['mean', 'max']
}

# The result has SK_ID_CURR as its index and (column, aggregation) pairs as columns.
prev_group = prev_app.groupby('SK_ID_CURR')
prev_amt_agg = prev_group.agg(agg_dict)
prev_amt_agg.head()

In [None]:
# Flatten the (column, aggregation) MultiIndex into names such as PREV_AMT_CREDIT_MEAN.
# Iterating the MultiIndex directly yields the tuples, so no .ravel() call is needed.
# The isinstance guard makes the cell safe to re-run: once the names are flat strings,
# '_'.join(name) would otherwise join their individual characters.
if isinstance(prev_amt_agg.columns, pd.MultiIndex):
    prev_amt_agg.columns = ['PREV_' + '_'.join(column).upper() for column in prev_amt_agg.columns]
prev_amt_agg.head()

In [None]:
prev_amt_agg.shape

#### NAME_CONTRACT_STATUS = Refused 일 때

In [None]:
cond_refused = (prev_app['NAME_CONTRACT_STATUS'] == 'Refused')
prev_refused = prev_app[cond_refused]
prev_refused.shape, prev_app.shape

In [None]:
# Number of refused previous applications per customer.
prev_refused_agg = prev_refused.groupby('SK_ID_CURR')['SK_ID_CURR'].count()
prev_refused_agg = prev_refused_agg.reset_index(name='PREV_REFUSED_COUNT')
# Move SK_ID_CURR from the index into a column exactly once, so that re-running this
# cell does not add a stray 'index' column to prev_amt_agg.
if 'SK_ID_CURR' not in prev_amt_agg.columns:
    prev_amt_agg = prev_amt_agg.reset_index()

# Left join: customers without any refused application get NaN in PREV_REFUSED_COUNT.
prev_amt_refused_agg = prev_amt_agg.merge(prev_refused_agg, on='SK_ID_CURR', how='left')
prev_amt_refused_agg['PREV_REFUSED_COUNT'].value_counts(dropna=False)

In [None]:
# A missing refused count means the customer has no refused application, so set it to 0.
# Only the count column is filled: a frame-wide fillna(0) would also overwrite missing
# values in the unrelated amount aggregates with 0.
prev_amt_refused_agg['PREV_REFUSED_COUNT'] = prev_amt_refused_agg['PREV_REFUSED_COUNT'].fillna(0)
# Share of refused applications among all previous applications of each SK_ID_CURR.
prev_amt_refused_agg['PREV_REFUSE_RATIO'] = prev_amt_refused_agg['PREV_REFUSED_COUNT'] / prev_amt_refused_agg['PREV_SK_ID_CURR_COUNT']

In [None]:
# 세부 레벨 groupby 와 unstack()을 이용하여 Group by Case 구현 >> 세부 조건이 2개 이상일때
prev_refused_appr_group = prev_app[prev_app['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()

In [None]:
# Fill missing status counts with 0, rename the columns, and turn the index back into a column.
# NOTE(review): the new names are assigned by position, which assumes the unstacked columns
# are exactly the two statuses in the order Approved, Refused — confirm.
prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']
prev_refused_appr_agg = prev_refused_appr_agg.reset_index()

In [None]:
prev_amt_agg.head()

In [None]:
# Join the amount aggregates with the approved/refused counts on SK_ID_CURR.
prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')

# Customers whose previous applications are neither Approved nor Refused have no row in
# prev_refused_appr_agg, so the left join leaves their counts as NaN; those are true zeros.
count_columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']
prev_agg[count_columns] = prev_agg[count_columns].fillna(0)

# Approved / refused share of each customer's previous applications.
prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']

# The raw counts are no longer needed once the ratios exist.
prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)

prev_agg.head(30)

#### 가공된 최종 데이터

In [None]:
# Rebuild the combined frame from app_train / app_test, because the earlier `apps` had its
# object columns converted to numeric codes.
apps = pd.concat([app_train, app_test])
# An earlier EDA cell overwrote app_train['DAYS_BIRTH'] with its absolute value while app_test
# was left untouched, so the two halves would otherwise carry opposite signs. Force a single
# convention (non-positive day counts) on every row so that DAYS_BIRTH and the ratios derived
# from it mean the same thing at training and prediction time.
apps['DAYS_BIRTH'] = -apps['DAYS_BIRTH'].abs()
apps.info()

In [None]:
apps_all =  get_apps_processed(apps)
apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
apps_all.info()

In [None]:
# Label-encode every object-dtype column of apps_all in place; only the integer codes
# returned by pd.factorize are kept.
column_dtypes = apps_all.dtypes
object_columns = column_dtypes[column_dtypes == 'object'].index.tolist()
for column in object_columns:
    codes, _uniques = pd.factorize(apps_all[column])
    apps_all[column] = codes

apps_all.info()

In [None]:
apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
apps_all_test = apps_all[apps_all['TARGET'].isnull()]

apps_all_test = apps_all_test.drop('TARGET', axis=1)

In [None]:
apps_all_train.shape, apps_all_test.shape

In [None]:
from sklearn.model_selection import train_test_split

ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']

train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
from lightgbm import LGBMClassifier

# Same LightGBM configuration as the baseline, now fitted on the features that include the
# engineered application columns and the previous-application aggregates.
# NOTE(review): `subsample` only takes effect in LightGBM when `subsample_freq` is > 0 (not set here),
# and `silent` / fit(verbose=, early_stopping_rounds=) look like the pre-4.0 API — confirm the version.
clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        silent=-1,
        verbose=-1
        )

# AUC is evaluated on both the training and the validation split (eval_set holds both).
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
        early_stopping_rounds= 50)

In [None]:
from lightgbm import plot_importance

plot_importance(clf, figsize=(16, 32))

In [None]:
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

In [None]:
bureau.shape, bureau_bal.shape

#### bureau데이터의 EDA

In [None]:
# Bring TARGET onto the bureau records by inner-joining bureau with app_train on SK_ID_CURR.
target_by_customer = app_train[['SK_ID_CURR', 'TARGET']]
app_bureau = bureau.merge(target_by_customer, on='SK_ID_CURR', how='inner')
app_bureau.shape

In [None]:
# TARGET balance in app_train: pie chart of shares (left) and bar chart of counts (right).
f,ax=plt.subplots(1,2,figsize=(12,6))
app_train.TARGET.value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('Distribution of Target')
ax[0].set_ylabel('')
# x is passed by keyword (as in the STATUS count plot later in this notebook): recent seaborn
# releases accept only `data` positionally, so a positional column name fails there.
sns.countplot(x='TARGET',data=app_train,ax=ax[1])
ax[1].set_title('Target count')
plt.show()


In [None]:
# Null 값 확인 가능한 사용자 함수 (데이터에서 비율)

def nulldata(data):
    """Summarise and plot the missing values of ``data`` per column.

    Args:
        data: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns ``Total`` (number of
        nulls) and ``Percent`` (nulls as a percentage of the rows), restricted
        to columns that contain at least one null. When no column has nulls the
        empty summary is returned and no figure is drawn.
    """
    null_counts = data.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / data.isnull().count() * 100).sort_values(ascending=False)
    ms = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    ms = ms[ms["Percent"] > 0]
    if ms.empty:
        # Nothing to plot; avoid drawing an empty bar chart.
        return ms

    fig, ax = plt.subplots(figsize=(15, 10))
    # x / y are passed by keyword: recent seaborn releases accept only `data` positionally.
    sns.barplot(x=ms.index, y=ms["Percent"], color="green", alpha=0.8, ax=ax)
    # Rotate the labels after plotting and with a numeric angle (a string such as '90'
    # is not a valid rotation for matplotlib text).
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Features', fontsize=15)
    ax.set_ylabel('Percent of null values', fontsize=15)
    ax.set_title('Percent null data by feature', fontsize=15)
    return ms
nulldata(bureau)

In [None]:
# Helper that separates numerical features from categorical (object-dtype) features.
# It works on any DataFrame, so it can be reused elsewhere (e.g. in the final project).

def type_features(data):
    """Print the object-dtype and the non-object-dtype column names of ``data``.

    The categorical (object) columns are printed first, then a separator line,
    then the remaining columns. Nothing is returned.
    """
    object_dtypes = ["object"]
    categorical_features = data.select_dtypes(include=object_dtypes).columns
    numerical_features = data.select_dtypes(exclude=object_dtypes).columns
    separator = '-----' * 40
    print("categorical_features :", categorical_features)
    print(separator)
    print("numerical_features:", numerical_features)

type_features(bureau)

In [None]:
# 시각화 함수
def plot_re(df,t1='',t2=''):
    """Plot how often each value of ``t1`` occurs, overall and split by ``t2``.

    Left panel: bar chart of the number of ``t2`` entries per ``t1`` value.
    Right panel: seaborn count plot of ``t1`` with ``t2`` as hue.

    Args:
        df: DataFrame containing both columns.
        t1: name of the column placed on the x axis.
        t2: name of the column used for counting (left) and as hue (right).

    Returns:
        The return value of ``plt.show()``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 8))
    df[[t1, t2]].groupby([t1]).count().plot.bar(ax=ax[0], color='Blue')
    ax[0].set_title('count of customer on '+t1)
    # x is passed by keyword: recent seaborn releases accept only `data` positionally.
    sns.countplot(x=t1, hue=t2, data=df, ax=ax[1], palette="spring")
    ax[1].set_title(t1+': Target 0 vs Target 1')
    # Rotate the x labels of the current (right-hand) axes.
    plt.xticks(rotation=-90)
    return plt.show()

plot_re(app_bureau,'CREDIT_ACTIVE','TARGET')


#### Numerical Feature 시각화

In [None]:
# 2x3 grid of distribution plots for numeric bureau columns. Each panel title names the
# column it actually shows (three panels previously carried an empty or copy-pasted title).
f, ax = plt.subplots(2,3,figsize=(13, 10))

# DAYS_CREDIT: how long before the current application the past credit was applied for
sns.distplot(app_bureau.DAYS_CREDIT.dropna(), kde=True, color="g",
             ax=ax[0,0]).set_title('DAYS CREDIT Distribution')

# CREDIT_DAY_OVERDUE: days the bureau credit was overdue at the time of the application
sns.distplot(app_bureau.CREDIT_DAY_OVERDUE.dropna(), kde=True, color="b",
             ax=ax[0,1]).set_title(' CREDIT DAY OVERDUE Distribution')

# DAYS_CREDIT_UPDATE: how long before the application the last bureau information arrived
sns.distplot(bureau.DAYS_CREDIT_UPDATE.dropna(), kde=True,
             color="r", ax=ax[0,2]).set_title('DAYS CREDIT UPDATE Distribution')

# AMT_CREDIT_SUM_LIMIT: current credit-card limit
sns.distplot(bureau.AMT_CREDIT_SUM_LIMIT.dropna(), kde=True, color="g",
             ax=ax[1,0]).set_title('AMT CREDIT SUM LIMIT Distribution')

# AMT_CREDIT_SUM_DEBT: current total debt
sns.distplot(bureau.AMT_CREDIT_SUM_DEBT.dropna(), kde=True, color="b",
             ax=ax[1,1]).set_title('AMT CREDIT SUM DEBT Distribution')

# AMT_CREDIT_SUM_OVERDUE: overdue amount
sns.distplot(bureau.AMT_CREDIT_SUM_OVERDUE.dropna(), kde=True,
             color="r", ax=ax[1,2]).set_title('AMT CREDIT SUM OVERDUE Distribution')

In [None]:
# 신용(크레딧)이 몇 번 연장을 했는지, 연장되었다면 어떤 대출 type 인지 알아보는 시각화

# CNT_CREDIT_PROLONG 신용 연장 횟수
# CREDIT_TYPE 대출 유형

sns.stripplot(x="CNT_CREDIT_PROLONG", y="CREDIT_TYPE", data=app_bureau)
plt.show()

In [None]:
# Heatmap of the columns whose absolute correlation with TARGET is at least 0.03.
# The correlation matrix is computed on the numeric columns only: app_bureau presumably also
# holds object columns (e.g. CREDIT_TYPE), and DataFrame.corr() raises on non-numeric data
# in recent pandas versions instead of silently skipping it.
corrmat = app_bureau.select_dtypes(include='number').corr()
top_corr_features = corrmat.index[abs(corrmat["TARGET"])>=0.03]
plt.figure(figsize=(12,8))
g = sns.heatmap(app_bureau[top_corr_features].corr(),annot=True,cmap="Oranges")

#### bureau_balance 데이터 EDA

In [None]:
type_features(bureau_bal)

In [None]:
total = bureau_bal.isnull().sum().sort_values(ascending = False)
percent = (bureau_bal.isnull().sum()/bureau_bal.isnull().count()*100).sort_values(ascending = False)
ms=pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
ms= ms[ms["Percent"] > 0]
ms

In [None]:
f, ax = plt.subplots(figsize=(7,5))

# Types of colors
color_types = ['#78C850','#F08030','#6890F0','#A8B820','#A8A878','#A040A0','#F8D030',  
                '#E0C068','#EE99AC','#C03028','#F85888','#B8A038','#705898','#98D8D8','#7038F8']

# Count Plot 
sns.countplot(x='STATUS', data=bureau_bal,palette=color_types).set_title('count based on status type')

#### Bureau Feature Engineering

In [None]:
# Attach SK_ID_CURR to every bureau_balance row by joining bureau on SK_ID_BUREAU.
# The guard keeps the cell idempotent: merging a second time would produce
# SK_ID_CURR_x / SK_ID_CURR_y columns instead of a single SK_ID_CURR.
if 'SK_ID_CURR' not in bureau_bal.columns:
    bureau_bal = bureau_bal.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], on='SK_ID_BUREAU', how='left')
bureau_bal.shape

In [None]:
# Number of past bureau loans per customer: count of non-null DAYS_CREDIT values per SK_ID_CURR.
# bureau_fe1 starts out as another name for bureau (no copy is made); the merge below
# rebinds bureau_fe1 to a new frame that carries the extra BUREAU_LOAN_COUNT column.
bureau_fe1= bureau
PAST_LOANS_PER_CUS= bureau_fe1[['SK_ID_CURR', 'DAYS_CREDIT']].groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT'].count().reset_index().rename(index=str, columns={'DAYS_CREDIT': 'BUREAU_LOAN_COUNT'})
bureau_fe1 = bureau_fe1.merge(PAST_LOANS_PER_CUS, on = ['SK_ID_CURR'], how = 'left')
# Both shapes are printed to compare the frame before and after the merge.
print(bureau.shape)
print(bureau_fe1.shape)

In [None]:
# Number of distinct past loan types per customer: nunique of CREDIT_TYPE per SK_ID_CURR,
# merged back onto every bureau row of that customer as BUREAU_LOAN_TYPES.
BUREAU_LOAN_TYPES = bureau_fe1[['SK_ID_CURR', 'CREDIT_TYPE']].groupby(by = ['SK_ID_CURR'])['CREDIT_TYPE'].nunique().reset_index().rename(index=str, columns={'CREDIT_TYPE': 'BUREAU_LOAN_TYPES'})
bureau_fe1 = bureau_fe1.merge(BUREAU_LOAN_TYPES, on = ['SK_ID_CURR'], how = 'left')
print(bureau_fe1.shape)

In [None]:
# 고객 당 유형별 평균 과거 대출 수 = 고객 당 과거 대출 수 / 고객 당 과거 대출 유형 수
# --> 고객이 다양한 유형의 대출을 받거나 단일 유형의 대출에 집중하고 있는지 확인가능

bureau_fe1['AVERAGE_LOAN_TYPE'] = bureau_fe1['BUREAU_LOAN_COUNT']/bureau_fe1['BUREAU_LOAN_TYPES']

In [None]:
# 필요없는 컬럼은 삭제

del bureau_fe1['BUREAU_LOAN_COUNT'], bureau_fe1['BUREAU_LOAN_TYPES']

In [None]:
# Share of each customer's bureau credits that are not closed.

def f(x):
    """Return 0 when the credit status ``x`` equals 'Closed' and 1 for any other value."""
    return 0 if x == 'Closed' else 1

# Vectorised form of applying f to CREDIT_ACTIVE on every row: 0 for 'Closed', 1 otherwise
# (every status other than 'Closed', and missing values, map to 1). This avoids a
# Python-level call per row through DataFrame.apply(axis=1).
bureau_fe1['CREDIT_ACTIVE_CLOSED'] = (bureau_fe1['CREDIT_ACTIVE'] != 'Closed').astype(int)

# Mean of the 0/1 flag per customer = fraction of that customer's credits that are not closed.
grp = bureau_fe1.groupby(by = ['SK_ID_CURR'])['CREDIT_ACTIVE_CLOSED'].mean().reset_index().rename(index=str, columns={'CREDIT_ACTIVE_CLOSED':'ACTIVE_LOANS_PERCENTAGE'})
bureau_fe1= bureau_fe1.merge(grp, on = ['SK_ID_CURR'], how = 'left')
# The helper flag is no longer needed once the per-customer share has been merged in.
del bureau_fe1['CREDIT_ACTIVE_CLOSED']
print(bureau_fe1.shape)

In [None]:
# Bureau features built from the credit end dates and from debt relative to the credit amount.

# Differences between the DAYS_CREDIT, DAYS_CREDIT_ENDDATE and DAYS_ENDDATE_FACT columns
# (presumably credit start, scheduled end and actual end — confirm against the data dictionary).
bureau_fe1['BUREAU_ENDDATE_FACT_DIFF'] = bureau_fe1['DAYS_CREDIT_ENDDATE'] - bureau_fe1['DAYS_ENDDATE_FACT']
bureau_fe1['BUREAU_CREDIT_FACT_DIFF'] = bureau_fe1['DAYS_CREDIT'] - bureau_fe1['DAYS_ENDDATE_FACT']
bureau_fe1['BUREAU_CREDIT_ENDDATE_DIFF'] = bureau_fe1['DAYS_CREDIT'] - bureau_fe1['DAYS_CREDIT_ENDDATE']

# Ratio and difference of AMT_CREDIT_SUM_DEBT relative to AMT_CREDIT_SUM.
# NOTE(review): rows with AMT_CREDIT_SUM == 0 would give inf/NaN in the ratio — verify against the data.
bureau_fe1['BUREAU_CREDIT_DEBT_RATIO'] = bureau_fe1['AMT_CREDIT_SUM_DEBT'] / bureau_fe1['AMT_CREDIT_SUM']
bureau_fe1['BUREAU_CREDIT_DEBT_DIFF'] = bureau_fe1['AMT_CREDIT_SUM_DEBT'] - bureau_fe1['AMT_CREDIT_SUM']

In [None]:
# Average number of credit prolongations per customer.
# Missing CNT_CREDIT_PROLONG values are treated as 0 before the per-customer mean is taken.
bureau_fe1['CNT_CREDIT_PROLONG'] = bureau_fe1['CNT_CREDIT_PROLONG'].fillna(0)
grp = bureau_fe1[['SK_ID_CURR', 'CNT_CREDIT_PROLONG']].groupby(by = ['SK_ID_CURR'])['CNT_CREDIT_PROLONG'].mean().reset_index().rename(index = str, columns = { 'CNT_CREDIT_PROLONG':'AVG_CREDITDAYS_PROLONGED'})
# Merge the per-customer mean back onto every bureau row of that customer.
bureau_fe1 = bureau_fe1.merge(grp, on = ['SK_ID_CURR'], how = 'left')
print(bureau_fe1.shape)

#### 학습된 Classifier를 이용하여 테스트 데이터 예측하고 결과를 Kaggle로 Submit 수행

In [None]:
# Score the test applications with the trained classifier and write the submission file.
# SK_ID_CURR is dropped because it was not a training feature. TARGET is dropped as well when
# present (errors='ignore'), so the cell can be re-run after the predictions were stored on
# apps_all_test without feeding an extra column to the model.
test_features = apps_all_test.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')
# Column 1 of predict_proba is the predicted probability of the second class (TARGET = 1).
preds = clf.predict_proba(test_features)[:, 1]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_baseline_03.csv', index=False)