#### 라이브러리 설치 등 기본 설정

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from lightgbm import plot_importance
from lightgbm import LGBMClassifier

%matplotlib inline

import warnings
# Suppress every warning for the rest of the session to keep cell output
# readable. NOTE: this also hides deprecation notices raised by the libraries
# used below.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Display limits: show up to 100 rows and 200 columns when printing frames.

pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

#### 데이터 불러오기

In [None]:
# Application data (train / test).

app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# Stack train and test so preprocessing is applied to both at once.
# ignore_index=True gives the combined frame a unique 0..N-1 index; without it
# the test rows reuse the train rows' labels, and any later label-based
# alignment on `apps` would run into duplicate labels.
apps = pd.concat([app_train, app_test], ignore_index=True)

# Previous-application data.

prev_app = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')

# Credit-bureau data.

bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

#### EDA; application

In [None]:
# Shape, column summary and first rows of the training application data.
print('training : ', app_train.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than wrapped in print(), which printed a stray "None".
app_train.info()
print('\n')

app_train.head()

In [None]:
# Shape, column summary and first rows of the test application data.
print('test : ', app_test.shape, '\n')

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than wrapped in print(), which printed a stray "None".
app_test.info()
print('\n')

app_test.head()

In [None]:
# List the column names of the training data as an array.

app_train.columns.to_numpy()

In [None]:
# Count missing values per column.

app_train.isna().sum()

In [None]:
# Inspect the TARGET values.
## The counts of 0 and 1 differ markedly.
## 0: able to repay, 1: has difficulty repaying.

target = app_train['TARGET']
print(target.value_counts(), '\n')

target.astype(int).plot.hist()

In [None]:
# Income histogram for the TARGET == 1 rows.
## Author's note: over the full range the values pile up on one side;
## narrowing the income range gives a more even-looking distribution.

app_train.loc[app_train['TARGET'] == 1, 'AMT_INCOME_TOTAL'].hist()

In [None]:
# Distribution of the real-estate ownership flag for the TARGET == 0 rows.

app_train.loc[app_train['TARGET'] == 0, 'FLAG_OWN_REALTY'].hist()

In [None]:
## Author's observation: for both TARGET == 0 and TARGET == 1, owning real
## estate is the more common case.

realty = app_train['FLAG_OWN_REALTY']
repaid = app_train['TARGET'] == 0
defaulted = app_train['TARGET'] == 1

# Overall share of each FLAG_OWN_REALTY value.
print(realty.value_counts() / app_train.shape[0], '\n')

# Shares within each TARGET group: divide by the group's own row count, not
# the full frame's, so the two printouts are within-group proportions and can
# be compared with each other.
print(realty[defaulted].value_counts() / defaulted.sum(), '\n')

print(realty[repaid].value_counts() / repaid.sum(), '\n')

In [None]:
def show_hist_by_target(df, columns):
    """Plot each column's distribution split by the ``TARGET`` column.

    For every name in ``columns`` a new 1x2 figure is created: a violin plot
    of the column grouped by ``TARGET`` on the left, and the overlaid
    distributions of the ``TARGET == 1`` rows (red) and ``TARGET == 0`` rows
    (blue) on the right.

    Args:
        df: DataFrame holding a ``TARGET`` column and every listed column.
        columns: Iterable of column names to plot.
    """
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    for column in columns:
        fig, ax = plt.subplots(figsize=(12, 4), nrows=1, ncols=2, squeeze=False)
        sns.violinplot(x='TARGET', y=column, data=df, ax=ax[0][0])
        # Select just the plotted column for each group instead of filtering
        # the whole frame once per column.
        sns.distplot(df.loc[cond_1, column], label='1', color='red', ax=ax[0][1])
        sns.distplot(df.loc[cond_0, column], label='0', color='blue', ax=ax[0][1])
        # The labels above only become visible once a legend is drawn.
        ax[0][1].legend()

In [None]:
def show_category_by_target(df, columns):
    """Draw count plots of each categorical column, one panel per ``TARGET`` value.

    Args:
        df: DataFrame holding a ``TARGET`` column and every listed column.
        columns: Iterable of categorical column names to plot.
    """
    for category_col in columns:
        grid = sns.catplot(data=df, x=category_col, col="TARGET", kind="count")
        # Rotate the tick labels so long category names stay readable.
        grid.set_xticklabels(rotation=65)

In [None]:
# Visualise the correlation coefficients between TARGET, the EXT_SOURCE
# scores and DAYS_BIRTH.

ext_data = app_train.loc[:, ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()

plt.figure(figsize=(8, 6))

sns.heatmap(
    ext_data_corrs,
    annot=True,
    cmap=plt.cm.RdYlBu_r,
    vmin=-0.25,
    vmax=0.6,
)
plt.title('Correlation Heatmap');

## Author's reading of the plot: DAYS_BIRTH shows at least some correlation
## with TARGET.

In [None]:
# Inspect DAYS_BIRTH.

plt.style.use('fivethirtyeight')

# Histogram of client age in years. abs() keeps the x-axis positive even if
# DAYS_BIRTH is stored as a negative day offset.
# NOTE(review): presumably DAYS_BIRTH counts days back from the application
# date (negative values) - confirm against the data dictionary.
plt.hist(app_train['DAYS_BIRTH'].abs() / 365, edgecolor='k', bins=25)
plt.title('Age of Client');
plt.xlabel('Age (years)');
plt.ylabel('Count');

In [None]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10,8))

# Age in years; abs() keeps it positive even if DAYS_BIRTH is stored as a
# negative day offset (NOTE(review): confirm the sign convention).
age_years = app_train['DAYS_BIRTH'].abs() / 365

# Age density of clients who repaid on time (TARGET == 0).
sns.kdeplot(age_years[app_train['TARGET'] == 0], label='target==0')

# Age density of clients who did not repay on time (TARGET == 1).
sns.kdeplot(age_years[app_train['TARGET'] == 1], label='target==1')

plt.xlabel('Age(years)');
plt.ylabel('Density');
plt.title('Distribution of Ages');
# Draw the legend explicitly so the two curves are identified regardless of
# whether the plotting call adds one on its own.
plt.legend();

## Author's observation: older clients look more likely to repay on time,
## younger clients less likely.

In [None]:
# Inspect the EXT_SOURCE scores: one KDE panel per score, split by TARGET.

plt.figure(figsize=(10,12))

for i, source in enumerate(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']):
    plt.subplot(3,1,i+1)

    sns.kdeplot(app_train.loc[app_train['TARGET']==0,source],label='target==0')
    sns.kdeplot(app_train.loc[app_train['TARGET']==1,source],label='target==1')

    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' %source);
    plt.ylabel('Density');
    # Draw the legend explicitly so the two curves are identified regardless
    # of whether the plotting call adds one on its own.
    plt.legend()
plt.tight_layout(h_pad=2.5)

In [None]:
# Share of each gender overall and within each TARGET group.
## Author's observation: relative to the number of loans, the overdue share
## is higher for men than for women.

con_1 = app_train['TARGET'] == 1
con_0 = app_train['TARGET'] == 0

gender = app_train['CODE_GENDER']

print(gender.value_counts() / len(app_train))
print('\n연체\n', gender[con_1].value_counts() / con_1.sum())
print('\n연체 X\n', gender[con_0].value_counts() / con_0.sum())

In [None]:
# Important columns holding continuous-valued variables, plotted by TARGET.

columns = [
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'DAYS_REGISTRATION',
    'DAYS_LAST_PHONE_CHANGE',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
]

show_hist_by_target(app_train, columns)

#### 데이터 processing

In [None]:
# Dimensions (rows, columns) of the combined train + test frame.
apps.shape

In [None]:
# TARGET value counts in the combined frame, including missing values.
apps.loc[:, 'TARGET'].value_counts(dropna=False)

In [None]:
# Column summary of the combined train + test frame.
apps.info()

In [None]:
# Column labels of the combined frame (the index of its dtypes Series).
apps.columns

#### object feature label encoding
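- 범주 값이 2개인 변수 -> label encoding
- 범주 값이 3개 이상인 변수 -> one-hot encoding
- (참고) 아래 코드는 모든 object 컬럼에 `pd.factorize`로 label encoding만 적용하며, one-hot encoding은 적용하지 않음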

In [None]:
# Names of all object-dtype columns in the combined frame.

object_col = [name for name, dtype in apps.dtypes.items() if dtype == 'object']
object_col

In [None]:
# Label-encode the gender feature into integer codes.

gender_codes, _gender_uniques = pd.factorize(apps['CODE_GENDER'])
apps['CODE_GENDER'] = gender_codes
apps['CODE_GENDER']

In [None]:
# Label-encode every object-typed column with pd.factorize.
# Columns that are no longer object-typed (one already encoded in an earlier
# cell, or this cell being run a second time) are skipped: factorizing integer
# codes again would treat factorize's -1 missing-value code as an ordinary
# value and renumber it.

for col in object_col:
    if apps[col].dtype == 'object':
        apps[col] = pd.factorize(apps[col])[0]

In [None]:
# Re-check the column summary after encoding.
# Author's note: the object-dtype columns are gone, all converted to int64.

apps.info()

#### EDA; previous data

In [None]:
# Attach TARGET from the training applications to each previous-application
# row, joining on SK_ID_CURR (left join keeps every previous-application row).

app_prev_target = pd.merge(
    prev_app,
    app_train[['SK_ID_CURR', 'TARGET']],
    how='left',
    on='SK_ID_CURR',
)
app_prev_target.shape

In [None]:
# Distribution of the numeric columns of the merged data, split by TARGET.

# TARGET is the grouping variable itself (constant within each group) and the
# ID columns are identifiers rather than measurements, so they are left out of
# the plots. SK_ID_PREV is excluded only if it is present.
excluded_cols = ['SK_ID_PREV', 'SK_ID_CURR', 'TARGET']
num_cols = [
    name
    for name in app_prev_target.dtypes[app_prev_target.dtypes != 'object'].index.tolist()
    if name not in excluded_cols
]

show_hist_by_target(app_prev_target, num_cols)

In [None]:
# Distribution of the categorical (object-dtype) columns of the merged data,
# split by TARGET.

object_cols = [name for name, dtype in app_prev_target.dtypes.items() if dtype == 'object']

show_category_by_target(app_prev_target, object_cols)

## FE; Application

In [None]:
# Peek at the EXT_SOURCE score columns.

apps.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].head()

In [None]:
# Row-wise mean and standard deviation of the three EXT_SOURCE scores.
ext_scores = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
apps['APPS_EXT_SOURCE_MEAN'] = ext_scores.mean(axis=1)
apps['APPS_EXT_SOURCE_STD'] = ext_scores.std(axis=1)

In [None]:
# Replace missing row-wise std values with the column's mean, then preview
# the scores next to the derived features.
ext_std_mean = apps['APPS_EXT_SOURCE_STD'].mean()
apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(ext_std_mean)

apps.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APPS_EXT_SOURCE_MEAN', 'APPS_EXT_SOURCE_STD']].head(10)

In [None]:
# Features relative to AMT_CREDIT (the loan amount).

credit = apps['AMT_CREDIT']
apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY'].div(credit)
apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE'].div(credit)
apps['APPS_CREDIT_GOODS_DIFF'] = credit.sub(apps['AMT_GOODS_PRICE'])

In [None]:
# Loan-amount features expressed relative to AMT_INCOME_TOTAL.

income = apps['AMT_INCOME_TOTAL']
apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY'].div(income)
apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT'].div(income)
apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE'].div(income)

# Income per family member.

apps['APPS_CNT_FAM_INCOME_RATIO'] = income.div(apps['CNT_FAM_MEMBERS'])

In [None]:
# Income / asset features expressed relative to DAYS_BIRTH and DAYS_EMPLOYED.

days_birth = apps['DAYS_BIRTH']
days_employed = apps['DAYS_EMPLOYED']

apps['APPS_EMPLOYED_BIRTH_RATIO'] = days_employed.div(days_birth)
apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL'].div(days_employed)
apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL'].div(days_birth)
apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'].div(days_birth)
apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'].div(days_employed)

In [None]:
# Label-encode any columns that are still object-typed at this point.
object_columns = [name for name, dtype in apps.dtypes.items() if dtype == 'object']

for object_name in object_columns:
    codes, _uniques = pd.factorize(apps[object_name])
    apps[object_name] = codes

In [None]:
# Split the combined frame back into training and test rows: test rows are
# the ones whose TARGET is missing.
# notnull() states the selection directly instead of relying on unary minus
# acting as a logical NOT on a boolean mask.

apps_train = apps[apps['TARGET'].notnull()]
apps_test = apps[apps['TARGET'].isnull()]
apps.shape, apps_train.shape, apps_test.shape

#### 학습 데이터를 검증 데이터로 분리하고 LGBM Classifier로 학습 수행

In [None]:
# Features and labels are both taken from apps_train so they always describe
# the same rows, rather than pairing apps_train features with labels from a
# different frame purely by position.
ftr_app = apps_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
# TARGET has missing values for the test rows in the combined frame, so it is
# presumably stored as float there; cast back to integer class labels.
target_app = apps_train['TARGET'].astype(int)

# Hold out 30% of the training rows for validation (fixed seed for repeatability).
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
# LightGBM classifier configuration: up to 1000 estimators at learning rate
# 0.02, 32 leaves per tree, max depth 12, n_jobs=-1.
# NOTE(review): the `silent` constructor argument and the `verbose` /
# `early_stopping_rounds` fit() keywords look tied to older LightGBM releases
# - confirm they are accepted by the installed version.
clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        silent=-1,
        verbose=-1
        )

# Fit on the training split, evaluating AUC on both the training and the
# validation split; early_stopping_rounds=100 and verbose=100 are passed to
# fit() (presumably: stop after 100 rounds without improvement, log every 100
# rounds - verify against the LightGBM docs).
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
        early_stopping_rounds= 100)

In [None]:
# Plot the feature importances of the fitted classifier on a tall figure.

plot_importance(clf, figsize=(16, 32))

#### 학습된 Classifier를 이용하여 테스트 데이터 예측하고 결과를 Kaggle로 Submit 수행

In [None]:
# Probability of the second class (column 1 of predict_proba) for each test
# row, using the same feature columns as in training.
test_features = apps_test.drop(columns=['SK_ID_CURR', 'TARGET'])
preds = clf.predict_proba(test_features)[:, 1]

In [None]:
# Store the predictions on the test frame and write the ID / prediction pair
# to the submission CSV (without the index column).
app_test['TARGET'] = preds
submission = app_test[['SK_ID_CURR', 'TARGET']]
submission.to_csv('apps_baseline_02.csv', index=False)