In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Libraries
import numpy as np
import pandas as pd
import gc
import time

%matplotlib inline

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


# Raise the pandas display limits (up to 100 rows / 200 columns) so wide frames are not truncated.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

### application-train 과  application_test 데이터 read

In [None]:
# Load the application train and test tables from the competition input directory.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
    )
)

In [None]:
# List every column name of the training table.
app_train.columns.values

In [None]:
# Count the rows in each TARGET class.
app_train['TARGET'].value_counts()

#### 타겟 값이 0 : 대출금 상환 가능, 1 : 대출금 상환 어려움

In [None]:
app_train['TARGET'].astype(int).plot.hist(); # Histogram of the TARGET values (0 and 1)

##### -> 1보다 0이 훨씬 많은 imbalanced data

### Data Exploration

AMT_CREDIT

In [None]:
# Distribution of the credit amount.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider sns.histplot(..., kde=True) — confirm against the installed version.
sns.distplot(app_train['AMT_CREDIT'])

AMT_INCOME_TOTAL

In [None]:
# Box plot of total income. The series is passed explicitly as `x` so the plot is
# horizontal on every seaborn version (seaborn >= 0.12 treats the first positional
# argument as `data` rather than `x`, which changes the orientation).
sns.boxplot(x=app_train['AMT_INCOME_TOTAL'])

##### Target 값에 따른 AMT_INCOME_TOTAL 값 분포도 비교

In [None]:
# x is the target class being compared; y is the income column whose distribution is shown.
# Only rows with AMT_INCOME_TOTAL below 1,000,000 are plotted.
cond_amt = app_train['AMT_INCOME_TOTAL'] < 1000000
sns.violinplot(x='TARGET', y='AMT_INCOME_TOTAL', data=app_train[cond_amt])

AMT_GOODS_PRICE

In [None]:
# Distribution of the goods price.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider sns.histplot(..., kde=True) — confirm against the installed version.
sns.distplot(app_train['AMT_GOODS_PRICE'])

### checking missing data in application_train

In [None]:
# Per-column missing-value summary for app_train: absolute null count and share of
# rows in percent, each ordered from most to least missing; show the top 20 columns.
null_counts = app_train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / app_train.isnull().count() * 100).sort_values(ascending=False)
missing_app_train = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_app_train.head(20)

### Application_train.csv + Application_test.csv 결합

In [None]:
# Row/column counts of the train and test tables before they are combined.
app_train.shape, app_test.shape

In [None]:
# app_train and app_test have to be processed together, so stack them with concat.
apps=pd.concat([app_train, app_test])
apps.shape

In [None]:
# dropna=False keeps NaN as its own bucket, so rows without a TARGET
# (presumably the test rows — verify) are counted too.
apps['TARGET'].value_counts(dropna=False)

### Label Encoding Object Features

In [None]:
# Summary of the combined frame (dtypes and non-null counts) before encoding.
apps.info()

In [None]:
# Column labels of the combined frame (the index of its dtypes Series).
apps.dtypes.index

In [None]:
# Integer-encode every object-dtype column with pd.factorize. Only the integer codes
# are kept (the unique values are discarded); factorize encodes missing values as -1.
object_columns = [col for col, dtype in apps.dtypes.items() if dtype == 'object']

for col in object_columns:
    codes, _uniques = pd.factorize(apps[col])
    apps[col] = codes

In [None]:
# Summary of the combined frame after the object columns were integer-encoded.
apps.info()

#### apps 분리 (train + test)

In [None]:
# Split the combined frame back into train and test rows by the presence of a label.
# A null check on TARGET identifies the unlabeled rows directly. The previous
# blanket fillna(-999) also overwrote every missing *feature* value with a sentinel,
# which distorts statistics computed later and turns the median/mode imputation
# further down into a no-op.
# .copy() gives independent frames so later edits do not act on slices of `apps`.
has_target = apps['TARGET'].notnull()
app_train = apps[has_target].copy()
app_test = apps[~has_target].copy()
app_train.shape, app_test.shape

In [None]:
# Remove the label column from the test rows. Re-binding the name (instead of an
# in-place drop on a slice of `apps`) avoids chained-assignment problems, and
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
app_test = app_test.drop(columns='TARGET', errors='ignore')
app_test.shape

### Feature Importance using LGBM Classifier

In [None]:
# Label vector and feature matrix (row identifier and label removed) for the baseline model.
target_app = app_train['TARGET']
ftr_app = app_train.drop(columns=['SK_ID_CURR', 'TARGET'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Baseline gradient-boosting classifier on the application features.
# NOTE(review): in LightGBM, `subsample` only takes effect when `subsample_freq` > 0 —
# confirm whether row subsampling is actually intended here.
clf = LGBMClassifier(
    n_jobs=-1,
    n_estimators=1000,
    learning_rate=0.02,
    subsample=0.8,
    max_depth=12,
    verbose=-1
)

# Early stopping (50 rounds) and progress logging (every 100 rounds) are supplied as
# callbacks: the `verbose` / `early_stopping_rounds` keyword arguments of fit() and
# the `silent` constructor argument were removed in LightGBM 4.0.
clf.fit(
    train_x, train_y,
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)

In [None]:
from lightgbm import plot_importance

# Plot the feature importances of the fitted baseline model on a 16x32 figure.
plot_importance(clf, figsize=(16,32))

### Correlations of Features

In [None]:
# Correlations for the top four features from the feature-importance result:
# EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3 and DAYS_BIRTH (plus TARGET).
# Missing values may have been replaced by the -999 sentinel upstream; convert them
# back to NaN first, otherwise the sentinel dominates the correlation coefficients.
# DataFrame.corr() excludes NaN values pairwise.
ext_data=app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']].replace(-999, np.nan)
ext_data_corrs=ext_data.corr()
ext_data_corrs

In [None]:
# Annotated heatmap of the correlation matrix; the colour scale is clipped to [-0.25, 0.6].
plt.figure(figsize=(8,6))

sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, vmin=-0.25, annot=True, vmax=0.6)
plt.title('Correlation Heatmap')

#### Using Bureau data

In [None]:
# Recombine the train and test rows. app_test no longer has a TARGET column,
# so concat leaves TARGET as NaN for the test rows.
apps = pd.concat([app_train, app_test])

In [None]:
# Load the bureau tables and report the size of `bureau`.
# NOTE(review): bureau_bal is not referenced again in the visible part of the notebook.
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
print("The number of features is :",bureau.shape[1],"The number of row is:",bureau.shape[0])

In [None]:
# In DAYS_CREDIT, the values -497 and -208 belong to the same ID, 215354.
# Look at the previous credit records of customer 215354.
bureau[bureau['SK_ID_CURR'] == 215354]

In [None]:
# Attach the TARGET label to each bureau record. The inner join keeps only bureau
# rows whose SK_ID_CURR also appears in app_train.
train_targets = app_train[['SK_ID_CURR', 'TARGET']]
app_bureau = bureau.merge(train_targets, on='SK_ID_CURR', how='inner')
app_bureau.shape

In [None]:
# Class balance of TARGET side by side: share of each class (pie) and absolute counts (bars).
f,ax=plt.subplots(1,2,figsize=(12,6))
app_train.TARGET.value_counts().plot.pie(explode=[0,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('Distribution of Target')
ax[0].set_ylabel('')
# `x` is passed by keyword: seaborn >= 0.12 takes `data` as the first positional
# argument, so the positional form collides with data=app_train and raises.
sns.countplot(x='TARGET',data=app_train,ax=ax[1])
ax[1].set_title('Target count')
plt.show()

#### Null 값 확인

In [None]:
# Helper that reports (and plots) the share of null values per column.
def nulldata(data):
    """Summarise and plot the percentage of missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the columns
        ``Total`` (number of nulls) and ``Percent`` (share of rows, 0-100).

    As a side effect a bar chart of ``Percent`` is drawn on a new figure. Nothing
    is drawn when no column has missing values.
    """
    null_counts = data.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / data.isnull().count() * 100).sort_values(ascending=False)
    ms = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    ms = ms[ms["Percent"] > 0]
    if ms.empty:
        # Nothing to plot; avoid handing an empty frame to seaborn.
        return ms
    plt.subplots(figsize=(15, 10))
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=ms.index, y=ms["Percent"], color="green", alpha=0.8)
    # Rotation must be numeric; recent matplotlib rejects the string '90'.
    plt.xticks(rotation=90)
    plt.xlabel('Features', fontsize=15)
    plt.ylabel('Percent of null values', fontsize=15)
    plt.title('Percent null data by feature', fontsize=15)
    return ms

# Check only the bureau table.
nulldata(bureau)

### Missing value 처리

In [None]:
# Preview the combined frame before imputation.
apps.head()

In [None]:
# Numeric columns are imputed with their median.
def miss_numerical(df):
    """Fill missing values of the non-object columns of ``df`` with the column median.

    Every non-object column is imputed, including identifier or label columns if
    they are present, except the columns named in the skip list below.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. Its columns are reassigned, so the caller's frame changes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Columns that are deliberately left with their missing values.
    features = ['previous_loan_counts','NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_AVG','NONLIVINGAREA_MEDI','OWN_CAR_AGE']
    # Select the columns from the frame being processed rather than from the global
    # `apps`, so the function works for any DataFrame that is passed in.
    numerical_features = df.select_dtypes(exclude=["object"]).columns
    for f in numerical_features:
        if f not in features:
            df[f] = df[f].fillna(df[f].median())

    return df
# Categorical columns are imputed with their most frequent value.
def miss_categorical(df):
    """Fill missing values of the object-dtype columns of ``df`` with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. Its columns are reassigned, so the caller's frame changes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Select the columns from the frame being processed rather than from the global
    # `apps`, so the function works for any DataFrame that is passed in.
    categorical_features = df.select_dtypes(include=["object"]).columns

    for f in categorical_features:
        modes = df[f].mode()
        # mode() is empty when every value of the column is missing; in that case
        # there is nothing to impute with, so the column is left unchanged.
        if not modes.empty:
            df[f] = df[f].fillna(modes.iloc[0])

    return df
def transform_feature(df):
    """Impute missing values by running ``miss_numerical`` and then ``miss_categorical``.

    Returns the frame produced by the second step.
    """
    return miss_categorical(miss_numerical(df))
# Impute the combined train+test frame and preview the result.
# NOTE(review): TARGET is a non-object column and is not in the skip list of
# miss_numerical, so the NaN labels of the test rows are filled with the TARGET
# median here — they are not real labels.
apps = transform_feature(apps)
apps.head()

#### LGBM classifier로 학습

In [None]:
# Use only the rows that originate from application_train. `apps` also holds the
# test rows, whose TARGET is not a real label (it was missing and was filled in
# during imputation), so they must not take part in fitting or validation.
is_labelled = apps['SK_ID_CURR'].isin(app_train['SK_ID_CURR'])
apps_labelled = apps[is_labelled]

ftr_app = apps_labelled.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_labelled['TARGET']

# Hold out 30% of the labelled rows for validation with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
from lightgbm import early_stopping, log_evaluation

# Second model: same settings as the baseline plus an explicit num_leaves.
# NOTE(review): in LightGBM, `subsample` only takes effect when `subsample_freq` > 0 —
# confirm whether row subsampling is actually intended here.
clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        verbose=-1
        )

# Early stopping (50 rounds) and progress logging (every 100 rounds) are supplied as
# callbacks: the `verbose` / `early_stopping_rounds` keyword arguments of fit() and
# the `silent` constructor argument were removed in LightGBM 4.0.
clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
        )

이 전의 결과보다 auc 상승

In [None]:
# Plot the feature importances of the re-trained model on a 16x32 figure.
plot_importance(clf, figsize=(16, 32))

In [None]:
# Score only the application_test rows and write them as the submission file.
# The predictions go into a separate frame, so the labels stored in apps['TARGET']
# are not overwritten and the cell can be re-run safely.
is_test = apps['SK_ID_CURR'].isin(app_test['SK_ID_CURR'])
test_rows = apps[is_test]
preds = clf.predict_proba(test_rows.drop(['SK_ID_CURR','TARGET'], axis=1))[:, 1 ]
submission = pd.DataFrame({'SK_ID_CURR': test_rows['SK_ID_CURR'].to_numpy(), 'TARGET': preds})
submission.to_csv('prev_baseline.csv', index=False)