**2021-1 BIG DATA ANALYSIS (001) Final Project**

1711317 Jeon Hyeon-ju

**previous_application** data EDA, feature engineering and model generation

In [None]:
import numpy as np
import pandas as pd
import gc
import time

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

# Raise the pandas display limits to 100 rows and 200 columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [None]:
# Load the application CSV files used in this notebook
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# Concatenate both sets so they can be processed together
# (the original row labels of each frame are kept, so labels repeat)
app = pd.concat([app_train, app_test])

# Check the shapes
print('Shape of app_train', app_train.shape)
print('Shape of app_test', app_test.shape)
print('Shape of app', app.shape)

app_train의 column 개수가 app_test보다 1개 더 많다 (TARGET 컬럼).

In [None]:
# Load the previous_application CSV file and preview it
prev_app = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
print(prev_app.shape)
prev_app.head()

In [None]:
# Outer-merge previous applications with the application IDs.
# indicator=True adds a `_merge` column recording whether each row came from
# the left frame only, the right frame only, or both.
app_prev_outer = prev_app.merge(app['SK_ID_CURR'], on='SK_ID_CURR', how='outer', indicator=True)
app_prev_outer.head()

In [None]:
# Count how many rows matched on both sides versus only one side of the merge
app_prev_outer['_merge'].value_counts()

In [None]:
# Helper that summarises missing values per column
def missing_data(data):
    """Return the null count and null percentage of every column.

    Args:
        data: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with a 'Total' column (number of
        nulls) and a 'Percent' column (nulls as a percentage of the rows),
        ordered from most to fewest nulls.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Show the ten previous_application columns with the most missing values
missing_data(prev_app).head(10)

In [None]:
# Inner-join prev_app with the TARGET column of app_train so that features can
# be inspected by TARGET. An inner join keeps only previous applications whose
# SK_ID_CURR appears in app_train, so every remaining row has a TARGET value
# (a left join would also keep rows with no matching TARGET).
app_prev_target = prev_app.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='inner')
app_prev_target.shape

In [None]:
# Plot helper: distribution of numeric columns split by TARGET
def show_hist_by_target(df, columns):
    """Draw a violin plot and overlaid distribution plots for each column.

    For every name in ``columns`` a new 1x2 figure is created: the left axis
    shows a violin plot of the column grouped by TARGET, the right axis
    overlays the distribution of the column for TARGET == 0 (green) and
    TARGET == 1 (red).

    Args:
        df: DataFrame containing a 'TARGET' column and the given columns.
        columns: iterable of column names to plot.
    """
    is_target_1 = df['TARGET'] == 1
    is_target_0 = df['TARGET'] == 0

    for column in columns:
        _, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
        violin_ax, dist_ax = axes[0]

        sns.violinplot(x='TARGET', y=column, data=df, ax=violin_ax)
        # NOTE(review): sns.distplot is marked deprecated in recent seaborn
        # releases - confirm it is still available in the installed version.
        sns.distplot(df.loc[is_target_0, column], ax=dist_ax, label='0', color='Green')
        sns.distplot(df.loc[is_target_1, column], ax=dist_ax, label='1', color='red')

In [None]:
# Plot helper: category counts split by TARGET
def show_category_by_target(df, columns):
    """Draw one count plot per TARGET value for each categorical column.

    Args:
        df: DataFrame containing a 'TARGET' column and the given columns.
        columns: iterable of categorical column names to plot.
    """
    for column in columns:
        grid = sns.catplot(data=df, x=column, col="TARGET", kind="count")
        # Rotate the tick labels by 45 degrees
        grid.set_xticklabels(rotation=45)

In [None]:
# Helper that separates numerical features from categorical features

def type_features(data):
    """Print the object-dtype columns and the remaining columns of ``data``.

    Columns with dtype ``object`` are reported as categorical features and
    every other column as a numerical feature. Nothing is returned.

    Args:
        data: DataFrame whose columns should be classified.
    """
    object_cols = data.select_dtypes(include=["object"]).columns
    other_cols = data.select_dtypes(exclude=["object"]).columns
    separator = '-' * 100

    print("categorical_features :", object_cols)
    print(separator)
    print("numerical_features:", other_cols)

In [None]:
# List the categorical and numerical columns of the merged frame
type_features(app_prev_target)

In [None]:
# Plot helper: counts of a categorical column, overall and split by a hue column
def plot_re(df, t1='', t2=''):
    """Show two bar charts for column ``t1``.

    The left chart shows the number of non-null ``t2`` values per ``t1``
    category; the right chart shows the count of each ``t1`` category split by
    the values of ``t2``.

    Args:
        df: DataFrame containing both columns.
        t1: name of the categorical column placed on the x axis.
        t2: name of the column used for counting and as hue (e.g. 'TARGET').

    Returns:
        Whatever ``plt.show()`` returns.
    """
    f, ax = plt.subplots(1, 2, figsize=(12, 8))
    df[[t1, t2]].groupby([t1]).count().plot.bar(ax=ax[0], color='Green')
    ax[0].set_title('count on ' + t1)
    # Pass the column as the keyword `x`: recent seaborn releases no longer
    # accept it positionally (the first positional argument is `data`).
    sns.countplot(x=t1, hue=t2, data=df, ax=ax[1], palette="summer")
    ax[1].set_title(t1 + ': Target 0 vs Target 1')
    # plt.xticks acts on the current axes only
    plt.xticks(rotation=90)
    return plt.show()

**Application_train/test EDA**

In [None]:
# Share of each TARGET value among all training rows
app_train['TARGET'].value_counts()/app_train.shape[0]

target 값의 비율 차가 큼 => 불균형(imbalanced) 데이터임

In [None]:
# Histogram of the TARGET values
app_train['TARGET'].astype(int).plot.hist()

In [None]:
# Histogram of total income over all training rows (not split by TARGET)
app_train['AMT_INCOME_TOTAL'].hist()

In [None]:
# Distribution of client ages.
# DAYS_BIRTH in the Home Credit data is documented as days relative to the
# application date (negative numbers), so take the absolute value before
# converting to years; abs() is a no-op if the values are already positive.
plt.hist(app_train['DAYS_BIRTH'].abs() / 365, edgecolor='k', bins=25)
plt.title('Age of Client');
plt.xlabel('Age (years)');
plt.ylabel('Count');

In [None]:
plt.figure(figsize=(10, 8))

# DAYS_BIRTH in the Home Credit data is documented as days relative to the
# application date (negative numbers); abs() turns it into a positive age.
age_years = app_train['DAYS_BIRTH'].abs() / 365

# Age of clients who repaid normally (TARGET=0)
sns.kdeplot(age_years[app_train['TARGET'] == 0], label='target==0')

# Age of clients who did not repay normally (TARGET=1)
sns.kdeplot(age_years[app_train['TARGET'] == 1], label='target==1')

plt.xlabel('Age(years)');
plt.ylabel('Density');
plt.title('Distribution of Ages');
# Draw the legend explicitly so the two curves can be told apart
plt.legend();

40대 이전에는 제 때에 상환하지 못할 확률이 큼(red line), 특히 30대에는 제 때에 상환하지 못할 확률이 매우 큼.
40대 이후로 제 때에 상환할 확률이 커짐(blue line)

In [None]:
# Gender shares overall, among TARGET == 1 rows and among TARGET == 0 rows
cond_1 = (app_train['TARGET'] == 1)
cond_0 = (app_train['TARGET'] == 0)

# Share of each gender over all training rows
print(app_train['CODE_GENDER'].value_counts()/app_train.shape[0])
# Share of each gender within the TARGET == 1 rows
print('\n연체인 경우\n',app_train[cond_1]['CODE_GENDER'].value_counts()/app_train[cond_1].shape[0])
# Share of each gender within the TARGET == 0 rows
print('\n연체가 아닌 경우\n',app_train[cond_0]['CODE_GENDER'].value_counts()/app_train[cond_0].shape[0])

대출 횟수 대비 연체 비율은 남성이 여성보다 높음

**previous_application EDA**

In [None]:
# TARGET counts by contract (loan) type
plot_re(app_prev_target, 'NAME_CONTRACT_TYPE','TARGET')

Cash loans과 Consumer loans의 수가 확연히 큼.
target0은 Consumer > Cash > Revolving loans 순이지만
target1은 Cash > Consumer > Revolving loans 순임

In [None]:
# TARGET counts by who accompanied the client (NAME_TYPE_SUITE)
plot_re(app_prev_target, 'NAME_TYPE_SUITE','TARGET')

Unaccompanied > Family > Spouse, partner > Children > etc. 순이고, Unaccompanied와 Family가 매우 큰 비율을 차지함.
전체적으로 Target0이 훨씬 많음.

In [None]:
# Box plot of the number of previous_application rows per SK_ID_CURR
sns.boxplot(prev_app.groupby('SK_ID_CURR')['SK_ID_CURR'].count())

In [None]:
# Collect the names of all non-object (numeric) columns
# NOTE(review): this presumably also picks up the ID columns and TARGET itself,
# since they are not object dtype - confirm whether they should be plotted.
num_columns = app_prev_target.dtypes[app_prev_target.dtypes != 'object'].index.tolist()
num_columns

In [None]:
# Plot every numeric column split by TARGET (one figure per column)
show_hist_by_target(app_prev_target, num_columns)

In [None]:
# Compare the category count plots for each TARGET value
# Collect the names of all object-dtype (categorical) columns
object_columns = app_prev_target.dtypes[app_prev_target.dtypes=='object'].index.tolist()
object_columns

# One count-plot grid per categorical column
show_category_by_target(app_prev_target, object_columns)

**Feature Engineering**

In [None]:
def get_app_processed(app):
    """Add ratio features and EXT_SOURCE statistics to the application frame.

    The frame is modified in place and also returned.

    Args:
        app: application DataFrame holding the AMT_*, DAYS_*, CNT_FAM_MEMBERS,
            OWN_CAR_AGE and EXT_SOURCE_1..3 columns.

    Returns:
        The same DataFrame with nine ratio columns plus EXT_SOURCE_MEAN and
        EXT_SOURCE_STD appended.
    """
    # (new column, numerator column, denominator column)
    ratio_specs = [
        ('ANNUITY_CREDIT_RATIO', 'AMT_ANNUITY', 'AMT_CREDIT'),           # annuity vs. credit amount
        ('ANNUITY_INCOME_RATIO', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),     # annuity vs. income
        ('CREDIT_INCOME_RATIO', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),       # credit amount vs. income
        ('CNT_FAM_INCOME_RATIO', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'), # income per family member
        ('EMPLOYED_BIRTH_RATIO', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),         # employment length vs. age
        ('INCOME_EMPLOYED_RATIO', 'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED'),  # income vs. employment length
        ('INCOME_BIRTH_RATIO', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH'),        # income vs. age
        ('CAR_BIRTH_RATIO', 'OWN_CAR_AGE', 'DAYS_BIRTH'),                # car age vs. client age
        ('CAR_EMPLOYED_RATIO', 'OWN_CAR_AGE', 'DAYS_EMPLOYED'),          # car age vs. employment length
    ]
    for feature, numerator, denominator in ratio_specs:
        app[feature] = app[numerator] / app[denominator]

    # Row-wise mean and standard deviation of the three external scores
    ext_sources = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
    app['EXT_SOURCE_MEAN'] = ext_sources.mean(axis=1)
    ext_std = ext_sources.std(axis=1)
    # Rows where the std is undefined are filled with the column-wide mean std
    app['EXT_SOURCE_STD'] = ext_std.fillna(ext_std.mean())
    return app

In [None]:
def get_prev_processed(prev):
    """Add derived per-application features to the previous_application frame.

    The frame is modified in place and also returned.

    Args:
        prev: previous_application DataFrame with the AMT_*, DAYS_* and
            CNT_PAYMENT columns.

    Returns:
        The same DataFrame with the PREV_* feature columns added and the value
        365243 in the DAYS_* due/termination columns replaced by NaN.
    """
    # Differences and ratios between requested amount, granted credit and goods price.
    # NOTE(review): the ratios become inf/NaN where AMT_APPLICATION is 0 -
    # confirm downstream aggregation tolerates that.
    prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT'] / prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE'] / prev['AMT_APPLICATION']

    # 365243 is presumably a placeholder for "no date" in these columns
    # (TODO confirm against the dataset description); treat it as missing.
    # Assign the result back instead of calling replace(..., inplace=True) on
    # a column selection: the chained in-place form does not update `prev`
    # when pandas Copy-on-Write is active.
    days_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[days_columns] = prev[days_columns].replace(365243, np.nan)

    # Gap between the first-version last due date and the actual last due date
    prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']

    # Total amount paid over the whole schedule
    all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    # Interest rate per payment: relative overpayment divided by the number of payments
    prev['PREV_INTERESTS_RATE'] = (all_pay / prev['AMT_CREDIT'] - 1) / prev['CNT_PAYMENT']

    return prev

In [None]:
def get_prev_amt_agg(prev):
    """Aggregate previous-application features per SK_ID_CURR.

    Args:
        prev: previous_application DataFrame that already contains the
            derived PREV_* columns.

    Returns:
        DataFrame indexed by SK_ID_CURR with one column per
        (source column, statistic) pair, named 'PREV_<COLUMN>_<STAT>'.
    """
    mean_max_sum = ['mean', 'max', 'sum']
    min_max_mean = ['min', 'max', 'mean']
    mean_max = ['mean', 'max']

    agg_dict = {
        'SK_ID_CURR': ['count'],
        'AMT_CREDIT': mean_max_sum,
        'AMT_ANNUITY': mean_max_sum,
        'AMT_APPLICATION': mean_max_sum,
        'AMT_DOWN_PAYMENT': mean_max_sum,
        'AMT_GOODS_PRICE': mean_max_sum,
        'RATE_DOWN_PAYMENT': min_max_mean,
        'DAYS_DECISION': min_max_mean,
        'CNT_PAYMENT': ['mean', 'sum'],

        'PREV_CREDIT_DIFF': mean_max_sum,
        'PREV_CREDIT_APPL_RATIO': mean_max,
        'PREV_GOODS_DIFF': mean_max_sum,
        'PREV_GOODS_APPL_RATIO': mean_max,
        'PREV_DAYS_LAST_DUE_DIFF': mean_max_sum,
        'PREV_INTERESTS_RATE': mean_max,
    }
    aggregated = prev.groupby('SK_ID_CURR').agg(agg_dict)

    # Flatten the (column, statistic) pairs into single upper-case column names
    aggregated.columns = [
        "PREV_" + "_".join(pair).upper() for pair in aggregated.columns
    ]
    return aggregated

In [None]:
# previous application groupby + aggregation
def get_prev_refused_appr_agg(prev):
    """Count approved and refused previous applications per SK_ID_CURR.

    Args:
        prev: previous_application DataFrame with 'SK_ID_CURR' and
            'NAME_CONTRACT_STATUS' columns.

    Returns:
        DataFrame indexed by SK_ID_CURR with the columns
        'PREV_APPROVED_COUNT' and 'PREV_REFUSED_COUNT'. Only customers with at
        least one approved or refused application appear; a status that never
        occurs for a customer is counted as 0.
    """
    statuses = ['Approved', 'Refused']
    decided = prev[prev['NAME_CONTRACT_STATUS'].isin(statuses)]
    counts = decided.groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS']).size().unstack()

    # Force both status columns to exist, in a fixed order, before renaming.
    # Otherwise the rename fails when one status is absent from the data.
    counts = counts.reindex(columns=statuses)
    counts.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']

    # Fill NaN with 0: the customer has no application with that status
    return counts.fillna(0)

In [None]:
def get_prev_agg(prev):
    """Build the per-customer aggregation of previous applications.

    Args:
        prev: raw previous_application DataFrame (modified in place by the
            feature-derivation step).

    Returns:
        DataFrame indexed by SK_ID_CURR containing the aggregated PREV_*
        statistics plus 'PREV_REFUSED_RATIO' and 'PREV_APPROVED_RATIO', the
        shares of refused / approved applications among all previous
        applications of the customer.
    """
    # Derive the per-application features (one pass is enough)
    prev = get_prev_processed(prev)
    prev_amt_agg = get_prev_amt_agg(prev)

    # Refused or Approved previous credit
    prev_refused_appr_agg = get_prev_refused_appr_agg(prev)

    prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')

    # Customers with neither an approved nor a refused application are missing
    # from the right-hand frame, so the left merge leaves NaN counts; they
    # have zero such applications.
    count_columns = ['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT']
    prev_agg[count_columns] = prev_agg[count_columns].fillna(0)

    # Share of refused / approved applications among all previous applications
    prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT'] / prev_agg['PREV_SK_ID_CURR_COUNT']
    prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT'] / prev_agg['PREV_SK_ID_CURR_COUNT']

    # Drop the raw counts; only the ratios are kept as features
    prev_agg = prev_agg.drop(count_columns, axis=1)

    return prev_agg

In [None]:
# Label-encode the object columns with pd.factorize
def get_apps_all_encoded(apps_all):
    """Replace every object-dtype column by its integer factorize codes.

    The frame is modified in place and also returned.

    Args:
        apps_all: DataFrame whose object columns should be encoded.

    Returns:
        The same DataFrame with each object column replaced by the codes
        produced by ``pd.factorize``.
    """
    object_columns = [
        name for name, dtype in apps_all.dtypes.items() if dtype == 'object'
    ]
    for column in object_columns:
        codes, _ = pd.factorize(apps_all[column])
        apps_all[column] = codes
    return apps_all

In [None]:
# Split the combined frame back into train and test data
def get_apps_all_train_test(apps_all):
    """Split the combined frame by whether TARGET is present.

    Args:
        apps_all: combined DataFrame with a 'TARGET' column that is null for
            test rows.

    Returns:
        Tuple ``(train, test)``: rows with a non-null TARGET, and rows with a
        null TARGET with the 'TARGET' column removed.
    """
    has_target = apps_all['TARGET'].notnull()
    train_part = apps_all[has_target]
    test_part = apps_all[~has_target].drop(columns='TARGET')
    return train_part, test_part

In [None]:
# Run feature engineering on every table and combine the results
def get_apps_all_with_all_agg(app, prev_app):
    """Process both tables and left-join the aggregates onto the applications.

    Prints the processed application frame, the aggregated previous
    application frame, and the shape of the final merged data set.

    Args:
        app: combined application DataFrame (train + test).
        prev_app: raw previous_application DataFrame.

    Returns:
        The processed application frame with the per-customer previous
        application aggregates merged in on 'SK_ID_CURR'.
    """
    processed_app = get_app_processed(app)
    print("Final Application:", processed_app)

    prev_features = get_prev_agg(prev_app)
    print("Final Previous:", prev_features)

    # Join everything into the final train/test data set
    combined = processed_app.merge(prev_features, on='SK_ID_CURR', how='left')
    print('Final DataSet:', combined.shape)

    return combined

**Model**

In [None]:
# Process and combine the application and previous_application data sets.
apps_all = get_apps_all_with_all_agg(app, prev_app)

# Label-encode the categorical columns.
apps_all = get_apps_all_encoded(apps_all)

# Split back into train and test data.
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)

In [None]:
# Check the shapes of the final train and test frames
print("Train Data", apps_all_train.shape)
print("Test Data", apps_all_test.shape)

In [None]:
# Separate the model inputs (everything except the ID and the label) from the label
ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1) # feature dateset
target_app = apps_all_train['TARGET'] # target datasets

# NOTE(review): test_preds is not referenced again in the visible notebook.
test_preds = np.zeros(apps_all_test.shape[0])

In [None]:
# Hold out 30% of the training rows for validation (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with hand-set hyperparameters: up to 4000 trees at a
# learning rate of 0.01.
# NOTE(review): `silent` is not a documented LGBMClassifier argument in recent
# LightGBM releases - confirm the installed version still accepts it;
# `verbose=-1` presumably already requests quiet output.
clf = LGBMClassifier(
                nthread=4,
                n_estimators=4000,
                learning_rate=0.01,
                num_leaves=58,
                max_depth = 11,
                max_bin=407,
                colsample_bytree=0.613,
                subsample=0.708,
                reg_alpha=3.564,
                reg_lambda=4.930,
                min_child_weight= 6,
                min_child_samples=165,
                silent=-1,
                verbose=-1,
                )

In [None]:
# Train with AUC evaluated on both the training and the validation split.
# Early stopping (50 rounds without improvement) and logging (every 100
# rounds) are configured through callbacks: the `early_stopping_rounds` and
# `verbose` arguments of fit() were removed in LightGBM 4.
from lightgbm import early_stopping, log_evaluation

clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)])

In [None]:
# Visualise the feature importances of the fitted model
from lightgbm import plot_importance
plot_importance(clf, figsize=(16, 32))

In [None]:
# Preview the class probabilities for the processed test set.
# The model was trained on the engineered features, so predict on
# apps_all_test (not the raw app_test) with only the ID column removed.
clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))

In [None]:
# Take column 1 of predict_proba (probability of the second class) as the prediction
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1 ]
# Store the predictions as TARGET and write the two-column submission file
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_hj.csv', index=False)