# <center>Level 3. Home Credit Default Risk</center>
### <center>- Manual Feature Engineering -</center>
# <center>**주택 대출 미상환 고객 예측**</center>
## <center>제작자 : 서상훈</center>

---

 - 커리큘럼 : https://kaggle-kr.tistory.com/32?category=868318

 - 원문 : https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction
---

지난 노트에서는 application_train / application_test 만 사용했는데
이번 노트에서는 bureau 라는 데이터를 활용해보자

 - bureau : 다른 금융 기관으로부터의 대출 내역

In [None]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

bureau 데이터를 가지고, agg 연산자를 통해 <br>
mean, max, min, sum, count 값을 활용하자

이후 처리된 데이터를 train에 merge 하자.

In [None]:
# Read in bureau
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau.head()

In [None]:
# Count the non-null SK_ID_BUREAU entries per client (SK_ID_CURR).
# as_index=False keeps SK_ID_CURR as a regular column so the result can be merged on it.
previous_loan_counts = (
    bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU']
          .count()
          .rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
)
previous_loan_counts.head()

In [None]:
# Load the training applications and attach the per-client loan counts.
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
train = train.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')

# The left join leaves NaN for clients that have no row in previous_loan_counts;
# treat those as zero previous loans.
train['previous_loan_counts'] = train['previous_loan_counts'].fillna(0)
train.head()

새로운 변수가 유용한지 상관계수로 확인해보자

In [None]:
# Helper for drawing KDE plots of a variable split by the value of the target
def kde_target(var_name, df):
    """Plot the distribution of `var_name` for TARGET == 0 and TARGET == 1.

    Draws one KDE curve per target value on a single figure and prints the
    correlation between `var_name` and TARGET together with the median of
    `var_name` in each group.

    Parameters
    --------
        var_name (string):
            name of the column to plot; must be present in `df`
        df (dataframe):
            dataframe containing `var_name` and a `TARGET` column

    Return
    --------
        None. The function only plots and prints.
    """

    # Correlation coefficient between the variable and the target
    corr = df['TARGET'].corr(df[var_name])

    # Select each group once with `.loc`; the older `.ix` indexer used here
    # before is no longer available in current pandas.
    repaid = df.loc[df['TARGET'] == 0, var_name]
    not_repaid = df.loc[df['TARGET'] == 1, var_name]

    # Medians for repaid vs not repaid
    avg_repaid = repaid.median()
    avg_not_repaid = not_repaid.median()

    plt.figure(figsize = (12, 6))

    # Plot the distribution for target == 0 and target == 1
    sns.kdeplot(repaid, label = 'TARGET == 0')
    sns.kdeplot(not_repaid, label = 'TARGET == 1')

    # Label the plot
    plt.xlabel(var_name); plt.ylabel('Density'); plt.title('%s Distribution' % var_name)
    plt.legend();

    # Print out the correlation
    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    # Print out the median values
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid =     %0.4f' % avg_repaid)

In [None]:
kde_target('EXT_SOURCE_3', train)

이제 우리가 새로 만들었던 변수를 kde plot 을 통해 확인해보자

In [None]:
kde_target('previous_loan_counts', train)

상관계수가 매우 약하고 딱히 이 변수가 중요한지 확인하기 어렵다.
bureau 데이터에서 몇 가지 변수를 더 만들어보자.

### Aggregating Numeric Columns

bureau 에서 가지고 온 previous_loan이 별 특별한 것이 없었다.

이제 bureau에서 단순한 count 값이 아닌 mean, max 등 컬럼을 새로 추가해보자

In [None]:
# Group by the client id and calculate aggregation statistics.
# Only the numeric columns are aggregated: 'mean' and 'sum' are not defined for
# the text columns, and newer pandas versions raise instead of silently
# dropping them. The grouping key becomes the index and is turned back into a
# column by reset_index(), so as_index=False is not needed.
bureau_agg = (
    bureau.drop(columns = ['SK_ID_BUREAU'])
          .select_dtypes('number')
          .groupby('SK_ID_CURR')
          .agg(['count', 'mean', 'max', 'min', 'sum'])
          .reset_index()
)
bureau_agg.head()

각 컬럼별로 count, mean, max, min, sum 통계량 컬럼이 추가되어 출력된다.

In [None]:
# Flat column names, starting with the id column
columns = ['SK_ID_CURR']

# Walk the actual (variable, statistic) column tuples instead of
# `columns.levels`: the levels only hold the unique labels, and their order is
# not guaranteed to match the order of the columns, so names built from the
# levels can end up attached to the wrong data.
for var, stat in bureau_agg.columns:
    # Skip the id column
    if var != 'SK_ID_CURR':
        # Make a new column name for the variable and stat
        columns.append('bureau_%s_%s' % (var, stat))

In [None]:
# Assign the list of columns names as the dataframe column names
bureau_agg.columns = columns
bureau_agg.head()

In [None]:
# Train 데이터에 merge 하자
train = train.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
train.head()

## Correlation

In [None]:
# Collect (column name, correlation with TARGET) pairs for the aggregated columns
new_corrs = []

# `columns` is the list of flattened bureau_agg column names built above
for col in columns:
    # Correlation between the target and this column
    corr = train['TARGET'].corr(train[col])
    
    # Store the pair as a tuple

    new_corrs.append((col, corr))

In [None]:
# Sort the correlations by absolute value, largest first.
# A NaN correlation compares False against every other value, which leaves the
# sort order undefined, so NaN entries are given the lowest key and end up last.
new_corrs = sorted(new_corrs, key = lambda x: -1 if np.isnan(x[1]) else abs(x[1]), reverse = True)
new_corrs[:15]

상관관계를 보니 새로운 변수들 중 어느것도 TARGET과 유의한
상관관계를 갖지 않는다. 절대적인 측면에서 bureau_DAYS_CREDIT_mean이
제일 상관관계가 높은 변수이므로 kde plot 을 확인해보자

In [None]:
kde_target('bureau_DAYS_CREDIT_mean', train)

자 ! 이제 위의 과정을 통해서 numeric 데이터를
count, mean, max, min, sum으로 집계하여 추가하는 과정을 살펴보았다.
이제 이 과정을 하나의 함수로 만들어보자

## Function for Numeric Aggregations

In [None]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with one row per value of `group_var`. The first
            column is `group_var`; every other numeric column of `df`
            contributes five columns named `<df_name>_<column>_<stat>` with
            stat in (count, mean, max, min, sum), in the column order of `df`.
            Columns whose name contains 'SK_ID' (other than `group_var`) are
            treated as ids and are not aggregated.
    
    """
    # Remove id variables other than the grouping variable
    id_columns = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns = id_columns)

    # Work on a copy so that adding the grouping column never writes into a
    # view of the caller's dataframe
    numeric_df = df.select_dtypes('number').copy()
    # Makes sure the grouping column is present even if it is not numeric
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Flatten the (variable, statistic) column index. The names are taken from
    # the actual column tuples rather than from `columns.levels`, because the
    # order of the levels is not guaranteed to match the order of the columns.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    # Turn the grouping variable back into the first column
    return agg.reset_index()

In [None]:
bureau_agg_new = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg_new.head()

위의 함수가 제대로 작성되었는지 확인하기 위해 <br>
직접 해봤던 데이터 프레임과 비교해보자

In [None]:
bureau_agg.head()

## Correlation Function
bureau 데이터에 여러 컬럼을 추가한 과정을 함수로 표현한 것처럼
상관관계를 구했던 과정도 마찬가지로 함수로 만들어보자.

In [None]:
# Function to calculate correlations with the target for a dataframe
def target_corrs(df):
    """Return the correlation of every numeric column of `df` with TARGET.

    Parameters
    --------
        df (dataframe):
            dataframe that contains a `TARGET` column

    Return
    --------
        corrs (list):
            list of (column name, correlation) tuples sorted by the absolute
            value of the correlation, largest first. Non-numeric columns are
            skipped because a correlation cannot be computed for them.
            Columns whose correlation is NaN are placed at the end.
    """
    target = df['TARGET']

    # List of correlations
    corrs = []

    # Iterate through the columns
    for col in df.columns:
        # Skip the target itself and anything that is not numeric
        if col == 'TARGET' or not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Append the (name, correlation) pair as a tuple
        corrs.append((col, target.corr(df[col])))

    # Sort by absolute magnitude of the correlations. NaN compares False with
    # every value and would leave the order undefined, so it gets the lowest key.
    corrs = sorted(corrs, key = lambda x: -1.0 if pd.isna(x[1]) else abs(x[1]), reverse = True)

    return corrs

## Categorical Variables
numeric 데이터를 다루었으니 이제 categorical (범주형) 변수를 다뤄보자

범주형 변수를 다룰 때 자주 사용하는 one-hot-encoding을 해보자

In [None]:
# one-hot-encoding
categorical = pd.get_dummies(bureau.select_dtypes('object'))
categorical['SK_ID_CURR'] = bureau['SK_ID_CURR']
categorical.head()

one-hot-encoding 을 시행한 데이터에서 sum과 mean 데이터만 가져오자

In [None]:
categorical_grouped = categorical.groupby('SK_ID_CURR').agg(['sum', 'mean'])
categorical_grouped.head()

In [None]:
categorical_grouped.columns.levels[0][:10]

In [None]:
categorical_grouped.columns.levels[1]

카테고리 데이터의 level이 2단계로 나누어져 있으므로
다루기 위해서 for문을 활용하여 1단계로 통합하자

In [None]:
group_var = 'SK_ID_CURR'

# The aggregation produced 'sum' and 'mean'; for one-hot columns these are the
# count and the normalized count of each category
stat_names = {'sum': 'count', 'mean': 'count_norm'}

# Need to create new column names
columns = []

# Walk the actual (variable, statistic) column tuples instead of
# `columns.levels`: the order of the levels is not guaranteed to match the
# order of the columns, so names built from the levels can be misassigned.
for var, stat in categorical_grouped.columns:
    # Skip the grouping variable
    if var != group_var:
        # Make a new column name for the variable and stat
        columns.append('%s_%s' % (var, stat_names[stat]))

#  Rename the columns
categorical_grouped.columns = columns

categorical_grouped.head()

이제 train 데이터에 merge 하자.

In [None]:
train = train.merge(categorical_grouped, left_on = 'SK_ID_CURR', right_index = True, how = 'left')
train.head()


In [None]:
train.shape

In [None]:
train.iloc[:10, 123:]

앞서 각 타입의 변수들을 함수로 만들었듯이,
범주형 변수도 함수로 만들어주자.

In [None]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe indexed by `group_var` with two columns per one-hot
        encoded category: `<df_name>_<column>_<category>_count` (sum of the
        indicator) and `<df_name>_<column>_<category>_count_norm` (mean of
        the indicator).
        
    """
    
    # One-hot encode the object-dtype (categorical) columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    # For indicator columns the sum is a count and the mean a normalized count
    stat_names = {'sum': 'count', 'mean': 'count_norm'}
    
    # Build the names from the actual (variable, statistic) column tuples.
    # `columns.levels` is not used because the order of the levels is not
    # guaranteed to match the order of the columns.
    categorical.columns = ['%s_%s_%s' % (df_name, var, stat_names[stat])
                           for var, stat in categorical.columns]
    
    return categorical

In [None]:
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_counts.head()

자 지금까지가 bureau.csv 의 데이터를 가지고 다루었으며,
이 과정들을 함수로 묶었다.

이제 bureau_balance.csv 데이터를 다루어보자.

 - bureau_balance : bureau 의 월별 데이터

In [None]:
# Read in bureau balance
bureau_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
bureau_balance.head()

아까 위에서 만들었던 범주형 변수 함수를 활용해서 <br>
bureau_balance_count 도 return 받자.

In [None]:
# Counts of each type of status for each previous loan
bureau_balance_counts = count_categorical(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_counts.head()

In [None]:
# Aggregate the numeric columns of bureau_balance for each previous loan.
# The grouping key here is `SK_ID_BUREAU` (one row per loan), not `SK_ID_CURR`.
bureau_balance_agg = agg_numeric(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg.head()

In [None]:
# One row per loan: combine the numeric aggregates with the category counts.
# bureau_balance_counts carries SK_ID_BUREAU in its index, hence right_index = True;
# the outer join keeps loans that appear in only one of the two frames.
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index = True, left_on = 'SK_ID_BUREAU', how = 'outer')

# Attach the client id (SK_ID_CURR) of each loan. With this left join, loans
# that have no row in `bureau` keep their statistics but get NaN as SK_ID_CURR.
bureau_by_loan = bureau_by_loan.merge(bureau[['SK_ID_BUREAU', 'SK_ID_CURR']], on = 'SK_ID_BUREAU', how = 'left')

bureau_by_loan.head()

In [None]:
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'client')
bureau_balance_by_client.head()

위의 과정들을 설명하자면,
bureau_balance 데이터를 가지고
1. categorical 데이터로 count(sum), count_norm(mean) 을 만든다.
2. numeric 데이터로 count, mean, max, min, sum 을 만든다.
3. 1과 2의 데이터를 outer 조인으로 합쳐준다.
4. 이후 bureau[['SK_ID_BUREAU', 'SK_ID_CURR']] 을 기준으로<br>
   left 조인을 해줘서 SK_ID_CURR을 추가해준다.
   
5. 그 다음 이렇게 나온 컬럼 값을 기준으로 numeric 데이터를<br>
   agg_numeric에넣어서 count, mean, min 등의 컬럼을 뽑아내준다.

이 과정들이 복잡하고 난해해 보일 수 있다.
사실 나도 무슨 말인지 이해가 되지 않는다.
그래서 원 노트 제작자도 다시 정리할 겸 진행한다.

## Putting the Functions Together

In [None]:
# Free up memory by deleting old objects
import gc
gc.enable()
del train, bureau, bureau_balance, bureau_agg, bureau_agg_new, bureau_balance_agg, bureau_balance_counts, bureau_by_loan, bureau_balance_by_client, bureau_counts
gc.collect()

In [None]:
# Read in new copies of all the dataframes
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

In [None]:
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_counts.head()

### Aggregated Stats of Bureau Dataframe

In [None]:
bureau_agg = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg.head()

### Value counts of Bureau Balance dataframe by loan

In [None]:
bureau_balance_counts = count_categorical(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_counts.head()

### Aggregated stats of Bureau Balance dataframe by loan

In [None]:
bureau_balance_agg = agg_numeric(bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg.head()

## Aggregated Stats of Bureau Balance by Client

In [None]:
# One row per loan: combine the numeric aggregates with the category counts.
# bureau_balance_counts carries SK_ID_BUREAU in its index, hence right_index = True.
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index = True, left_on = 'SK_ID_BUREAU', how = 'outer')

# Attach the per-loan statistics to the (loan id, client id) pairs from `bureau`.
# `bureau` is the left side, so every loan listed in `bureau` is kept and loans
# without monthly data get NaN statistics.
bureau_by_loan = bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(bureau_by_loan, on = 'SK_ID_BUREAU', how = 'left')

# Aggregate the per-loan statistics once more, this time per client
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'client')

## Insert Computed Features into Training Data

위의 데이터들을 하나씩 차례대로 train에 merge 하자.
SK_ID_CURR을 기준으로 합쳐주고 how='left'로 합쳐주자.

In [None]:
original_features = list(train.columns)
print('Original Number of Features: ', len(original_features))

In [None]:
# Merge with the value counts of bureau
train = train.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
train = train.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the monthly information grouped by client
train = train.merge(bureau_balance_by_client, on = 'SK_ID_CURR', how = 'left')

In [None]:
new_features = list(train.columns)
print('Number of features using previous loans from other institutions data: ', len(new_features))


## Feature Engineering Outcomes

### Missing Values

In [None]:

# Summarise the missing values of a dataframe, column by column
def missing_values_table(df):
    """Build a table of missing-value counts and percentages per column.

    Prints how many columns `df` has and how many of them contain missing
    values, then returns a dataframe indexed by column name with the columns
    'Missing Values' and '% of Total Values'. Only columns that have missing
    values are included, sorted by percentage in descending order and rounded
    to one decimal place.
    """
    # Absolute number and share (in percent) of missing entries per column
    n_missing = df.isnull().sum()
    pct_missing = 100 * df.isnull().sum() / len(df)

    # Put both series side by side and give the columns readable names
    summary = pd.concat([n_missing, pct_missing], axis=1)
    summary = summary.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})

    # Keep only the columns that actually have missing values
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]

    # Highest share of missing values first
    summary = summary.sort_values('% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

In [None]:
missing_train = missing_values_table(train)
missing_train.head(10)

결측값이 높은 feature가 많다.<br>
먼저 Train 데이터에서 결측값 비율이 90%를 넘는 feature를 찾아두자.

In [None]:
missing_train_vars = list(missing_train.index[missing_train['% of Total Values'] > 90])
len(missing_train_vars)

## Calculate Information for Testing Data


In [None]:

# Read in the test dataframe
test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# The same three merges that were applied to train, so both frames receive
# the same engineered columns.

# Merge with the value counts of bureau
test = test.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
test = test.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the monthly bureau_balance information aggregated by client
test = test.merge(bureau_balance_by_client, on = 'SK_ID_CURR', how = 'left')

In [None]:
print('Shape of Testing Data: ', test.shape)

Train, Test 데이터를 정렬하자.

In [None]:
# Keep the labels aside: TARGET does not exist in test, so the inner column
# alignment below drops it from train.
train_labels = train['TARGET']

# Align the dataframes on their columns (axis = 1), keeping only columns present in both
train, test = train.align(test, join = 'inner', axis = 1)

# Put the labels back on the aligned training frame
train['TARGET'] = train_labels

In [None]:
print('Training Data Shape: ', train.shape)
print('Testing Data Shape: ', test.shape)

이제 두 데이터 프레임은 TARGET 을 제외하고 동일한 열을 갖는다.<br>
이는 머신러닝 모델을 사용할 수 있음을 의미한다.

이제 Test 데이터에서 결측값을 살펴보고<br>
삭제할 feature가 있는지 확인해보자

In [None]:
missing_test = missing_values_table(test)
missing_test.head(10)

In [None]:
missing_test_vars = list(missing_test.index[missing_test['% of Total Values'] > 90])
len(missing_test_vars)

In [None]:
missing_columns = list(set(missing_test_vars + missing_train_vars))
print('There are %d columns with more than 90%% missing in either the training or testing data.' % len(missing_columns))

In [None]:
# Drop the missing columns
train = train.drop(columns = missing_columns)
test = test.drop(columns = missing_columns)

90% 이상의 결측값을 가진 feature가 없기 때문에, <br>
위의 과정에서 feature의 수를 줄이지 못했다.

이에 따라 다른 feature selection 방법을 활용하자.

In [None]:
# # 저장한번 하기
# train.to_csv('train_bureau_raw.csv', index = False)
# test.to_csv('test_bureau_raw.csv', index = False)

## Correlations

먼저 변수와 TARGET 간의 상관관계를 살펴보자.

In [None]:
# Pairwise correlations between the numeric columns. The frame still contains
# text columns, for which no correlation can be computed; newer pandas versions
# raise on them instead of skipping them, so they are excluded explicitly.
corrs = train.select_dtypes(include = ['number', 'bool']).corr()

# Order the rows by their correlation with the target, most positive first
corrs = corrs.sort_values('TARGET', ascending = False)

# Ten most positive correlations
pd.DataFrame(corrs['TARGET'].head(10))

In [None]:
# Ten most negative correlations
pd.DataFrame(corrs['TARGET'].dropna().tail(10))

목표와 가장 높은 상관 계수를 갖는 변수는 우리가 만들었던 변수다.
좀 더 자세히 파악하기 위해 아까 만들었던 kde plot 함수를 활용해보자

In [None]:
kde_target(var_name='client_bureau_balance_counts_mean', df=train)

In [None]:
kde_target(var_name='bureau_CREDIT_ACTIVE_Active_count_norm', df=train)

여기서 상관계수가 너무 높은 부분을 제거하자.
threshold 를 0.8로 잡고 그 이상이 되는 값들은 제거하자.

In [None]:
# Correlation above which two variables are considered redundant
threshold = 0.8

# Maps each column to the list of columns correlated with it above the threshold
above_threshold_vars = {}

# The comparison is on the signed value, so only positive correlations above
# the threshold are recorded. Each column's list also contains the column
# itself (self-correlation is 1).
for col in corrs:
    above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])

In [None]:

# Track columns to remove and columns already examined
cols_to_remove = []
# A set gives constant-time membership tests inside the nested loop
cols_seen = set()
cols_to_remove_pair = []

# Iterate through columns and correlated columns
for key, value in above_threshold_vars.items():
    # Keep track of columns already examined
    cols_seen.add(key)

    # The label is not a feature: a feature that correlates with TARGET must
    # not be dropped for that reason
    if key == 'TARGET':
        continue

    for x in value:
        # Skip the column itself (self-correlation is 1) and never drop the label
        if x == key or x == 'TARGET':
            continue

        # Only want to remove one in a pair: drop the partner that has not
        # been examined yet and remember which column it was paired with
        if x not in cols_seen:
            cols_to_remove.append(x)
            cols_to_remove_pair.append(key)

# De-duplicate; sorted so the result is the same on every run
cols_to_remove = sorted(set(cols_to_remove))
print('Number of columns to remove: ', len(cols_to_remove))

먼저 dict 의 형태로 key, value 값으로 corr이 높은 변수들을 넣어주자.
key는 컬럼명이고 value는 해당 컬럼과 corr 값이 0.8을 넘는 컬럼들의 목록이다.

여기서 같은 컬럼끼리는 상관계수가 1이므로 같은 컬럼 값이 들어간다.
이 값들은 빼주자

In [None]:
train_corrs_removed = train.drop(columns = cols_to_remove)
test_corrs_removed = test.drop(columns = cols_to_remove)

print('Training Corrs Removed Shape: ', train_corrs_removed.shape)
print('Testing Corrs Removed Shape: ', test_corrs_removed.shape)

In [None]:
train_corrs_removed.to_csv('train_bureau_corrs_removed.csv', index = False)
test_corrs_removed.to_csv('test_bureau_corrs_removed.csv', index = False)

## Modeling

In [None]:
import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import gc

import matplotlib.pyplot as plt

In [None]:
def model(features, test_features, encoding = 'ohe', n_folds = 5):
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. Must include SK_ID_CURR.
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding
        n_folds (int, default = 5):
            number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (average over the folds).
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model
            (average over the folds).
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.
        
    """
    
    # Ids of the test rows, needed for the submission
    test_ids = test_features['SK_ID_CURR']
    
    # Labels as a plain array: the fold indices produced by KFold are
    # positions, and indexing a Series with them would select by index label
    # instead, which is wrong for any dataframe without a default 0..n-1 index
    labels = np.array(features['TARGET'])
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers.
                # NOTE(review): the encoder is fitted on the training column only, so
                # transform presumably fails for a category that appears only in the
                # test data - confirm against the data before relying on 'le'.
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object. The folds are consecutive blocks (no shuffling),
    # so no random_state is passed: it has no effect without shuffling and
    # current scikit-learn rejects that combination.
    k_fold = KFold(n_splits = n_folds, shuffle = False)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model (named `clf` so it does not shadow this function)
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', 
                                 class_weight = 'balanced', learning_rate = 0.05, 
                                 reg_alpha = 0.1, reg_lambda = 0.1, 
                                 subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Early stopping after 100 rounds without improvement, logging every
        # 200 rounds. Newer LightGBM releases configure this through callbacks
        # and no longer accept the fit keywords; older releases without the
        # logging callback still get the keywords. The callbacks are created
        # per fold so no state is carried over between fits.
        if hasattr(lgb, 'log_evaluation'):
            fit_params = {'callbacks': [lgb.early_stopping(stopping_rounds = 100),
                                        lgb.log_evaluation(period = 200)]}
        else:
            fit_params = {'early_stopping_rounds': 100, 'verbose': 200}
        
        # Train the model
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                **fit_params)
        
        # Record the best iteration found by early stopping
        best_iteration = clf.best_iteration_
        
        # Accumulate the feature importances; dividing by the number of folds
        # makes the final value the average over the folds
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits
        
        # Accumulate the test predictions, again averaged over the folds
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions for the validation rows
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = clf.best_score_['valid']['auc']
        train_score = clf.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the per-fold lists: the out-of-fold AUC for
    # validation and the mean of the per-fold AUCs for training
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [None]:
def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better. 
    
    Args:
        df (dataframe): feature importances. Must have the feature names in a
        column called `feature` and the importances in a column called `importance`
        
    Returns:
        shows a horizontal bar chart of the 15 most important features
        
        df (dataframe): feature importances sorted by importance (highest to lowest) 
        with a column for normalized importance
        """
    
    # Rank the features, most important first
    ranked = df.sort_values('importance', ascending = False).reset_index()
    
    # Scale the importances so that they add up to one
    total_importance = ranked['importance'].sum()
    ranked['importance_normalized'] = ranked['importance'] / total_importance

    # The rows to draw, and their bar positions in reverse order so that the
    # most important feature is plotted on top
    top = ranked.head(15)
    positions = list(reversed(list(ranked.index[:15])))

    # Horizontal bar chart of the normalized importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(positions, 
            top['importance_normalized'], 
            align = 'center', edgecolor = 'k')
    
    # One tick per bar, labelled with the feature name
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    return ranked

## Control

먼저 테스트로 오리지널 데이터로 단순히 훈련을 하고 예측값을 뽑아내자

In [None]:
train_control = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
test_control = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

In [None]:
submission, fi, metrics = model(train_control, test_control)

In [None]:
metrics

In [None]:
fi_sorted = plot_feature_importances(fi)

In [None]:
submission.to_csv('control.csv', index = False)

제출을 해보니 0.745의 점수(ROC AUC)가 나왔다.

## Test One

이제 우리가 여태껏 만들었던 데이터셋을 이용해서 훈련을 해보자
먼저 아까 corr 관련해서 drop을 시키지 않은 데이터로 해보자

In [None]:
submission_raw, fi_raw, metrics_raw = model(train, test)

In [None]:
metrics_raw

In [None]:
fi_raw_sorted = plot_feature_importances(fi_raw)

In [None]:
# The 100 most important features of the model trained on the engineered data
top_100 = list(fi_raw_sorted['feature'])[:100]

# Names of the features of the control model, built once as a set so the
# membership test below does not rebuild a list for every element
control_features = set(fi['feature'])

# Features in the top 100 that the control model did not have
new_features = [x for x in top_100 if x not in control_features]

# Express the share as a real percentage so that the message stays correct
# when fewer than 100 features are available
pct_new = 100 * len(new_features) / len(top_100) if top_100 else 0.0
print('%% of Top 100 Features created from the bureau data = %0.2f' % pct_new)

In [None]:
submission_raw.to_csv('test_one.csv', index = False)

제출을 해보니 0.759의 점수(ROC AUC)가 나왔다.

## Test Two
corr 관련해서 drop 시킨 것으로 진행해보자

In [None]:
submission_corrs, fi_corrs, metrics_corr = model(train_corrs_removed, test_corrs_removed)

In [None]:
metrics_corr

In [None]:
fi_corrs_sorted = plot_feature_importances(fi_corrs)

In [None]:
submission_corrs.to_csv('test_two.csv', index = False)

제출을 해보니 0.753의 점수(ROC AUC)가 나왔다.