In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## 목표: 대출을 받은 고객이 상환능력이 있는지 없는지를 분류하는 예측 모델을 만드는 것

### Imports

#### We are using a typical data science stack: numpy, pandas, sklearn, matplotlib

In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

### Read in Data

In [None]:
# List files available
print(os.listdir("../input/"))

In [None]:
# Training data
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
# Testing data
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
print('Testing data shape: ', app_test.shape)
app_test.head()

In [None]:
## The test set is considerably smaller and lacks a Target column

### Exploratory Data Analysis

#### Examine the Distribution of the Target Column

In [None]:
app_train['TARGET'].value_counts()

In [None]:
app_train['TARGET'].astype(int).plot.hist();

In [None]:
### 이 정보에서 우리는 이것이 불균형 클래스 문제임을 알 수 있습니다.
### 갚지 않은 대출보다 제때 갚은 대출이 훨씬 더 많다. 
### 좀 더 정교한 기계 학습 모델에 들어가면 이러한 불균형을 반영하기 위해 데이터의 표현에 따라 클래스에 가중치를 부여할 수 있습니다.

In [None]:
# Function to calculate missing values by column

def missing_values_table(df):
    """Summarize the missing values of a dataframe, column by column.

    Prints a two-line summary (total number of columns and how many of them
    contain missing values) and returns the per-column details.

    Args:
        df (pd.DataFrame): dataframe to inspect.

    Returns:
        pd.DataFrame: one row per column of `df` that has at least one missing
        value, indexed by column name, with the columns 'Missing Values'
        (count) and '% of Total Values' (percentage of rows, rounded to one
        decimal), sorted by percentage in descending order.
    """
    # Total missing values per column (computed once and reused below)
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results; the unnamed series become columns 0 and 1
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually have missing values (filtering on the count
    # also gives an empty table for a dataframe with no rows), then sort by
    # percentage of missing values, descending
    has_missing = mis_val_table_ren_columns['Missing Values'] != 0
    mis_val_table_ren_columns = mis_val_table_ren_columns[has_missing].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information (spaces around the numbers keep it readable)
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns
    

In [None]:
# Missing values statistics
missing_values = missing_values_table(app_train)
missing_values.head(20)

In [None]:
### 기계 학습 모델을 구축할 때가 되면 이러한 결측값을 채워야 합니다
### 이후 작업에서 우리는 대치할 필요 없이 결측값을 처리할 수 있는 XGBoost와 같은 모델을 사용할 것입니다. 
### 또 다른 옵션은 누락된 값의 비율이 높은 열을 삭제하는 것이지만 이러한 열이 우리 모델에 도움이 될지 미리 알 수는 없습니다. 따라서 지금은 모든 열을 유지합니다.

### Column Types

In [None]:
## Number of each type of column
app_train.dtypes.value_counts()

In [None]:
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)

In [None]:
### Encoding Categorical Variables

# 1.Label encoding: 범주형 변수의 각 고유 범주를 정수로 할당합니다. 새 열이 생성되지 않습니다.

# 2. One-hot encoding: 범주형 변수의 각 고유 범주에 대해 새 열을 만듭니다. 각 관찰은 해당 범주에 대해 열에 1을 받고 다른 모든 새 열에 0을 받습니다.

## 레이블 인코딩의 문제는 범주에 임의의 순서를 부여한다는 것입니다. 
## 각 범주에 할당된 값은 무작위이며 범주의 고유한 측면을 반영하지 않습니다.
## 따라서 레이블 인코딩을 수행할 때 모델은 기능의 상대 값(예: 프로그래머 = 4 및 데이터 과학자 = 1)을 사용하여 우리가 원하는 것이 아닌 가중치를 할당할 수 있습니다. 
## 범주형 변수(예: 남성/여성)에 대해 고유한 값이 두 개뿐인 경우 레이블 인코딩은 괜찮지만 고유 범주가 2개 이상인 경우 원 핫 인코딩이 안전한 옵션입니다.
## 클래스가 많은 범주형 변수의 경우 원-핫 인코딩이 범주에 임의의 값을 부과하지 않기 때문에 가장 안전한 접근 방식이라고 생각합니다. 
## 원-핫 인코딩의 유일한 단점은 기능의 수(데이터 차원)가 많은 범주의 범주형 변수로 폭발할 수 있다는 것입니다. 
## 이를 처리하기 위해 원-핫 인코딩을 수행한 후 PCA 또는 기타 차원 축소 방법을 수행하여 차원 수를 줄일 수 있습니다.

In [None]:
## Label Encoding and One-Hot Encoding

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # Only label-encode columns with 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Fit on the training data only
            le.fit(app_train[col])
            # Apply the same mapping to both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])

            # Keep track of how many columns were label encoded
            le_count += 1

# Report the total once, after every column has been examined
print('%d columns were label encoded.' % le_count)

In [None]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Feature shape: ', app_test.shape)

In [None]:
# Aligning Training and Testing Data

train_labels = app_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join='inner', axis=1)
## 두 객체 둘 다 있는 인덱스를 교차

# Add the target back in
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Feature shape: ', app_test.shape)

### Back to Exploratory Data Analysis

In [None]:
# DAYS_BIRTH 열의 숫자는 현재 대출 신청을 기준으로 기록되기 때문에 음수입니다. 이러한 통계를 연도 단위로 보려면 -1을 곱하고 1년의 일수로 나눌 수 있습니다.
# 이상치는 없는 것 같습니다.

(app_train['DAYS_BIRTH']/-365).describe()

In [None]:
app_train['DAYS_EMPLOYED'].describe()

In [None]:
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
anom = app_train[app_train['DAYS_EMPLOYED']==365243]
non_anom = app_train[app_train['DAYS_EMPLOYED']!=365243]
print('The non-anomalies default on %0.2f%% of loans' % (100*non_anom['TARGET'].mean()))
print('The anomalies default on %0.2f%% of loans' % (100*anom['TARGET'].mean()))
print('There are %d anomalous days of employment'% len(anom))

In [None]:
### 예외 처리는 정해진 규칙 없이 정확한 상황에 따라 다릅니다. 
### 가장 안전한 접근 방식 중 하나는 예외를 결측값으로 설정한 다음 기계 학습 전에 (대치를 사용하여) 채우는 것입니다. 
### 이 경우 모든 변칙의 값이 정확히 같으므로 이러한 모든 대출이 공통점을 공유할 경우를 대비하여 동일한 값으로 채우고자 합니다. 
### 비정상적인 값은 어느 정도 중요한 것 같으므로 실제로 이 값을 채웠는지 머신 러닝 모델에 알리고 싶습니다. 
### 해결책으로 비정상적인 값을 '숫자가 아님'(np.nan)으로 채운 다음, 값이 비정상적이었는지 여부를 나타내는 새 부울 열을 만듭니다.

In [None]:
# Create an anomalous flag column (computed before the sentinel value is replaced)
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243

# Replace the anomalous values with nan. The result is assigned back to the frame:
# calling replace(inplace=True) on a column selection is chained assignment and
# does not reliably update the dataframe itself.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

app_train['DAYS_EMPLOYED'].plot.hist(title='Days Employment Histogram');
plt.xlabel('Days Employment');

In [None]:
## 훈련 데이터에 대해 수행하는 모든 작업은 테스트 데이터에도 수행해야 합니다. 
## 테스트 데이터에서 새 열을 만들고 기존 열을 np.nan으로 채우도록 합시다.

In [None]:
# Flag the anomalous rows in the test data (before the sentinel value is replaced)
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED']==365243

# Replace the sentinel with nan and assign the result back; replace(inplace=True)
# on a column selection is chained assignment and may not update the dataframe.
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

print('There are %d anomalies in the test data out of %d entries' % (app_test['DAYS_EMPLOYED_ANOM'].sum(), len(app_test)))

In [None]:
# Find correlations with the target and sort
correlations = app_train.corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

In [None]:
### 설명서를 보면 DAYS_BIRTH는 대출 당시 클라이언트의 나이로 음수일(이유가 무엇이든!)입니다. 
### 상관 관계는 양수이지만 이 기능의 값은 실제로 음수입니다. 
### 즉, 고객이 나이가 들수록 대출 불이행 가능성이 줄어듭니다(즉, 목표 == 0). 
### 이것은 약간 혼란스럽기 때문에 특성의 절대값을 취하면 상관 관계가 음수가 됩니다.

## Effect of Age on Repayment

In [None]:
# Find the correlation of the positive days since birth and target.
# The same sign flip is applied to the test data so that DAYS_BIRTH (and every
# feature later derived from it) has the same meaning in the frames used for
# training and for prediction.
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
# Set the style of plots
plt.style.use('fivethirtyeight')

# Plot the distribution of ages in years
plt.hist(app_train['DAYS_BIRTH']/365, edgecolor = 'k', bins=25)
plt.title('Age of Client'); plt.xlabel('Age (years)'); plt.ylabel('Count');

In [None]:
plt.figure(figsize = (10,8))

# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET']==0, 'DAYS_BIRTH']/365, label='target==0')

# KDE plot of loans which were not repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET']==1, 'DAYS_BIRTH']/365, label='target==1')

# Labeling of plot
plt.xlabel('Age(years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');

In [None]:
# Age information into a separate dataframe (explicit copy, so the new columns
# below are added to an independent frame rather than to a slice of app_train)
age_data = app_train[['TARGET','DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365

# Bin the age data into 10 equal-width bins between 20 and 70 years
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins=np.linspace(20,70,num=11))
age_data.head(10)

In [None]:
# Group by the bin and calculate averages
age_groups = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
plt.figure(figsize=(8,8))

# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100*age_groups['TARGET'])

# plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay(%)')
plt.title('Failure to Repay by Age Group');

In [None]:
### 분명한 추세가 있습니다.
### 젊은 지원자는 대출을 상환하지 않을 가능성이 더 높습니다
### 연체율은 가장 젊은 세 연령대에서 10% 이상이고, 가장 나이가 많은 연령대에서는 5% 미만입니다.

In [None]:
# Extract the EXT_SOURCE variables and show correlations
ext_data = app_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2',
                     'EXT_SOURCE_3','DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize=(8,6))

# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax=0.6)
plt.title('Correlation Heatmap');

In [None]:
plt.figure(figsize = (10,12))

# iterate through the sources
for i,source in enumerate(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']):

    # create a new subplot for each source
    plt.subplot(3,1,i+1)

    # plot repaid loans
    sns.kdeplot(app_train.loc[app_train['TARGET']==0, source], label='target==0')

    # plot loans that were not repaid
    sns.kdeplot(app_train.loc[app_train['TARGET']==1, source],label='target==1')

    # Label the plots
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source); plt.ylabel('Density');

# Adjust the spacing once, after all subplots exist
plt.tight_layout(h_pad = 2.5)

In [None]:
## Pairs Plot은 단일 변수의 분포뿐만 아니라 여러 쌍의 변수 간의 관계를 볼 수 있게 해줌

In [None]:
# Copy the data for plotting
plot_data = ext_data.drop(columns = ['DAYS_BIRTH']).copy()

# Add in the age of the client in years
plot_data['YEARS_BIRTH'] = age_data['YEARS_BIRTH']

# Drop na values and limit to first 100000 rows
plot_data = plot_data.dropna().loc[:100000,:]

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    """Write the correlation coefficient of x and y onto the current axes.

    Args:
        x, y: the two sequences passed to np.corrcoef.
        **kwargs: accepted but not used.
    """
    coefficient = np.corrcoef(x, y)[0, 1]
    label = "r={:.2f}".format(coefficient)

    # Place the text in axes-relative coordinates (upper-left area of the panel)
    axes = plt.gca()
    axes.annotate(label, xy=(.2, .8), xycoords=axes.transAxes, size = 20)
    
# Create the pairgrid object
# (`height` is the per-facet size; it replaces the older `size` keyword)
grid = sns.PairGrid(data=plot_data, height=3, diag_sharey=False,
                   hue = 'TARGET',
                   vars = [x for x in list(plot_data.columns)
                          if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a density plot of each single variable
grid.map_diag(sns.kdeplot)

# Bottom is density plot
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r);

plt.suptitle('Ext Source and Age Features Pairs Plot', size=32, y=1.05);

### Feature Engineering

In [None]:
### 피쳐 엔지니어링은 일반적인 프로세스를 말하며 피쳐 구성(기존 데이터에서 새 피쳐 추가)과 피쳐 선택(가장 중요한 피쳐만 선택하거나 다른 차원 축소 방법)을 모두 포함할 수 있습니다. 
### 피처를 생성하고 피처를 선택하는 데 사용할 수 있는 많은 기술이 있습니다.

### Polynomial Features

In [None]:
## 다항 회귀란, 데이터들 간의 형태가 비선형일 때 데이터에 각 특성의 제곱을 추가해 주어서 특성이 추가된 비선형 데이터를 선형 회귀 모델로 훈련시키는 방법

## Scikit-Learn에는 지정된 정도까지 다항식과 상호 작용 항을 생성하는 PolynomialFeatures라는 유용한 클래스가 있습니다. 
## 결과를 보기 위해 차수 3을 사용할 수 있습니다(다항식 특징을 생성할 때는 특징의 수가 차수에 따라 기하급수적으로 늘어나고 과적합 문제가 발생할 수 있기 때문에 너무 높은 차수는 피하는 것이 좋습니다).

In [None]:
# Make a new dataframe for polynomial features
poly_features = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH','TARGET']]
poly_features_test = app_test[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]

# imputer for handling missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')

poly_target = poly_features['TARGET']

poly_features = poly_features.drop(columns=['TARGET'])

# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures

# Create the polynomail object with specified degree
poly_transformer = PolynomialFeatures(degree=3)

In [None]:
# Train the polynomial features
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial Features shape: ', poly_features.shape)

In [None]:
## 다항식 기능

# Newer scikit-learn releases expose get_feature_names_out instead of
# get_feature_names; use whichever one this installation provides.
poly_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names

list(poly_names_fn(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']))[:15]

In [None]:
# Newer scikit-learn releases expose get_feature_names_out instead of
# get_feature_names; use whichever one this installation provides.
poly_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names

# Create a dataframe of the features
poly_features = pd.DataFrame(poly_features,
                            columns = list(poly_names_fn(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))

In [None]:
# Newer scikit-learn releases expose get_feature_names_out instead of
# get_feature_names; use whichever one this installation provides.
poly_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names

# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test, columns=list(poly_names_fn(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])))

# Merge polynomial features into training dataframe
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
app_train_poly = app_train.merge(poly_features, on = 'SK_ID_CURR', how='left')

# Merge polynomial features into testing dataframe
poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
app_test_poly = app_test.merge(poly_features_test, on = 'SK_ID_CURR', how='left')

# Align the dataframes, keeping only the columns present in both
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join='inner', axis=1)

# Print out the new shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape: ', app_test_poly.shape)

In [None]:
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT']/app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY']/app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY']/app_train_domain['AMT_CREDIT']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED']/app_train_domain['DAYS_BIRTH']

In [None]:
app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['CREDIT_TERM'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

In [None]:
plt.figure(figsize = (12,20))

# iterate through the new features
for i, feature in enumerate(['CREDIT_INCOME_PERCENT','ANNUITY_INCOME_PERCENT','CREDIT_TERM','DAYS_EMPLOYED_PERCENT']):
    
    # create a new subplot for each source
    plt.subplot(4, 1, i+1)
    
    # plot repaid loans
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET']==0, feature], label='target==0')
    
    # plot loans that were not repaid
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET']==1, feature], label='target==1')
    
    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature); plt.ylabel('Density');
    
plt.tight_layout(h_pad = 2.5)

### Logistic Regression Implementation

In [None]:
### 기준선을 얻기 위해 범주형 변수를 인코딩한 후 모든 기능을 사용합니다. 
### 누락된 값을 채우고(대치) 기능의 범위를 정규화(기능 스케일링)하여 데이터를 전처리합니다. 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Drop the target from the training data
if 'TARGET' in app_train:
    train = app_train.drop(columns=['TARGET'])
else:
    train = app_train.copy()

# Feature names
features = list(train.columns)

# Copy of the testing data
test = app_test.copy()

# Both frames are converted to plain arrays below, after which columns are
# matched purely by position, so make sure they line up first.
assert features == list(test.columns), 'train and test columns must match in name and order'

# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0,1))

# Fit on the training data
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

In [None]:
# 첫 번째 모델에 대해 Scikit-Learn의 LogisticRegression을 사용합니다. 
# 기본 모델 설정에서 변경할 수 있는 유일한 변경 사항은 과적합의 양을 제어하는 정규화 매개변수 C를 낮추는 것입니다(낮은 값은 과적합을 줄여야 함). 
# 이렇게 하면 기본 LogisticRegression보다 약간 더 나은 결과를 얻을 수 있지만 향후 모델에 대해서는 여전히 낮은 기준을 설정합니다.

In [None]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(C = 0.0001)

# Train on the training data
log_reg.fit(train, train_labels)

In [None]:
## 첫번째 열은 대상이 0일 확률이고 두번째 열은 대상이 1일 확률
## 단일 행의 경우, 두 열의 합이 1이 되어야 함
## 우리는 대출이 상환되지 않을 확률을 원하므로 두 번째 열을 선택

In [None]:
# Make predictions
# Make sure to select the second column only
log_reg_pred = log_reg.predict_proba(test)[:,1]

In [None]:
# Submission dataframe (explicit copy, so TARGET is added to an independent
# frame rather than to a slice of app_test)
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.head()

In [None]:
# Save the submission to a csv file
submit.to_csv('log_reg_baseline.csv', index=False)

### Improved Model: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose=1, n_jobs=-1)

In [None]:
# Train on the training data
random_forest.fit(train, train_labels)

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature':features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict_proba(test)[:, 1]

In [None]:
# Make a submission dataframe (explicit copy, so TARGET is added to an
# independent frame rather than to a slice of app_test)
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline.csv', index = False)

### Make Predictions using Engineered Features

In [None]:
poly_features_names = list(app_train_poly.columns)

# Impute the polynomial features
imputer = SimpleImputer(strategy = 'median')

poly_features = imputer.fit_transform(app_train_poly)
poly_features_test = imputer.transform(app_test_poly)

# Scale the polynomial features
scaler = MinMaxScaler(feature_range = (0,1))

poly_features = scaler.fit_transform(poly_features)
poly_features_test = scaler.transform(poly_features_test)

random_forest_poly = RandomForestClassifier(n_estimators=100,
                                           random_state = 50, verbose = 1, n_jobs = -1)

In [None]:
# Train on the training data
random_forest_poly.fit(poly_features, train_labels)

# Make predictions on the test data
predictions = random_forest_poly.predict_proba(poly_features_test)[:,1]

In [None]:
# Make a submission dataframe (explicit copy, so TARGET is added to an
# independent frame rather than to a slice of app_test)
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline_engineered.csv', index = False)

In [None]:
### Testing Domain Features

app_train_domain = app_train_domain.drop(columns = 'TARGET')

domain_features_names = list(app_train_domain.columns)

# Impute the domain features
imputer = SimpleImputer(strategy = 'median')

domain_features = imputer.fit_transform(app_train_domain)
domain_features_test = imputer.transform(app_test_domain)

# Scale the domain features
scaler = MinMaxScaler(feature_range = (0,1))

domain_features = scaler.fit_transform(domain_features)
domain_features_test = scaler.transform(domain_features_test)

random_forest_domain = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose=1, n_jobs = -1)

# Train on the training data
random_forest_domain.fit(domain_features, train_labels)

# Extract feature importances
feature_importance_values_domain = random_forest_domain.feature_importances_
feature_importance_domain = pd.DataFrame({'feature':domain_features_names,'importance':feature_importance_values_domain})

# Make predictions on the test data
predictions = random_forest_domain.predict_proba(domain_features_test)[:, 1]

In [None]:
# Make a submission dataframe (explicit copy, so TARGET is added to an
# independent frame rather than to a slice of app_test)
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline_domain.csv', index=False)

In [None]:
def plot_feature_importances(df):
    """
    Plot the 15 largest feature importances as a horizontal bar chart.

    Works with any importance measure where a higher value means a more
    important feature.

    Args:
        df (dataframe): feature importances. Must have the feature names in a
        column called `feature` and the importances in a column called `importance`

    Returns:
        df (dataframe): feature importances sorted by importance (highest to lowest),
        with the previous index kept as an `index` column and an added
        `importance_normalized` column (importance divided by the total importance).
        The plot itself is shown as a side effect.
    """

    # Rank from most to least important; reset_index renumbers the rows 0..n-1
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the importances so they add up to one
    total_importance = df['importance'].sum()
    df['importance_normalized'] = df['importance'] / total_importance

    # The 15 top-ranked rows; reversing their positions puts the most important bar on top
    top = df.head(15)
    bar_positions = list(reversed(list(top.index)))

    # Horizontal bar chart of the normalized importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(bar_positions, top['importance_normalized'],
            align = 'center', edgecolor = 'k')

    # One tick per bar, labelled with the feature name
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top['feature'])

    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()

    return df

In [None]:
# Show the feature importances for the default features
feature_importances_sorted = plot_feature_importances(feature_importances)

### Conclusions

#### We followed the general outline of a machine learning project:

#### 1. Understand the problem and the data
#### 2. Data cleaning and formatting (this was mostly done for us)
#### 3. Exploratory Data Analysis
#### 4. Baseline model
#### 5. Improved model
#### 6. Model interpretation (just a little)

### Just for Fun: Light Gradient Boosting Machine

In [None]:
### LightGBM은 현재 구조화된 데이터 세트(특히 Kaggle에서) 학습을 위한 최고의 모델

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

def model(features, test_features, encoding = 'ohe', n_folds = 5):

    """Train and test a light gradient boosting model using
    cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include the SK_ID_CURR column.
        encoding (str, default = 'ohe'):
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
        n_folds (int, default = 5):
            number of folds to use for cross validation

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (average over the folds).
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model (average over the folds).
        valid_metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.

    """

    # Local import: only needed to detect which early-stopping API LightGBM offers
    import inspect

    # Extract the test ids for the submission
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels for training as a plain array, so that the positional
    # fold indices produced by KFold select rows by position whatever index
    # the incoming dataframe carries
    labels = np.array(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])


    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)

        # No categorical indices to record
        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':

        # Create a label encoder
        label_encoder = LabelEncoder()

        # List for storing categorical indices
        cat_indices = []

        # Iterate through each column
        for i, col in enumerate(features):
            # Only object-typed columns are encoded
            if features[col].dtype == 'object':

                # Map the categorical features to integers; the encoder is fit on
                # the training column and reused for the matching test column
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)

    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Accumulator for the fold-averaged feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Accumulator for the fold-averaged test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions (each row is filled by exactly one fold)
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Older LightGBM releases take early stopping / logging settings as fit()
    # keyword arguments; newer ones only accept the equivalent callbacks.
    fit_parameters = inspect.signature(lgb.LGBMClassifier.fit).parameters
    legacy_fit_api = 'early_stopping_rounds' in fit_parameters

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model (named `clf` so it does not shadow this function's own name)
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)

        # Stop after 100 rounds without improvement and log every 200 rounds
        if legacy_fit_api:
            stopping_kwargs = {'early_stopping_rounds': 100, 'verbose': 200}
        else:
            stopping_kwargs = {'callbacks': [lgb.early_stopping(stopping_rounds = 100),
                                             lgb.log_evaluation(period = 200)]}

        # Train the model
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                **stopping_kwargs)

        # Record the best iteration found for this fold
        best_iteration = clf.best_iteration_

        # Record the feature importances; dividing by the number of folds makes
        # the accumulated total an average over the folds
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions on the full test set; again averaged over the folds
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions for this fold's validation rows
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_score = clf.best_score_['valid']['auc']
        train_score = clf.best_score_['train']['auc']

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Clean up memory before the next fold
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe from the averaged test predictions
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [None]:
## Out of Fold(OOF) 방법으로 모델 평가
### 모델의 성능을 평가하는 방법으로서, 실무보다는 Kaggle, Dacon과 같은 예측 알고리즘 대회에서 자주 사용되는 방식
### K-fold를 이용한 것이 OOF라고 할 수 있으며, OOF안에는 K-fold가 속한다고 볼 수 있다

### 참고 링크: https://techblog-history-younghunjo1.tistory.com/142

In [None]:
submission, fi, metrics = model(app_train, app_test)
print('Baseline metrics')
print(metrics)

In [None]:
fi_sorted = plot_feature_importances(fi)

In [None]:
submission.to_csv('baseline_lgb.csv', index=False)

In [None]:
app_train_domain['TARGET'] = train_labels

# Test the domain knowledge features
submission_domain, fi_domain, metrics_domain = model(app_train_domain, app_test_domain)

print('Baseline with domain knowledge features metrics')
print(metrics_domain)

In [None]:
fi_sorted = plot_feature_importances(fi_domain)

In [None]:
submission_domain.to_csv('baseline_lgb_domain_features.csv', index=False)