In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
from sklearn import preprocessing, metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Notebook-wide display configuration.
# Use the canonical option names: pandas resolves abbreviated keys by pattern
# matching, which raises if the pattern ever matches more than one option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Echo every top-level expression of a cell, not only the last one. Many cells
# below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

# データ読み込み

In [None]:
# Directory that holds the competition CSV files.
input_path = '../input/home-credit-default-risk/'


def _read_table(table_name):
    """Read ``<input_path><table_name>.csv`` into a DataFrame."""
    return pd.read_csv(input_path + table_name + '.csv')


# One DataFrame per source file, loaded in the same order as the file list.
application_train = _read_table('application_train')
application_test = _read_table('application_test')
bureau = _read_table('bureau')
bureau_balance = _read_table('bureau_balance')
previous_application = _read_table('previous_application')
POS_CASH_balance = _read_table('POS_CASH_balance')
installments_payments = _read_table('installments_payments')
credit_card_balance = _read_table('credit_card_balance')

# データの確認

In [None]:
# Report the (rows, columns) shape of every loaded table, one line per table.
# A generator inside a single print() is used so that no loop variable keeps a
# reference to one of the large frames after the cell finishes.
print(
    *(
        f'Size of {table_name} data {table.shape}'
        for table_name, table in (
            ('application_train', application_train),
            ('application_test', application_test),
            ('bureau', bureau),
            ('bureau_balance', bureau_balance),
            ('previous_application', previous_application),
            ('POS_CASH_balance', POS_CASH_balance),
            ('installments_payments', installments_payments),
            ('credit_card_balance', credit_card_balance),
        )
    ),
    sep='\n',
)

In [None]:
# First look at every raw table: shape, column names, dtypes/non-null counts
# and the first five rows. The same four steps were previously pasted once per
# table; a single loop keeps them in sync.
for label, frame in (
    ("train data", application_train),
    ("test data", application_test),
    ("bureau data", bureau),
    ("bureau_balance data", bureau_balance),
    ("previous_application data", previous_application),
    ("POS_CASH_balance data", POS_CASH_balance),
    ("installments_payment data", installments_payments),
    ("credit_card_balance data", credit_card_balance),
):
    print(f"\n======= {label} =======\n")
    # Expressions inside a loop are not auto-echoed, so display() is explicit.
    display(frame.shape, frame.columns)
    frame.info()
    display(frame.head(5))

# Drop the loop variables so they do not keep the last (large) frame alive.
del label, frame

In [None]:
# Summary statistics (transposed: one row per column) for every raw table.
# Replaces eight pasted copies of the same two statements with one loop.
for label, frame in (
    ("train data", application_train),
    ("test data", application_test),
    ("bureau data", bureau),
    ("bureau_balance data", bureau_balance),
    ("previous_application data", previous_application),
    ("POS_CASH_balance data", POS_CASH_balance),
    ("installments_payment data", installments_payments),
    ("credit_card_balance data", credit_card_balance),
):
    print(f"\n======= {label} =======\n")
    # Expressions inside a loop are not auto-echoed, so display() is explicit.
    display(frame.describe().T)

# Drop the loop variables so they do not keep the last (large) frame alive.
del label, frame

# ランダムサンプリング
ランダムサンプリングを行った際にサンプル内のデフォルトしたか否かの割合が変わらないことを確認したい

In [None]:
# Class balance of train BEFORE sampling.
# The count alone cannot be compared with the post-sampling count (the sample
# is ten times smaller), so the share of TARGET == 1 rows is printed as well.
positive_mask = application_train['TARGET'] == 1
print(positive_mask.sum())
print(f"TARGET=1 ratio: {positive_mask.mean():.4f}")

In [None]:
# Shrink train to a stratified 10% sample so the TARGET balance is preserved.
# NOTE: this rebinds application_train, so re-running the cell samples the
# sample again. Re-run the loading cell first if you need to repeat it.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

# train_test_split returns [train_part, test_part]; the "test" part is the 10%
# sample. Indexing avoids assigning the unused part to `_`, which IPython uses
# for the previous cell output.
application_train = train_test_split(
    application_train,
    test_size=SAMPLE_FRACTION,
    stratify=application_train['TARGET'],
    random_state=SAMPLE_SEED,
)[1]

# Class balance AFTER sampling: the ratio should match the pre-sampling ratio,
# while the raw count is expected to be roughly SAMPLE_FRACTION of the original.
sampled_positive_mask = application_train['TARGET'] == 1
print(sampled_positive_mask.sum())
print(f"TARGET=1 ratio after sampling: {sampled_positive_mask.mean():.4f}")

In [None]:
# Inspect the train data after random sampling: structure and summary statistics.
# Each bare expression below is echoed because the notebook sets
# ast_node_interactivity = "all".
## Structure: shape, column names, dtypes/non-null counts, first five rows
print("\n======= train data =======\n")
application_train.shape
application_train.columns
application_train.info()
application_train.head(5)

## Summary statistics (transposed: one row per column)
print("\n======= train describe =======\n")
application_train.describe().T

# 学習データとテストデータを結合

In [None]:
# Stack train and test vertically so that feature engineering is applied to
# both at once. ignore_index=True gives the combined frame a fresh 0..n-1
# index; without it the two inputs' own index labels are carried over and can
# repeat across the two parts.
all_df = pd.concat([application_train, application_test], ignore_index=True)

# Inspect the combined frame (bare expressions are echoed by the notebook).
print("\n======= all data =======\n")
all_df.shape
all_df.columns
all_df.info()
all_df.head(5)

## Summary statistics (transposed: one row per column)
print("\n======= all describe =======\n")
all_df.describe().T

In [None]:
# Remember which SK_ID_CURR values belong to train and which to test, as plain
# Python lists, for later use.
train_list, test_list = (
    source['SK_ID_CURR'].tolist()
    for source in (application_train, application_test)
)

# The separate train/test frames are no longer needed; release their memory.
del application_train, application_test

# 欠損状況の確認と処理

In [None]:
# Helper for missing-value handling
def missing_handle(df):
    """Fill missing values of ``df`` in place and return ``None``.

    Object-dtype columns get the string ``"None"``; every other column gets
    the numeric sentinel ``-9999``. A column named ``TARGET`` is never touched,
    so a frame that carries the label keeps its missing labels as NaN.

    Nothing is returned on purpose: the notebook echoes every top-level
    expression, so a returned frame would be rendered in full at each call.
    """
    for col in df.columns:
        # Leave the label column exactly as it is.
        if col == 'TARGET':
            continue
        filler = "None" if df[col].dtype == "O" else -9999
        df[col] = df[col].fillna(filler)

In [None]:
# Missing values in the combined train/test frame
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
all_df.isna().sum()

## Missing share per column
print("\n======= 欠損割合確認 =======\n")
all_df.isna().sum().div(len(all_df))

## Fill in place: "None" for object columns, -9999 for the rest
## (missing_handle leaves TARGET untouched)
missing_handle(all_df)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
all_df.isna().sum()

In [None]:
# Missing values in credit_card_balance
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
credit_card_balance.isna().sum()

## Missing share per column
print("\n======= 欠損割合確認 =======\n")
credit_card_balance.isna().sum().div(len(credit_card_balance))

## Fill in place: "None" for object columns, -9999 for the rest
missing_handle(credit_card_balance)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
credit_card_balance.isna().sum()

In [None]:
# Missing values in POS_CASH_balance
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
POS_CASH_balance.isna().sum()

## Missing share per column
print("\n======= 欠損割合確認 =======\n")
POS_CASH_balance.isna().sum().div(len(POS_CASH_balance))

## Fill in place: "None" for object columns, -9999 for the rest
missing_handle(POS_CASH_balance)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
POS_CASH_balance.isna().sum()

In [None]:
# Missing values in previous_application
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
previous_application.isna().sum()

## Missing share per column
print("\n======= 欠損割合確認 =======\n")
previous_application.isna().sum().div(len(previous_application))

## Fill in place: "None" for object columns, -9999 for the rest
missing_handle(previous_application)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
previous_application.isna().sum()

In [None]:
# Missing values in bureau
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
bureau.isna().sum()

## Missing share per column
print("\n======= 欠損割合確認 =======\n")
bureau.isna().sum().div(len(bureau))

## Fill in place: "None" for object columns, -9999 for the rest
missing_handle(bureau)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
bureau.isna().sum()

# 特徴量作成

## 特徴量の作成

In [None]:
# Value that missing_handle() writes into missing numeric cells.
MISSING_SENTINEL = -9999


def _ratio_or_zero(numerator, denominator, missing=MISSING_SENTINEL):
    """Return ``numerator / denominator`` element-wise with two safeguards.

    * Where ``denominator`` is not positive (this includes the missing
      sentinel) the ratio is 0, as in the original feature definition.
    * Where ``denominator`` is positive but ``numerator`` equals ``missing``,
      the ratio is NaN. The inputs were already filled with the sentinel, so
      dividing it would produce a small negative number that looks like a real
      rate. NaN is skipped by the later min/max/mean aggregation.
      NOTE(review): this assumes -9999 never occurs as a genuine amount.
    """
    has_total = denominator > 0
    ratio = numerator.where(has_total, 0) / denominator.where(has_total, 1)
    return ratio.mask(has_total & (numerator == missing))


# Ratios on other institutions' past loans (bureau), relative to AMT_CREDIT_SUM:
#   REPAYMENT_RATE = AMT_CREDIT_SUM_DEBT    / AMT_CREDIT_SUM
#   OVERDUE_RATE   = AMT_CREDIT_SUM_OVERDUE / AMT_CREDIT_SUM
# NOTE(review): OVERDUE_RATE is not in the bureau column list selected in the
# next cell, so it is currently discarded there. Confirm whether that is intended.
bureau['REPAYMENT_RATE'] = _ratio_or_zero(bureau['AMT_CREDIT_SUM_DEBT'], bureau['AMT_CREDIT_SUM'])
bureau['OVERDUE_RATE'] = _ratio_or_zero(bureau['AMT_CREDIT_SUM_OVERDUE'], bureau['AMT_CREDIT_SUM'])

# Credit-card ratio: AMT_RECIVABLE / AMT_TOTAL_RECEIVABLE
# ("AMT_RECIVABLE" is the column name this cell has always read; do not "fix" it.)
credit_card_balance['CREDIT_REPAYMENT_RATE'] = _ratio_or_zero(
    credit_card_balance['AMT_RECIVABLE'],
    credit_card_balance['AMT_TOTAL_RECEIVABLE'],
)

In [None]:
# Keep only the columns each table contributes to the model.
## Column lists, one per source table
### application_{train/test}.csv
application_list = [
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "NAME_TYPE_SUITE",
    "AMT_GOODS_PRICE",
]

### credit_card_balance.csv
credit_balance_list = [
    "AMT_CREDIT_LIMIT_ACTUAL",
    # engineered feature added in the previous cell
    "CREDIT_REPAYMENT_RATE",
    "AMT_TOTAL_RECEIVABLE",
]

### POS_CASH_balance.csv
pos_balance_list = [
    "SK_DPD",
]

### previous_application.csv
previous_application_list = [
    "AMT_CREDIT",
    "RATE_INTEREST_PRIMARY",
    "RATE_INTEREST_PRIVILEGED",
    "AMT_ANNUITY",
    "NAME_CONTRACT_STATUS",
    "NAME_CLIENT_TYPE",
    "NAME_TYPE_SUITE",
]

### bureau.csv
bureau_list = [
    "AMT_CREDIT_SUM",
    "AMT_ANNUITY",
    "CREDIT_ACTIVE",
    "REPAYMENT_RATE",
    "CREDIT_DAY_OVERDUE",
    "CNT_CREDIT_PROLONG",
]

## Column selection
### Every table keeps its join key SK_ID_CURR; all_df also keeps the label.
all_df = all_df.loc[:, ['SK_ID_CURR', 'TARGET', *application_list]]

### bureau and previous_application both carry an AMT_ANNUITY column, as does
### all_df, so theirs is renamed to stay distinguishable.
bureau = (
    bureau
    .loc[:, ['SK_ID_CURR', *bureau_list]]
    .rename(columns={'AMT_ANNUITY': 'AMT_ANNUITY_BUREAU'})
)
previous_application = (
    previous_application
    .loc[:, ['SK_ID_CURR', *previous_application_list]]
    .rename(columns={'AMT_ANNUITY': 'AMT_ANNUITY_PREVIOUS'})
)

POS_CASH_balance = POS_CASH_balance.loc[:, ['SK_ID_CURR', *pos_balance_list]]
credit_card_balance = credit_card_balance.loc[:, ['SK_ID_CURR', *credit_balance_list]]

## カテゴリ変数をダミー変数化

In [None]:
# One-hot encode the categorical columns.
# The dummy dtype is pinned to uint8. Newer pandas defaults to bool dummies;
# a bool min/max aggregate that meets NaN in the later left merge becomes an
# object column, which missing_handle() fills with the string "None" and which
# then breaks DataFrame.corr(). uint8 dummies stay numeric all the way through.
DUMMY_DTYPE = 'uint8'

## all_df: NAME_TYPE_SUITE
all_df = pd.get_dummies(all_df, columns=['NAME_TYPE_SUITE'], dtype=DUMMY_DTYPE)

## bureau: CREDIT_ACTIVE
bureau = pd.get_dummies(bureau, columns=['CREDIT_ACTIVE'], dtype=DUMMY_DTYPE)

## previous_application: NAME_CONTRACT_STATUS, NAME_CLIENT_TYPE and NAME_TYPE_SUITE
previous_application = pd.get_dummies(
    previous_application,
    columns=['NAME_CONTRACT_STATUS', 'NAME_CLIENT_TYPE', 'NAME_TYPE_SUITE'],
    dtype=DUMMY_DTYPE,
)

In [None]:
# Inspect every table after column selection and one-hot encoding: shape,
# column names, dtypes/non-null counts and the first five rows.
# Replaces five pasted copies of the same four statements with one loop.
for label, frame in (
    ("train/test data", all_df),
    ("bureau data", bureau),
    ("previous_application data", previous_application),
    ("POS_CASH_balance data", POS_CASH_balance),
    ("credit_card_balance data", credit_card_balance),
):
    print(f"\n======= {label} =======\n")
    # Expressions inside a loop are not auto-echoed, so display() is explicit.
    display(frame.shape, frame.columns)
    frame.info()
    display(frame.head(5))

# Drop the loop variables so they do not keep the last frame alive.
del label, frame

## ID毎にカラムを集計
all_df以外のデータはSK_ID_CURRに紐づくデータが複数存在する可能性があるため、最大値や平均といった値を取り分析に利用する

In [None]:
# Aggregation specs: {column: [functions]} applied per SK_ID_CURR.
#
# One-hot (0/1) columns use 'sum', not 'count'. 'count' returns the number of
# non-null rows, which is the same for every dummy column of a client, so it
# cannot tell "3 active credits" from "3 closed credits". 'sum' adds up the
# ones and therefore gives the number of records in that category.
# NOTE: with the column-naming scheme used after aggregation these features
# are suffixed _SUM (previously _COUNT).

## bureau
aggregations_bureau = {
    "AMT_CREDIT_SUM":['max', 'mean'],
    "AMT_ANNUITY_BUREAU":['max', 'mean'],
    "REPAYMENT_RATE":['min', 'max', 'mean'],
    "CREDIT_DAY_OVERDUE":['max', 'mean'],
    "CNT_CREDIT_PROLONG":['max', 'mean'],
    "CREDIT_ACTIVE_Active":['sum'],
    "CREDIT_ACTIVE_Bad debt":['sum'],
    "CREDIT_ACTIVE_Closed":['sum'],
    "CREDIT_ACTIVE_Sold":['sum'],
}

## previous_application
aggregations_previous = {
    "AMT_CREDIT":['max', 'mean'],
    "RATE_INTEREST_PRIMARY":['min', 'max', 'mean'],
    "RATE_INTEREST_PRIVILEGED":['min', 'max', 'mean'],
    "AMT_ANNUITY_PREVIOUS":['max', 'mean'],
    "NAME_CONTRACT_STATUS_Approved":['sum'],
    "NAME_CONTRACT_STATUS_Refused":['min', 'sum'],
    "NAME_CLIENT_TYPE_New":['min'],
    "NAME_CLIENT_TYPE_Refreshed":['min', 'sum'],
    "NAME_CLIENT_TYPE_Repeater":['sum'],
    "NAME_TYPE_SUITE_Children":['sum', 'max'],
    "NAME_TYPE_SUITE_Family":['sum', 'max'],
    "NAME_TYPE_SUITE_Group of people":['sum', 'max'],
    "NAME_TYPE_SUITE_Other_A":['sum', 'max'],
    "NAME_TYPE_SUITE_Other_B":['sum', 'max'],
    "NAME_TYPE_SUITE_Spouse, partner":['sum', 'max'],
    "NAME_TYPE_SUITE_Unaccompanied":['sum', 'max'],
}

## POS_CASH_balance
aggregations_pos = {
    "SK_DPD":['max', 'mean'],
}

## credit_card_balance
aggregations_credit_card = {
    "AMT_CREDIT_LIMIT_ACTUAL":['min', 'max', 'mean'],
    "CREDIT_REPAYMENT_RATE":['min', 'max', 'mean'],
    "AMT_TOTAL_RECEIVABLE":['max', 'mean'],
}

In [None]:
def _aggregate_per_client(frame, spec, prefix, missing=-9999):
    """Aggregate ``frame`` to one row per SK_ID_CURR.

    Parameters
    ----------
    frame : DataFrame with an ``SK_ID_CURR`` column and the columns in ``spec``.
    spec : dict mapping column name to a list of aggregation function names.
    prefix : string prepended to every aggregated column name.
    missing : sentinel that missing_handle() wrote into numeric gaps. In float
        columns it is turned back into NaN before aggregating, so that min and
        mean are computed over real values only instead of mixing -9999 with
        amounts. Groups with no real value come out as NaN.
        NOTE(review): assumes -9999 is never a genuine value in these columns.

    Returns
    -------
    DataFrame with ``SK_ID_CURR`` as a regular column and flat column names of
    the form ``<prefix><column>_<FUNCTION>``.
    """
    cleaned = frame.copy()
    # Only columns that had gaps are float (NaN forces float on load), so only
    # those can contain the sentinel.
    float_cols = cleaned.select_dtypes(include='float').columns
    if len(float_cols):
        cleaned[float_cols] = cleaned[float_cols].replace(missing, np.nan)

    aggregated = cleaned.groupby('SK_ID_CURR').agg(spec)
    # agg() with a dict of lists yields two-level columns (column, function);
    # flatten them into single names.
    aggregated.columns = pd.Index(
        [f"{prefix}{column}_{func.upper()}" for column, func in aggregated.columns.tolist()]
    )
    # SK_ID_CURR is the index after groupby; turn it back into a column.
    return aggregated.reset_index()


# One aggregated frame per auxiliary table.
bureau_agg = _aggregate_per_client(bureau, aggregations_bureau, 'bureau_')
previous_agg = _aggregate_per_client(previous_application, aggregations_previous, 'previous_')
pos_agg = _aggregate_per_client(POS_CASH_balance, aggregations_pos, 'pos_')
credit_agg = _aggregate_per_client(credit_card_balance, aggregations_credit_card, 'credit_')

In [None]:
# Inspect every aggregated table: shape, column names, dtypes/non-null counts
# and the first five rows.
# Replaces four pasted copies of the same four statements with one loop.
for label, frame in (
    ("bureau data", bureau_agg),
    ("previous_application data", previous_agg),
    ("POS_CASH_balance data", pos_agg),
    ("credit_card_balance data", credit_agg),
):
    print(f"\n======= {label} =======\n")
    # Expressions inside a loop are not auto-echoed, so display() is explicit.
    display(frame.shape, frame.columns)
    frame.info()
    display(frame.head(5))

# Drop the loop variables so they do not keep the last frame alive.
del label, frame

In [None]:
# Summary statistics (transposed: one row per column) of every aggregated table.
# Replaces four pasted copies of the same two statements with one loop.
for label, frame in (
    ("bureau data", bureau_agg),
    ("previous_application data", previous_agg),
    ("POS_CASH_balance data", pos_agg),
    ("credit_card_balance data", credit_agg),
):
    print(f"\n======= {label} =======\n")
    # Expressions inside a loop are not auto-echoed, so display() is explicit.
    display(frame.describe().T)

# Drop the loop variables so they do not keep the last frame alive.
del label, frame

## all_dfと各集計データ（bureau / previous_application / POS_CASH_balance / credit_card_balance）を列方向へ結合
列方向の結合はデータフレーム一つ一つを順にmergeを用いて結合させる必要がある

In [None]:
# Attach the per-client aggregates to the combined train/test frame.
## Row count before merging
rows_before_merge = len(all_df['SK_ID_CURR'])
print(rows_before_merge)

# Left-join each aggregate on SK_ID_CURR, one merge per table.
# validate='many_to_one' makes pandas raise if an aggregate has a repeated
# SK_ID_CURR, which would silently duplicate rows of all_df.
all_df = (
    all_df
    .merge(bureau_agg, how='left', on='SK_ID_CURR', validate='many_to_one')
    .merge(previous_agg, how='left', on='SK_ID_CURR', validate='many_to_one')
    .merge(pos_agg, how='left', on='SK_ID_CURR', validate='many_to_one')
    .merge(credit_agg, how='left', on='SK_ID_CURR', validate='many_to_one')
)

## Row count after merging. A left join on a unique key must not change it.
print(len(all_df['SK_ID_CURR']))
assert len(all_df) == rows_before_merge, "left merges changed the number of rows"

In [None]:
# Missing values again: the left merges leave NaN for clients that have no
# rows in an auxiliary table.
## Missing count per column
print("\n======= 欠損状況確認 =======\n")
all_df.isna().sum()

## Fill in place: "None" for object columns, -9999 for the rest
## (missing_handle leaves TARGET untouched)
missing_handle(all_df)

## Missing count per column after filling
print("\n======= 欠損処理後の状況を確認 =======\n")
all_df.isna().sum()

In [None]:
# Inspect the merged frame: shape, column names, dtypes/non-null counts and
# the first five rows. Each bare expression below is echoed because the
# notebook sets ast_node_interactivity = "all".
print("\n======= all data =======\n")
all_df.shape
all_df.columns
all_df.info()
all_df.head(5)

## Summary statistics (transposed: one row per column)
print("\n======= all describe =======\n")
all_df.describe().T

## 強相関の除去

In [None]:
# Pearson correlation between all columns of the merged frame, drawn as an
# annotated heatmap.
corr_mat = all_df.corr(method='pearson')
axis_labels = corr_mat.columns.values

# Figure size is set through seaborn's global settings.
sns.set(rc={'figure.figsize': (18, 11)})

heatmap_options = dict(
    vmin=-1.0,
    vmax=1.0,
    center=0,
    annot=True,  # write the coefficient inside every cell
    fmt='.1f',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
sns.heatmap(corr_mat, **heatmap_options)
plt.show()

In [None]:
# Drop one column of every strongly correlated pair.
# A column is dropped when its absolute correlation with ANY earlier column
# exceeds THRESHOLD. The earlier column of the pair is the one compared against.
THRESHOLD = 0.8  # absolute correlation above which a column is redundant

# The join key and the label must never be compared or dropped.
PROTECTED_COLUMNS = ('SK_ID_CURR', 'TARGET')

dtype_list = all_df.dtypes
numerical_variables = [
    col
    for col in all_df.select_dtypes(include=['number', 'bool']).columns
    if col not in PROTECTED_COLUMNS
]
corrmat = all_df[numerical_variables].corr()

drop_tar = []
for i, col in enumerate(corrmat.columns):
    is_redundant = False
    # range(i) covers every earlier column 0..i-1, i.e. the whole strict lower
    # triangle. (range(i-1) skipped the directly preceding column, so adjacent
    # features such as X_MAX / X_MEAN were never compared.)
    for j in range(i):
        coef = corrmat.iloc[i, j]
        if abs(coef) > THRESHOLD:
            print(f"drop_tar >> col:{col}, index:{corrmat.index[j]}, corr_coef:{coef}")
            is_redundant = True
    if is_redundant:
        drop_tar.append(col)

print(f"""
----------------------------------------
除外対象 :
{drop_tar}
----------------------------------------
len(drop_tar) : {len(drop_tar)}
len(np.unique(drop_tar)) : {len(np.unique(drop_tar))}
""")


# Rebind instead of dropping in place, so frames displayed earlier are not
# mutated behind the reader's back.
all_df = all_df.drop(columns=drop_tar)

print(f"""
除外結果カラム一覧 :
{all_df.columns}
""")