## ライブラリの読み込み

In [None]:
#ライブラリの読み込み
import numpy as np
import pandas as pd
import re
import pickle
import gc
import seaborn as sns
import matplotlib.pyplot as plt
#scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

#LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

## ファイルの読み込み

In [None]:
# Load the training table from the home-credit-default-risk input directory,
# print its (rows, columns) shape and preview the first rows.
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
print(application_train.shape)
application_train.head()

かなりサイズが大きいので、処理の途中でメモリ不足になることがある。
それを回避する方法として、各カラムのデータに応じて、データ型を最適化することで、メモリ使用量を削減することが有効！

### メモリ削減のため各カラムに対してデータ型を最適化

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    For every NumPy integer / unsigned-integer / float column the column's
    min and max are compared (inclusively) against the limits of each narrower
    dtype of the same kind, and the column is cast to the first one that can
    hold the whole value range. Columns of any other dtype (object/str, bool,
    category, datetime, pandas extension dtypes, ...) are left untouched, and
    a column is never widened.

    Args:
        df: DataFrame to optimize. It is modified in place.
        use_float16: When True (default, the historical behaviour) float
            columns whose range fits are stored as float16. float16 keeps only
            about 3 significant decimal digits, so pass False to stop at
            float32 when precision matters.

    Returns:
        The same ``df`` object, after downcasting.

    Side effects:
        Prints memory usage before/after and the relative reduction.
    """
    start_mem = df.memory_usage().sum()/1024**2
    print("Memory usuage of dataframe is: {:.2f}MB".format(start_mem))

    # Candidate target dtypes, narrowest first. The widest dtype of each kind
    # is omitted because casting to it could never save memory.
    int_candidates = (np.int8, np.int16, np.int32)
    uint_candidates = (np.uint8, np.uint16, np.uint32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy dtypes are handled; extension dtypes (nullable
        # ints, category, string, ...) are skipped instead of crashing.
        if not isinstance(col_type, np.dtype):
            continue

        # Dispatch on the dtype kind rather than on the dtype's string name,
        # so that bool ("b") and unsigned ("u") columns are not mistaken for
        # floats.
        if col_type.kind == "i":
            candidates, limits_of = int_candidates, np.iinfo
        elif col_type.kind == "u":
            candidates, limits_of = uint_candidates, np.iinfo
        elif col_type.kind == "f":
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for empty or all-NaN columns: nothing to decide.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        for candidate in candidates:
            # Never widen a column (e.g. float16 -> float32).
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum()/1024**2
    print("Memory usage after optimization is: {:.2f}MB".format(end_mem))
    # Guard the percentage against a zero-byte frame.
    decreased = 100 * (start_mem - end_mem)/start_mem if start_mem > 0 else 0.0
    print("Decreased by {:.1f}%".format(decreased))

    return df

In [None]:
# Downcast the numeric columns of the training table to reduce memory usage.
application_train = reduce_mem_usage(application_train)

In [None]:
# Inspect the dtype of TARGET after the memory optimization.
application_train["TARGET"].dtype

もとはint64だったのが、int8に変更されている。

## データセット作成

In [None]:
# Features: drop TARGET (the label, kept separately as y_train below) and the
# identifier SK_ID_CURR, plus five categorical columns that the author removes
# because they have many missing values.
X_train = application_train.drop(columns = ["TARGET", "SK_ID_CURR", "OCCUPATION_TYPE", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE"])

# y_train is a Series.
y_train = application_train["TARGET"]

# Kept as a one-column DataFrame so it can later be joined to the table of predictions.
id_train = application_train[["SK_ID_CURR"]]


### 欠損値補間

In [None]:
# Names of the categorical feature columns (used for imputation checks,
# label encoding and as LightGBM's categorical_feature).
cat_cols = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", 
                        "FLAG_OWN_REALTY", "NAME_TYPE_SUITE", "NAME_INCOME_TYPE",
                        "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
                         "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE"
           ]


In [None]:
# Count missing values in the remaining categorical columns.
X_train[cat_cols].isnull().sum()

In [None]:
# Plot the category counts of NAME_TYPE_SUITE, the column reported to have
# 1292 missing values, to choose an imputation value.
plt.figure(figsize=(16,8))
sns.countplot(x = "NAME_TYPE_SUITE", data = X_train[["NAME_TYPE_SUITE"]])

Unaccompaniedが一番多いので、欠損値はすべてこれで補う。

In [None]:
# Fill missing values with the mode (most frequent category) of the column.
X_train["NAME_TYPE_SUITE"] = X_train["NAME_TYPE_SUITE"].fillna(X_train["NAME_TYPE_SUITE"].mode()[0])

In [None]:
# Re-count missing values in the categorical columns after imputation.
X_train[cat_cols].isnull().sum()

In [None]:
# Display the feature table.
X_train

### ラベルエンコーディング

In [None]:
# Label-encode every categorical column in place (category value -> integer code).
# NOTE: the single `le` instance is refit on each column, so after the loop it
# only holds the classes of the last column in cat_cols.
le = LabelEncoder()

for col in cat_cols:
    X_train[col] = le.fit_transform(X_train[col])

# Preview the encoded columns.
X_train[cat_cols].head()

In [None]:
# Print a summary of X_train to confirm the categorical columns are now numeric.
X_train.info()

すべてのカテゴリ変数が数値になった！

## バリデーション設計

### 目的変数の1と0の割合を確認。

In [None]:
# Mean of the target followed by the count of each target value.
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

## モデル学習から重要度算出、評価までを関数化

In [None]:
def train_lgb(input_X,
              input_y,
              input_id,
              params,
              list_nfold = None,
              n_splits = 5
             ):
    """Train LightGBM classifiers with stratified K-fold CV and evaluate them by AUC.

    For each requested fold a ``lgb.LGBMClassifier(**params)`` is fitted with
    early stopping (100 rounds) on the validation part, pickled to
    ``model_lgb_fold{nfold}.pickle`` in the current directory, and scored with
    ROC AUC on both the training and the validation part.

    Args:
        input_X: Feature DataFrame.
        input_y: Target Series, row-aligned with ``input_X``.
        input_id: DataFrame of identifier column(s), row-aligned with ``input_X``.
        params: Keyword arguments forwarded to ``lgb.LGBMClassifier``.
        list_nfold: Fold numbers (0-based, < ``n_splits``) to train.
            ``None`` (default) means ``[0, 1, 2, 3, 4]``.
        n_splits: Number of folds of the StratifiedKFold split
            (shuffled, ``random_state=123``).

    Returns:
        Tuple ``(train_oof, imp, metrics)``:
          * train_oof: ``input_id`` plus a ``pred`` column of out-of-fold
            probabilities.
          * imp: DataFrame with columns ``col``, ``imp`` (mean) and ``imp_std``
            of the feature importances over the trained folds.
          * metrics: ndarray with one row ``[nfold, train AUC, valid AUC]`` per
            trained fold.

    Raises:
        ValueError: If ``list_nfold`` is empty.

    Notes:
        * Reads the module-level ``cat_cols`` as LightGBM's categorical features.
        * Rows belonging to folds that are not in ``list_nfold`` keep a
          prediction of 0 and are still included in the printed OOF AUC.
    """
    # Avoid a mutable default argument; None stands for the historical default.
    if list_nfold is None:
        list_nfold = [0, 1, 2, 3, 4]
    list_nfold = list(list_nfold)
    if not list_nfold:
        raise ValueError("list_nfold must contain at least one fold number")

    train_oof = np.zeros(len(input_X))
    metrics = []
    imp_per_fold = []

    # Cross-validation split; materialized so folds can be picked by number.
    cv = list(StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=123).split(input_X, input_y))

    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)

        # Build the fold's datasets. split() yields POSITIONAL indices, so use
        # iloc: loc would only be correct for a default RangeIndex.
        idx_tr, idx_va = cv[nfold]
        X_tr, y_tr = input_X.iloc[idx_tr, :], input_y.iloc[idx_tr]
        X_va, y_va = input_X.iloc[idx_va, :], input_y.iloc[idx_va]

        print(X_tr.shape, y_tr.shape)

        # Training
        model = lgb.LGBMClassifier(**params)
        model.fit(X_tr,
                  y_tr,
                  eval_set = [(X_tr, y_tr), (X_va, y_va)],
                  callbacks = [lgb.early_stopping(stopping_rounds=100,
                                                   verbose=True),
                                lgb.log_evaluation(0)],
                  categorical_feature = cat_cols)

        # Persist the fold model so it can be reloaded for inference.
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # Evaluation
        y_tr_pred = model.predict_proba(X_tr)[:, 1]
        y_va_pred = model.predict_proba(X_va)[:, 1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr: {:.4f}, va: {:.4f}".format(metric_tr, metric_va))

        # Out-of-fold predictions (positional assignment into the ndarray).
        train_oof[idx_va] = y_va_pred

        # Per-fold feature importances; concatenated once after the loop.
        imp_per_fold.append(pd.DataFrame({"col": input_X.columns, "imp": model.feature_importances_, "nfold": nfold}))

    print("-"*20, "result", "-"*20)

    # Metrics summary
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(metrics[:,1].mean(), metrics[:,1].std(), metrics[:,2].mean(), metrics[:,2].std()))
    print("[oof]{:.4f}".format(roc_auc_score(input_y, train_oof)))

    # Attach the ids to the OOF predictions. The prediction frame reuses
    # input_id's index so the column-wise concat cannot misalign rows.
    train_oof = pd.concat([input_id,
                           pd.DataFrame({"pred": train_oof}, index=input_id.index)],
                          axis=1)

    # Aggregate feature importance over folds.
    imp = pd.concat(imp_per_fold)
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop = False)
    imp.columns = ["col", "imp", "imp_std"]

    return train_oof, imp, metrics

In [None]:
# LightGBM hyper-parameters. n_estimators is deliberately very large: the
# actual number of trees is decided by early stopping inside train_lgb.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "n_estimators": 100000,
    "random_state": 123,
    "importance_type": "gain"
}

n_splits = 5

# Pass n_splits explicitly so the variable above actually controls the split
# (previously it was defined but never used).
train_oof, imp, metrics = train_lgb(X_train, y_train, id_train, params, n_splits=n_splits)

## 重要度の確認

In [None]:
# Show the 10 features with the highest value in the "imp" column.
imp.sort_values("imp", ascending=False).head(10)

## モデル推論

### 推論用データセットの作成

In [None]:
# Load the test table and downcast its numeric columns.
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
application_test = reduce_mem_usage(application_test)

# Drop the same columns as for X_train, except TARGET which is not dropped here.
X_test = application_test.drop(columns = ["SK_ID_CURR","OCCUPATION_TYPE", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE"])
# Kept as a one-column DataFrame so it can be joined to the predictions.
id_test = application_test[["SK_ID_CURR"]]

### 欠損値補間とラベルエンコーディング

In [None]:
# Impute missing NAME_TYPE_SUITE with the mode of the TRAINING data, so train
# and test are filled with the same value (preprocessing statistics must not be
# re-estimated on the test set). application_train still holds the raw
# categories; X_train has already been label-encoded at this point.
X_test["NAME_TYPE_SUITE"] = X_test["NAME_TYPE_SUITE"].fillna(application_train["NAME_TYPE_SUITE"].mode()[0])

In [None]:
# Count missing values in the categorical columns of the test features.
X_test[cat_cols].isnull().sum()

In [None]:
# Encode the test categories with the mapping learned from the TRAINING data.
# Refitting a LabelEncoder on the test set assigns codes from the test set's own
# sorted categories, so any category present only in train shifts the codes and
# the models would see different integers than they were trained on.
# The raw training categories are rebuilt from application_train with the same
# NAME_TYPE_SUITE imputation that was applied to X_train before encoding.
train_categories = application_train[cat_cols].copy()
train_categories["NAME_TYPE_SUITE"] = train_categories["NAME_TYPE_SUITE"].fillna(
    train_categories["NAME_TYPE_SUITE"].mode()[0]
)

for col in cat_cols:
    # transform() raises ValueError on a category never seen in training,
    # which is preferable to silently producing a wrong code.
    X_test[col] = LabelEncoder().fit(train_categories[col]).transform(X_test[col])

# Preview the encoded columns.
X_test[cat_cols].head()

### 推論関数の定義

In [None]:
def predict_lgb(input_X, input_id, list_nfold = None):
    """Predict with the pickled fold models and average their probabilities.

    For every fold number in ``list_nfold`` the model stored in
    ``model_lgb_fold{nfold}.pickle`` (current directory) is loaded and its
    ``predict_proba(input_X)[:, 1]`` is collected; the per-row mean over the
    folds is returned.

    Args:
        input_X: Feature DataFrame to predict on.
        input_id: DataFrame of identifier column(s), row-aligned with ``input_X``.
        list_nfold: Fold numbers whose models are used.
            ``None`` (default) means ``[0, 1, 2, 3, 4]``.

    Returns:
        DataFrame made of ``input_id`` plus a ``pred`` column holding the
        averaged probability.

    Raises:
        ValueError: If ``list_nfold`` is empty.
    """
    # Avoid a mutable default argument; None stands for the historical default.
    if list_nfold is None:
        list_nfold = [0, 1, 2, 3, 4]
    list_nfold = list(list_nfold)
    if not list_nfold:
        raise ValueError("list_nfold must contain at least one fold number")

    pred = np.zeros((len(input_X), len(list_nfold)))
    # Fill columns by POSITION in list_nfold, not by fold number, so subsets
    # such as [2, 3] do not index past the end of the array.
    for i, nfold in enumerate(list_nfold):
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were written by a trusted training run.
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, i] = model.predict_proba(input_X)[:, 1]

    # Reuse input_id's index so the column-wise concat cannot misalign rows.
    pred = pd.concat([input_id,
                      pd.DataFrame({"pred": pred.mean(axis=1)}, index=input_id.index)],
                     axis = 1
                    )

    print("Done.")

    return pred

In [None]:
# Predict on the test features with the models of folds 0-4.
test_pred = predict_lgb(X_test, id_test, list_nfold=[0,1,2,3,4])

In [None]:
# Display the prediction table.
test_pred

## 提出ファイルの作成

In [None]:
# Rename the prediction column to TARGET for the submission table and preview it.
df_submit = test_pred.rename(columns = {"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head(10))

# Write the submission CSV; index = None is falsy, so the row index is not written.
df_submit.to_csv("submission_baseline.csv", index = None)