# 概要

Kerasを使うのは初めてで自分用のメモとして公開します


1. カテゴリ変数の削除
cp_type が ctl_vehicle のサンプルは目的変数が常に 0 になることが知られているので、学習には不要 (Reason: notebook)

2. Keras を使った単純な深層学習
・　BatchNormalization とDropOut層
・　アーリーストッピング
・　学習率のスケジュール
・　32バッチサイズ
https://arxiv.org/abs/1804.07612
↑バッチサイズは32くらいがいいよって論文
→ ただし今回のデータではバッチサイズを大きくした方が良い結果が出た(下のコードでは 128 を使用)


3. K-Fold
Add Data からデータセットを加え、MultilabelStratifiedKFold のインポート

4. RankGauss
RankGauss で前処理


### 参考資料
- https://www.kaggle.com/simakov/keras-multilabel-neural-network-v1-2


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import QuantileTransformer
import tensorflow_addons as tfa
from sklearn.metrics import log_loss

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Load the competition tables: training features, scored training targets
# and test features (in that order).
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
    )
)

# Submission template; its target columns are overwritten with predictions
# further down.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# RankGauss
# Rank-based scaling: each numeric column is mapped through its empirical
# quantiles onto a normal distribution, so the ordering of the values is kept
# while the marginal distribution is forced towards a Gaussian.
def rank_gauss(df):
    """Quantile-transform every column of ``df`` to a normal distribution.

    A fresh ``QuantileTransformer`` is fitted on each column independently and
    the column is overwritten with the transformed values, so ``df`` itself is
    modified.

    Args:
        df: DataFrame to transform; every column is processed, so all columns
            are assumed to be numeric.

    Returns:
        The same ``df`` object with all columns replaced.
    """
    for name in df.columns:
        # scikit-learn transformers expect a 2-D (n_samples, 1) array
        column = df[name].values.reshape(-1, 1)
        scaler = QuantileTransformer(
            n_quantiles=100, random_state=0, output_distribution='normal'
        )

        # Fit on the column, transform the very same data and write the
        # flattened result back over the original column.
        df[name] = scaler.fit(column).transform(column).ravel()

    return df


In [None]:
def preprocess(df):
    """Return a model-ready copy of a raw feature table.

    Steps: drop the ``sig_id`` identifier, encode the categorical columns
    ``cp_type`` and ``cp_dose`` as integers, then pass every column through
    ``rank_gauss``.  Because ``rank_gauss`` processes all columns, the encoded
    ``cp_type`` / ``cp_dose`` columns are transformed as well.

    Args:
        df: Raw feature DataFrame containing ``sig_id``, ``cp_type`` and
            ``cp_dose``.  It is not modified.

    Returns:
        A new DataFrame without ``sig_id`` whose columns are all transformed.
    """
    # drop() and assign() both return new frames, so the caller's df is untouched
    encoded = df.drop(columns='sig_id').assign(
        # categorical -> integer codes
        cp_type=lambda frame: frame['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1}),
        cp_dose=lambda frame: frame['cp_dose'].map({'D1': 0, 'D2': 1}),
    )
    return rank_gauss(encoded)

cp_type が ctl_vehicle のサンプルは目的変数が常に 0 になることが知られているので、学習には不要
Reason: [notebook](https://www.kaggle.com/demetrypascal/t-test-pca-rfe-logistic-regression)

In [None]:
# Train only on treated samples (cp_type == 'trt_cp'); the same rows are
# dropped from the targets so features and labels stay aligned.
treated = X_train['cp_type'] == 'trt_cp'
X_train = X_train.loc[treated].reset_index(drop=True)
# The targets additionally lose their sig_id identifier column.
y_train = y_train.loc[treated].reset_index(drop=True).drop(columns='sig_id')

train = preprocess(X_train)
test = preprocess(X_test)

In [None]:
def create_model(num_columns, num_targets=206):
    """Build and compile the feed-forward multilabel classifier.

    The network is a stack of (BatchNormalization -> Dropout ->
    weight-normalised Dense) blocks: three ReLU hidden blocks with 2048, 1024
    and 512 units, followed by a sigmoid output block.

    Args:
        num_columns: Number of input features per sample.
        num_targets: Number of output labels (defaults to 206, the value that
            was previously hard-coded).

    Returns:
        A compiled ``keras.models.Sequential`` model using binary
        cross-entropy and a Lookahead-wrapped Adam optimizer.
    """
    # (dense units, dropout rate applied just before that dense layer)
    hidden_blocks = [(2048, 0.2), (1024, 0.2), (512, 0.5)]

    # Pass the input shape explicitly as a tuple keyword instead of a bare
    # positional int.
    layers = [keras.layers.Input(shape=(num_columns,))]
    for units, drop_rate in hidden_blocks:
        layers += [
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(rate=drop_rate),
            tfa.layers.WeightNormalization(keras.layers.Dense(units, activation='relu')),
        ]

    # == final layer ==
    layers += [
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(rate=0.5),
        tfa.layers.WeightNormalization(keras.layers.Dense(num_targets, activation='sigmoid')),
    ]

    model = keras.models.Sequential(layers)
    model.compile(
        optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period=10),
        loss='binary_crossentropy',
    )

    return model

In [None]:
# Positional indices of the feature columns fed to the network: every column
# of `train`, so the list has len(train.columns) entries.
top_feats = list(range(train.shape[1]))

In [None]:
# Learning-rate schedule: takes the current epoch and returns the learning
# rate to use.  The rate starts at 0.01 and decays exponentially, shrinking by
# a factor of 10 every 20 epochs.
# (keras.optimizers.schedules is an alternative way to do this.)

def exponential_delay_fn(epoch):
    """Return the learning rate ``0.01 * 0.1 ** (epoch / 20)`` for ``epoch``."""
    initial_lr = 0.01
    decay_factor = 0.1
    decay_epochs = 20
    return initial_lr * decay_factor ** (epoch / decay_epochs)

In [None]:
N_STARTS = 7
K_FOLD = 7
BATCH_NUM = 128
EPOCH_NUM = 35
tf.random.set_seed(1)

# Accumulators: `ss` collects test predictions, `res` collects out-of-fold
# (validation) predictions.  `res` is created as a float frame so that adding
# float predictions never has to change an integer column's dtype.
ss.loc[:, y_train.columns] = 0
res = pd.DataFrame(0.0, index=y_train.index, columns=y_train.columns)

historys = dict()

# Train N_STARTS * K_FOLD models in total.
# MultilabelStratifiedKFold shuffles the rows and builds folds that are
# stratified over the multilabel targets.
for seed in range(N_STARTS):
    splitter = MultilabelStratifiedKFold(n_splits=K_FOLD, random_state=seed, shuffle=True)
    # Stratify on the label matrix (y_train), not on the features.
    for n, (tr, te) in enumerate(splitter.split(train, y_train)):
        print(f"--{train.values[tr].shape}--{train.values[te].shape}--")
        print(f"Seed: {seed}, Fold: {n}")

        # Build a fresh model for this (seed, fold) pair
        model = create_model(len(top_feats))

        # === Callbacks ===
        checkpoint_path = f'repeated:{seed}_Fold:{n}.hdf5'

        # LearningRateScheduler calls the schedule function at the start of
        # each epoch; the callback object is passed to fit() via `callbacks`.
        lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_delay_fn)

        # Keep only the best model seen so far at checkpoint_path
        checkpoint_cb = keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)
        # Early stopping on val_loss; patience=0 stops as soon as val_loss
        # fails to improve.  mode='auto' infers the direction from the name
        # of the monitored quantity.
        early_stopping_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')

        # ===========================

        history = model.fit(train.values[tr][:, top_feats],
                  y_train.values[tr],
                  validation_data=(train.values[te][:, top_feats], y_train.values[te]),
                  epochs=EPOCH_NUM,
                  batch_size=BATCH_NUM,
                  callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler],
                  verbose=2  # one log line per epoch
                 )

        # Key by seed AND fold so that folds of the same seed do not overwrite
        # each other's history.
        historys[f'history_{seed+1}_{n+1}'] = history

        # Restore the best checkpoint before predicting
        model.load_weights(checkpoint_path)
        test_predict = model.predict(test.values[:, top_feats])

        # Also keep the predictions on the validation fold for self-evaluation
        val_predict = model.predict(train.values[te][:, top_feats])

        ss.loc[:, y_train.columns] += test_predict
        res.loc[te, y_train.columns] += val_predict
        print('')

# The predictions were summed, so divide by the number of contributing models:
# every test row was predicted by all K_FOLD * N_STARTS models ...
ss.loc[:, y_train.columns] /= (K_FOLD * N_STARTS)
# ... while every training row is in the validation fold once per seed.
res.loc[:, y_train.columns] /= N_STARTS
        


In [None]:
# Show Model loss in plots
# Visualise the training curves
# https://keras.io/ja/visualization/
import matplotlib.pyplot as plt


def _mean_curve(curves):
    """Average per-epoch curves that may have different lengths.

    Early stopping can end runs after different numbers of epochs, so shorter
    curves are padded with NaN and each epoch is averaged over the runs that
    actually reached it.  Returns an empty array when ``curves`` is empty.
    """
    if not curves:
        return np.array([])
    longest = max(len(curve) for curve in curves)
    padded = np.full((len(curves), longest), np.nan)
    for row, curve in zip(padded, curves):
        row[:len(curve)] = curve
    return np.nanmean(padded, axis=0)


# Gather the curves of every stored run.  The lists are built once for all
# runs so that every history contributes to the average, not just the last.
loss = [v.history['loss'][:40] for v in historys.values()]
val_loss = [v.history['val_loss'][:40] for v in historys.values()]

plt.figure(figsize=(15, 6))
plt.plot(_mean_curve(loss))
plt.plot(_mean_curve(val_loss))
plt.yscale('log')
plt.yticks(ticks=[1, 1e-1, 1e-2])
plt.xlabel('Epochs')
plt.ylabel('Average Logloss')
plt.legend(['Training', 'Validation'])
plt.show()

In [None]:
def metric(y_true, y_pred):
    """Return the mean of the per-target log losses.

    Args:
        y_true: DataFrame of binary ground-truth labels, one column per target.
        y_pred: DataFrame of predicted probabilities with the same columns.

    Returns:
        The average over all columns of ``y_true`` of the column's log loss.
    """
    column_losses = [
        log_loss(
            y_true.loc[:, target],
            y_pred.loc[:, target].astype(float),
            # Name both classes explicitly so a column that happens to contain
            # a single class does not make log_loss fail.
            labels=[0, 1],
        )
        # Iterate the columns of the argument rather than a module-level
        # frame, so the function depends only on its inputs.
        for target in y_true.columns
    ]
    return np.mean(column_losses)

In [None]:
# OOF (out-of-fold) data are the rows a k-fold model was NOT trained on.
# Report the mean log loss of the out-of-fold predictions.
oof_score = metric(y_train, res)
print(f'OOF Metric :{oof_score}')

In [None]:
# Control samples (cp_type == 'ctl_vehicle') were excluded from training, so
# their predictions are forced to 0.
# The mask is taken from the raw X_test: preprocess() passes cp_type through
# rank_gauss, so the processed `test` frame no longer holds the 0/1 codes and
# a comparison like test['cp_type'] == 1 would not select the control rows.
ss.loc[X_test['cp_type'] == 'ctl_vehicle', y_train.columns] = 0
ss.to_csv('submission1.csv', index=False)

In [None]:
# Drop the stored Keras training histories; none of the cells shown below
# use them.
del historys

# XGBOOST


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
import warnings
# Silence every warning from here on.
# NOTE(review): this blanket filter also hides deprecation and other library
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Reload the raw tables so the XGBoost section starts from fresh data.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
    )
)

# Train only on treated samples (cp_type == 'trt_cp'); drop the same rows
# from the targets, which also lose their sig_id identifier column.
treated = X_train['cp_type'] == 'trt_cp'
X_train = X_train.loc[treated].reset_index(drop=True)
y_train = y_train.loc[treated].reset_index(drop=True).drop(columns='sig_id')

train = preprocess(X_train)
test = preprocess(X_test)

# Plain NumPy arrays for the scikit-learn pipeline (sig_id was already removed
# by preprocess / the drop above).  Note that X_test is rebound to an array.
X, X_test, y = train.to_numpy(), test.to_numpy(), y_train.to_numpy()

In [None]:

# One XGBoost classifier per target column, wrapped for multi-output use.
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))

# Count-encode positional columns 0 and 2 before classification.
# NOTE(review): presumably these are cp_type and cp_dose once sig_id has been
# removed -- confirm against the column order of the feature table.
clf = Pipeline([
    ('encode', CountEncoder(cols=[0, 2])),
    ('classify', classifier),
])

# Hyper-parameters applied to every per-target XGBClassifier.
params = {
    'classify__estimator__colsample_bytree': 0.6522,
    'classify__estimator__gamma': 3.6975,
    'classify__estimator__learning_rate': 0.0503,
    'classify__estimator__max_delta_step': 2.0706,
    'classify__estimator__max_depth': 10,
    'classify__estimator__min_child_weight': 31.5800,
    'classify__estimator__n_estimators': 166,
    'classify__estimator__subsample': 0.8639,
}

_ = clf.set_params(**params)

# The out-of-fold evaluation that used to sit here has been removed: it read
# `oof_preds` before the cross-validation loop creates it (NameError when the
# notebook runs top to bottom).  The same evaluation is performed after the
# cross-validation loop.

In [None]:
N = 7
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((test.shape[0], y.shape[1]))
oof_losses = []
kf = KFold(n_splits=N)


def _positive_class_proba(model, features):
    """Return the positive-class probabilities, one column per target."""
    per_target = model.predict_proba(features)  # list of preds per class
    return np.array(per_target)[:, :, 1].T  # take the positive class


# NOTE: X_train / y_train are rebound to the per-fold NumPy slices here, so
# after this cell they no longer refer to the DataFrames loaded earlier.
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    X_train, y_train = X[trn_idx], y[trn_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    clf.fit(X_train, y_train)

    # Out-of-fold predictions for the held-out rows and their log loss
    val_preds = _positive_class_proba(clf, X_val)
    oof_preds[val_idx] = val_preds
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)

    # Each fold contributes 1/N of the final test prediction
    preds = _positive_class_proba(clf, X_test)
    test_preds += preds / N

print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

In [None]:
# set control train preds to 0
# NOTE(review): `train` is built only from rows with cp_type == 'trt_cp' and
# preprocess() replaces cp_type with numeric values, so this comparison with
# the string 'ctl_vehicle' presumably never matches -- confirm whether the
# masking is still needed.
control_mask = train['cp_type'] == 'ctl_vehicle'
oof_preds[control_mask] = 0

# Log loss over all (sample, target) pairs of the out-of-fold predictions
overall_oof_loss = log_loss(y.ravel(), oof_preds.ravel())
print('OOF log loss: ', overall_oof_loss)

In [None]:
# Fresh copy of the submission template, used for the XGBoost predictions.
sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# create the submission file: every column after the first one receives the
# fold-averaged XGBoost test predictions
target_columns = sub.columns[1:]
sub.loc[:, target_columns] = test_preds
sub.to_csv('submission2.csv', index=False)

# Ensemble

In [None]:
# Blend the two models' TEST predictions: the neural-network average stored in
# `ss` and the XGBoost average in `test_preds`.  (Averaging `oof_preds` and
# `res` would blend predictions for the TRAINING rows, which is not a valid
# submission.)  Starting from a copy of `ss` keeps the first (id) column.
# Assumes the columns after the first are in the same target order as
# `test_preds` -- the XGBoost submission relies on the same layout.
# NOTE(review): `test_preds` is not zeroed for control samples, so those rows
# are not forced to 0 in the blend.
stack = ss.copy()
stack.iloc[:, 1:] = (ss.iloc[:, 1:].to_numpy() + test_preds) / 2

In [None]:
# Write `stack` to the final submission file, without the index column.
stack.to_csv('submission.csv', index=False)

In [None]:
# stacked_test_pred = np.columns_stack(test_preds, ss)
# meta_model_pred = meta_model.predict(stacked_test_pred)

In [None]:
# meta_model_pred.to_csv('submission_stacked,csv', index=False)

In [None]:
# assert(len(meta_model_pred) == len(ss))