## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import time

import joblib

from sklearn.preprocessing import StandardScaler    # RobustScaler
from sklearn.model_selection import KFold    # GroupKFold

# Silence every warning for the rest of the notebook
import warnings
warnings.simplefilter(action="ignore")

# Show all DataFrame columns instead of truncating wide frames
pd.options.display.max_columns = None

## Load Data

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Columns whose dtype name starts with ``int`` are cast to the narrowest
      of int8/int16/int32/int64 whose range contains the column's min and max
      (bounds inclusive).
    * Columns whose dtype name starts with ``float`` are cast to float16 or
      float32 when the min and max fit that range, otherwise to float64.
      The choice is made on range only, so float16/float32 may lose precision.
    * Every other dtype (bool, category, datetime, ...) is left untouched,
      which also makes it safe to call the function again on its own output.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        type_name = str(col_type)
        if type_name[:3] == "int":
            candidates, limits_of, fallback = int_candidates, np.iinfo, None
        elif type_name[:5] == "float":
            candidates, limits_of, fallback = float_candidates, np.finfo, np.float64
        else:
            # Not a plain int/float column: skip it without computing min/max,
            # which would raise for e.g. unordered categoricals.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Pick the first (narrowest) candidate whose range contains the data.
        # NaN min/max (empty or all-NaN column) fails every comparison, so
        # such a column ends up on the fallback.
        target = fallback
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def read_data_strict(file_name="/kaggle/input/ump-train-picklefile/train.pkl"):
    """Read a pickled DataFrame, downcast it, and insist that it has no nulls.

    Parameters
    ----------
    file_name : str
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied.

    Raises
    ------
    AssertionError
        If any column contains a null value.
    """
    # NOTE(review): unpickling executes arbitrary code; only point this at trusted files.
    loaded = pd.read_pickle(file_name)
    df = reduce_mem_usage(loaded)

    columns_with_null = df.isnull().any().sum()
    assert columns_with_null == 0, "null exists."
    return df

In [None]:
# Load the training data; downcasting and the null check happen inside read_data_strict.
df_train = read_data_strict()

# Report the size and preview only the first rows rather than rendering the whole frame.
print(df_train.shape)
df_train.head()

In [None]:
# df_train.info()

In [None]:
# df_train.describe()

## Training

In [None]:
# Identifier columns, and every column whose name contains "f_" as the feature set
keys = ["time_id", "investment_id"]
features = [name for name in df_train.columns if "f_" in str(name)]

In [None]:
# Scale the feature columns; the fitted scaler is kept so it can be saved afterwards
scaler = StandardScaler()    # alternative: RobustScaler()
feature_block = df_train[features]
scaler.fit(feature_block)
df_train[features] = scaler.transform(feature_block)

In [None]:
# Persist the fitted scaler to the working directory
joblib.dump(value=scaler, filename="scaler.joblib")

In [None]:
# Model inputs (identifier columns followed by the features) and the target column,
# both kept as pandas objects rather than raw arrays
train_x = df_train.loc[:, keys + features]
train_y = df_train.loc[:, "target"]

In [None]:
# Splitting is to be done in units of time_id, so keep that column and its distinct values
time_id = train_x.loc[:, "time_id"]
unique_time_ids = pd.unique(time_id)

In [None]:
# Number of cross-validation folds; one model per fold is saved and reloaded for inference
n_folds = 5

In [None]:
# Suppress TensorFlow warnings: the C++ log level is put into the environment
# ahead of the import, then the Python-side logger is limited to errors
import os
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "1"})
import tensorflow as tf
tf_logging = tf.compat.v1.logging
tf_logging.set_verbosity(tf_logging.ERROR)

In [None]:
# -----------------------------------
# ニューラルネットの実装
# -----------------------------------
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import log_loss

In [None]:
# -----------------------------------
# Early stopping
# -----------------------------------
# Taken from tensorflow.keras so the callback comes from the same Keras
# namespace as the Dense / Dropout / Sequential imports used to build the model.
from tensorflow.keras.callbacks import EarlyStopping

# Training settings
# Validation data is also passed to the model so that the score can be
# monitored as training progresses
batch_size = 1024
epochs = 50

In [None]:
# Placeholder column for the out-of-fold predictions. It is created as float so
# that writing float model outputs into it later does not depend on an int -> float upcast.
train_x["preds"] = 0.0

In [None]:
# Use KFold to split in units of time_id: the folds are built over the distinct
# time_id values, so all rows of one time_id fall on the same side of a split.
# random_state is not passed because scikit-learn rejects it when shuffle=False.
kf = KFold(n_splits=n_folds, shuffle=False)
fold_time_ids = time_id.unique()

# Out-of-fold predictions, one slot per training row
oof_pred = np.zeros(len(train_x), dtype=np.float32)

for fold, (tr_group_idx, va_group_idx) in enumerate(kf.split(fold_time_ids)):
    # Split by whether each record's time_id belongs to the train or the valid group
    tr_mask = time_id.isin(fold_time_ids[tr_group_idx]).to_numpy()
    va_mask = time_id.isin(fold_time_ids[va_group_idx]).to_numpy()

    # Only the feature columns are fed to the network, for fitting and predicting
    # alike; the id columns and the "preds" placeholder are not model inputs.
    tr_x, tr_y = train_x.loc[tr_mask, features], train_y.loc[tr_mask]
    va_x, va_y = train_x.loc[va_mask, features], train_y.loc[va_mask]

    # Build the neural network
    model = Sequential()
    model.add(Dense(256, activation="relu", input_shape=(len(features),)))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation="relu"))
    model.add(Dropout(0.2))
    # Linear output: the model is trained as a regression with MSE, and a sigmoid
    # would confine predictions to (0, 1).
    # NOTE(review): assumes the target is not restricted to [0, 1] - confirm.
    model.add(Dense(1, activation="linear"))

    # "accuracy" is a classification metric; report mean absolute error instead
    model.compile(loss="mean_squared_error",   # "mean_squared_logarithmic_error",
                  optimizer="adam", metrics=["mae"])

    # Early stopping waits 20 epochs without val_loss improvement;
    # restore_best_weights makes the model keep the weights of its best epoch
    early_stopping = EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True)

    model.fit(tr_x, tr_y,
              batch_size=batch_size, epochs=epochs,
              verbose=1, validation_data=(va_x, va_y), callbacks=[early_stopping])

    # The file name is the one the inference cell loads, so it is kept as is.
    # NOTE(review): Keras documents model.save() as its own persistence format;
    # confirm joblib round-trips the model on the installed TensorFlow version.
    joblib.dump(model, f"catb_{fold}.pkl")

    # Predict the validation rows; predict() returns shape (n, 1), so flatten it
    va_pred = model.predict(va_x, batch_size=batch_size)
    oof_pred[va_mask] = va_pred.ravel()

# Store the out-of-fold predictions alongside the inputs
train_x["preds"] = oof_pred

## Predict & submit

In [None]:
import ubiquant

In [None]:
# Initialize the competition environment, then obtain the iterator that loops
# over the test set together with the sample submission
env = ubiquant.make_env()
iter_test = env.iter_test()

In [None]:
# Reload the artefacts written during training: the scaler and one model per fold
scaler = joblib.load("scaler.joblib")
models = []
for fold in range(n_folds):
    models.append(joblib.load(f"catb_{fold}.pkl"))

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    # Apply the scaler loaded from disk as-is. Refitting it here would standardise
    # every test batch with that batch's own statistics instead of the saved ones.
    test_df[features] = scaler.transform(test_df[features])

    # One prediction vector per fold model; predict() returns shape (n, 1), so
    # flatten each before averaging across folds.
    final_pred = [models[fold].predict(test_df[features]).ravel() for fold in range(n_folds)]
    sample_prediction_df["target"] = np.mean(np.stack(final_pred), axis=0)
    env.predict(sample_prediction_df)