In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
# Silence library warnings so the notebook output stays readable.
# NOTE: this also hides deprecation warnings; disable it when debugging.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: on pandas >= 1.4 the bare pattern
# 'max_columns' also matches 'styler.render.max_columns' and raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 300)
from tqdm import tqdm
import time
import gc

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D, LSTM
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten

import tensorflow.keras.callbacks as callbacks
import tensorflow as tf
from keras import backend as K

In [None]:
# Root directory of the competition data. Paths to the parquet partitions are
# built from it by plain string concatenation, so the trailing slash is required.
data_dir = '../input/optiver-realized-volatility-prediction/'

## targetデータ加工

In [None]:
# Function to read our base train and test set
def read_train_test(input_dir=None):
    """Load the base train/test tables and add the ``row_id`` merge key.

    Parameters
    ----------
    input_dir : str or path-like, optional
        Directory that contains ``train.csv`` and ``test.csv``. When omitted,
        the notebook-level ``data_dir`` is used, so existing calls without
        arguments keep reading from the same location as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; each frame gets an extra ``row_id`` column of the
        form ``"<stock_id>-<time_id>"``.
    """
    if input_dir is None:
        # Reuse the shared data directory instead of repeating the literal path.
        input_dir = data_dir
    train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
    # Create a key to merge with book and trade data
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Read train and test; both frames come back with the row_id key already added
train, test = read_train_test()

In [None]:
# Candidate order-book price columns.
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# intended as `col` values for book_preprocessor -- confirm or remove.
pricelist = ['bid_price1','ask_price1','bid_price2','ask_price2','wap1']

# Distinct stock ids of each split; train_stock_ids is what gets passed to
# preprocessor() to select the per-stock parquet partitions.
train_stock_ids = train['stock_id'].unique()
test_stock_ids = test['stock_id'].unique()

In [None]:
# Not used this time
def book_preprocessor(file_path, col, stock_id):
    """Reshape one stock's book parquet into one row per time_id with 600 per-second columns.

    Parameters
    ----------
    file_path : path of the parquet data to read.
    col : name of the column to spread over time (``"wap1"`` is computed first).
    stock_id : identifier used as the prefix of the generated ``row_id``.

    Returns
    -------
    pandas.DataFrame with columns ``row_id`` and ``{col}_t0`` .. ``{col}_t599``.
    """
    df = pd.read_parquet(file_path)
    if col == "wap1":
        # NOTE(review): calc_wap1 is not defined in the visible part of this
        # notebook; calling this function with col="wap1" needs it in scope.
        df['wap1'] = calc_wap1(df)
    # Empty frame whose index lists every second 0..599; used below to force a full time grid.
    seconds = pd.DataFrame(index=range(0,600)).rename_axis('seconds_in_bucket')
    # Pivot to time_id x second, then transpose so that seconds become the rows.
    tmp = pd.pivot_table(df, index=["time_id"], columns=["seconds_in_bucket"], values=[col]).T#.reset_index()
    # Drop the outer index level added by `values=[col]`, keeping seconds_in_bucket as the index.
    tmp = tmp.set_axis(tmp.columns.tolist(),axis="columns").reset_index().drop(["level_0"],axis=1).set_index('seconds_in_bucket')
    # Align onto the full grid, fill gaps forward then backward in time,
    # and transpose back so each row is one time_id.
    tmp = pd.concat([seconds, tmp], axis=1).fillna(method='ffill').fillna(method='bfill').reset_index(drop=True).T.reset_index()
    tmp.columns = ["row_id"] + [f'{col}_t{i}' for i in range(600)]
    # The first column holds the time_id at this point; turn it into "<stock_id>-<time_id>".
    tmp["row_id"] = tmp["row_id"].apply(lambda x: f'{stock_id}-{x}')
    return tmp

def trade_1dcnn_preprocessor(file_path, stock_id, bins):
    """Build the last traded price per time bucket for one stock, in wide form.

    Each trade is assigned to the bucket ``[k*bins, (k+1)*bins)`` that contains
    its ``seconds_in_bucket``; the last price seen in every (time_id, bucket)
    pair is kept and the buckets are spread into columns.

    Unlike a positional relabelling of the pivoted columns, the output always
    contains one column per bucket start in ``range(0, 600, bins)``: buckets in
    which no trade happened for any ``time_id`` are kept as all-NaN columns
    instead of being dropped (which made the column relabelling fail on sparse
    data). ``bins`` also no longer has to divide 600 evenly.

    Parameters
    ----------
    file_path : path of the trade parquet data to read.
    stock_id : value written to the ``stock_id`` column of the result.
    bins : bucket width in seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_id``, ``"<start>sec"`` for every bucket start, and
        ``stock_id``; one row per ``time_id`` present in the file.
    """
    df = pd.read_parquet(file_path)
    bucket_starts = np.arange(0, 600, bins)
    # Start second of the bucket each trade falls in (e.g. 45 -> 30 for bins=30).
    df['sec_cut'] = (df['seconds_in_bucket'] // bins) * bins
    last_price = df.groupby(['time_id', 'sec_cut'])['price'].last()
    # reindex guarantees the full, ordered bucket grid and discards anything
    # outside 0..599 seconds.
    wide = last_price.unstack('sec_cut').reindex(columns=bucket_starts)
    wide.columns = [f'{start}sec' for start in bucket_starts]
    wide = wide.reset_index()
    wide['stock_id'] = stock_id
    return wide

In [None]:
def preprocessor(list_stock_ids, is_train=True, bins=60):
    """Build the bucketed trade-price features for a list of stocks.

    Every stock is processed in parallel with ``trade_1dcnn_preprocessor`` and
    the per-stock tables are stacked. Missing buckets are then filled row by
    row: first with the previous bucket's price, and any leading gap with 1.

    Parameters
    ----------
    list_stock_ids : iterable of stock ids to process.
    is_train : read from ``trade_train.parquet`` when True, otherwise from
        ``trade_test.parquet``.
    bins : bucket width in seconds, forwarded to ``trade_1dcnn_preprocessor``.

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``time_id`` followed by the filled bucket columns.
    """
    split = 'train' if is_train else 'test'

    # Worker run by joblib, one call per stock
    def for_joblib(stock_id, bins):
        file_path_trade = data_dir + "trade_" + split + ".parquet/stock_id=" + str(stock_id)
        return trade_1dcnn_preprocessor(file_path_trade, stock_id, bins)

    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id,bins) for stock_id in list_stock_ids)
    df = pd.concat(df, ignore_index = True)

    # fillna: forward-fill along each row (i.e. through time), then use 1 for
    # buckets that precede the first observed price of the row.
    keys = ['stock_id','time_id']
    values = df.drop(columns=keys).ffill(axis=1).fillna(1)
    df = pd.concat([df[keys], values], axis=1)

    return df

In [None]:
# Bucket width in seconds
bins=30
# Pass the variable instead of repeating the literal: the sequence length used
# to reshape X is derived from `bins`, so both must always agree.
train_ = preprocessor(train_stock_ids, is_train = True, bins=bins)

In [None]:
# Preview the bucketed price features returned by preprocessor()
train_.head()

## targetと特徴量をマージ

In [None]:
# Attach the bucketed price features to the target table
train = train.merge(train_, how='left', on=['stock_id','time_id'])
# Inspect the rows that contain at least one missing value
has_missing = train.isna().any(axis=1)
rows_with_missing = train.loc[has_missing]
display(rows_with_missing.shape)
display(rows_with_missing.head())
## Drop the rows with missing values? Or fill them with 1 instead?
train = train.loc[~has_missing]

In [None]:
# Number of time buckets per sample, i.e. the sequence length of the model input
tlen = np.arange(0, 600, bins).size
# Everything except the identifiers and the target is a bucket price feature
non_feature_cols = ['stock_id','time_id','target','row_id']
y = np.array(train['target']).reshape(-1, 1)
X = np.array(train.drop(columns=non_feature_cols)).reshape(-1, tlen, 1)

In [None]:
# Sanity check: X was reshaped to (n_samples, tlen, 1)
X.shape

## 1D CNN

In [None]:
def root_mean_squared_per_error(y_true, y_pred):
    """Root mean squared percentage error, built from Keras backend ops.

    Computes sqrt(mean(((y_true - y_pred) / y_true) ** 2)); there is no guard
    against zeros in ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = K.mean(K.square(relative_error))
    return K.sqrt(mean_squared)

In [None]:
def base_model(tlen):
    """Build the (uncompiled) 1D-CNN regressor.

    Architecture: two Conv1D(kernel 2, relu) + MaxPool1D(2) stages with 16 and
    10 filters, then Flatten, Dense(100, relu) and a single tanh output unit.

    Parameters
    ----------
    tlen : number of time steps per sample; the input shape is ``(tlen, 1)``.

    Returns
    -------
    A Keras ``Model`` mapping the input sequence to one value.
    """
    inputs = Input(shape=(tlen, 1))

    features = inputs
    for n_filters in (16, 10):
        features = Conv1D(n_filters, 2, padding='same', activation='relu')(features)
        features = MaxPool1D(pool_size=2, padding='same')(features)

    flat = Flatten()(features)
    hidden = Dense(100, activation='relu')(flat)
    # NOTE(review): tanh bounds the prediction to (-1, 1); confirm this is
    # intended for the regression target.
    output = Dense(1, activation='tanh')(hidden)

    return Model(inputs, outputs=output)

In [None]:
model = base_model(tlen)
# `lr` is the deprecated spelling of this argument; use `learning_rate`, as the
# compile() call in the training loop already does.
# NOTE(review): this optimizer is not passed to compile() anywhere in the
# visible notebook -- confirm whether it is still needed.
optimizer = Adam(learning_rate=1e-3)

In [None]:
def rmspe(y_true, y_pred):
    """NumPy RMSPE: sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    ``y_pred`` is reshaped to a column so that it lines up with the (n, 1)
    ``y_true`` instead of broadcasting to an (n, n) matrix.
    """
    y_pred = y_pred.reshape(-1,1)
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))


# Validation RMSPE of every fold, in fold order
folds_valid_rmspe = []

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2020)
counter = 1


# Stop a fold once val_loss has not improved by 1e-5 for 6 epochs
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-05, patience=6, verbose=1,
    mode='min')

# Divide the learning rate by 10 after 4 epochs without val_loss improvement
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=4, verbose=1, min_lr=5e-7,
    mode='min')

for dev_index, val_index in kf.split(range(len(X))):

    # For a quick experiment on a single fold, break out here once counter > 1.

    # Fresh, untrained model for every fold
    model = base_model(tlen)

    model.compile(
        Adam(learning_rate=0.0004), #0.0004
        loss='mean_squared_error',
        metrics=[root_mean_squared_per_error],
    )

    model.fit(X[dev_index],
              y[dev_index],
              #sample_weight = 1/np.square(y[dev_index]),
              batch_size=256,
              epochs=2,
              validation_data=(X[val_index], y[val_index]), #1/np.square(y[val_index])
              callbacks=[es, plateau],
              shuffle=True,
              verbose = 1)

    # Flatten the (n, 1) prediction matrix to a 1-D vector
    preds = model.predict(X[val_index]).reshape(1,-1)[0]

    score = round(rmspe(y_true = y[val_index], y_pred = preds),5)
    print('Fold {} 1DCNN: {}'.format(counter, score))
    folds_valid_rmspe.append(score)
    print(len(preds))

    counter += 1
gc.collect()

In [None]:
# RMSPE score of each fold
folds_valid_rmspe

In [None]:
# Layer overview of `model`, which at this point is the one built for the last fold
model.summary()