In [None]:
import io
import json
import requests
import functools
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torchvision import datasets, models, transforms
from queue import Queue

In [None]:
# --- Training schedule and model shape -------------------------------------
EPOCHS        = 5
DROPOUT       = 0.2
DIRECTIONS    = 1
NUM_LAYERS    = 2
BATCH_SIZE    = 5
OUTPUT_SIZE   = 1
SEQ_LENGTH    = 30

HIDDEN_SIZE   = 100
LEARNING_RATE = 0.001
SHIFT_K       = 5  # number of lagged-target columns: feature_1 ~ feature_k
# (num_layers * directions, batch, hidden) -- the shape of one LSTM state tensor.
STATE_DIM     = (NUM_LAYERS * DIRECTIONS, BATCH_SIZE, HIDDEN_SIZE)
TARGET        = "Target"
# Raw market columns first, then the lagged-target columns feature_1 ~ feature_k.
FEATURES      = ['Close', 'High', 'Low', 'Open', 'VWAP', 'Volume'] + [
    f"feature_{lag}" for lag in range(1, SHIFT_K + 1)
]
NUM_FEATURES  = len(FEATURES)

In [None]:
def upper_shadow(df):
    """Return High minus the larger of Close and Open (the candle's upper wick)."""
    body_top = np.maximum(df['Close'], df['Open'])
    wick = df['High'] - body_top
    return wick

def lower_shadow(df):
    """Return the smaller of Close and Open minus Low (the candle's lower wick)."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    wick = body_bottom - df['Low']
    return wick

## Load Data

In [None]:
# Read at most the first 10,000,000 rows of the training set (nrows caps the load).
df_train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv', nrows=10000000)
# df_train = pd.read_csv('../input/my_data/my_data.csv')

# Per-asset metadata table.
details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

# Discard every row that has a missing value in any column.
df_train = df_train.dropna(axis=0)

# Candle-wick columns derived from the OHLC prices.
df_train['Upper_Shadow'] = upper_shadow(df_train)
df_train['Lower_Shadow'] = lower_shadow(df_train)

In [None]:
# One DataFrame per Asset_ID (0..13), each with a fresh 0-based index.
# Only the first 10000 rows of every asset are kept while debugging; drop the
# .iloc[:10000] to use each asset's full history.
asset_df_dict = {
    asset_id: df_train[df_train["Asset_ID"] == asset_id].iloc[:10000].reset_index(drop=True)
    for asset_id in range(14)
}
len(asset_df_dict)

In [None]:
# Build the lagged-target columns feature_1 ~ feature_k.
def feature_k(df, k=SHIFT_K, target_variable="Target"):
    """Add columns feature_1 .. feature_k holding `target_variable` shifted by 1 .. k rows.

    The columns are written into `df` itself, which is also returned. The
    first `i` rows of feature_i have no earlier value to copy and are NaN.
    """
    for lag in range(1, k + 1):
        lagged_column = f"feature_{lag}"
        df[lagged_column] = df[target_variable].shift(periods=lag)
    return df

# Per-asset FIFO queue of recent target values, oldest first. test_feature_k
# reads it positionally to rebuild feature_1 ~ feature_k for rows that have
# no Target column of their own.
target_dict = {}
for asset_id in range(14):
    # Add the lagged-target columns, then replace every remaining NaN with 0
    # (the shifted columns have no value for their first rows).
    asset_df = feature_k(asset_df_dict[asset_id]).fillna(0)
    asset_df_dict[asset_id] = asset_df
    target_dict[asset_id] = Queue(maxsize=SHIFT_K+1) # queue
    last_row = asset_df.iloc[-1]
    # Seed the queue from the last row: feature_{SHIFT_K-1} ... feature_1,
    # then Target itself -- SHIFT_K values in total, oldest first.
    for k in range(SHIFT_K-1, 0, -1):
        target_dict[asset_id].put(last_row[f"feature_{k}"])
    target_dict[asset_id].put(last_row["Target"])    

In [None]:
# Per-asset 80/20 split. shuffle=False keeps the rows in their original order,
# so each validation frame is the tail of that asset's frame.
training_data_list, validation_data_list = [], []

for asset_id in range(14):
    train_part, valid_part = train_test_split(
        asset_df_dict[asset_id],
        test_size=0.2,
        shuffle=False,
    )
    training_data_list.append(train_part)
    validation_data_list.append(valid_part)

## Dataset

In [None]:
class CryptoDataset(Dataset):
    """Sliding-window dataset built from one DataFrame.

    Sample ``i`` is a pair ``(x, y)``:

    * ``x`` -- the ``features`` columns of rows ``i .. i + seq_length - 1``,
      an array of shape ``(seq_length, len(features))``;
    * ``y`` -- the ``target`` column of row ``i + seq_length``, an array of
      shape ``(1,)``.

    Rows are addressed by position, so the frame's index labels do not matter.
    There are ``len(frame) - seq_length`` samples (none when the frame is not
    longer than ``seq_length``).
    """

    def __init__(self, csv_file, seq_length, features, target):
        """
        Args:
            csv_file: DataFrame (despite the name) holding the ``features``
                and ``target`` columns.
            seq_length: number of consecutive rows in each input window.
            features: list of column names used as model inputs.
            target: name of the column to predict.
        """
        self.csv_file = csv_file
        self.target = target
        self.features = features
        self.seq_length = seq_length
        self.data_length = len(csv_file)

        # All (x, y) pairs are materialised up front; __getitem__ is a lookup.
        self.metrics = self.create_xy_pairs()

    def create_xy_pairs(self):
        """Return the list of ``(x, y)`` window pairs described on the class."""
        # Extract the columns once. Slicing the DataFrame and re-selecting the
        # columns for every window repeats the same work data_length times.
        feature_values = self.csv_file[self.features].values
        target_values = self.csv_file[self.target].values

        pairs = []
        for start in range(self.data_length - self.seq_length):
            stop = start + self.seq_length
            # y keeps a length-1 axis so batches collate to (batch, 1).
            pairs.append((feature_values[start:stop], target_values[stop:stop + 1]))
        return pairs

    def __len__(self):
        return len(self.metrics)

    def __getitem__(self, idx):
        return self.metrics[idx]

In [None]:
# Training loaders: fixed-size batches in the original row order.
params = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=True,   # disregard the last, incomplete batch
    num_workers=2,
)

# Validation loaders: one window at a time, every sample kept.
params_test = dict(
    batch_size=1,
    shuffle=False,
    drop_last=False,  # with batch_size=1 there is no incomplete batch to drop
    num_workers=2,
)

# One dataset and one dataloader per asset, in asset-id order.
training_ds_list = [CryptoDataset(frame, SEQ_LENGTH, FEATURES, TARGET) for frame in training_data_list]
training_dl_list = [DataLoader(dataset, **params) for dataset in training_ds_list]

validation_ds_list = [CryptoDataset(frame, SEQ_LENGTH, FEATURES, TARGET) for frame in validation_data_list]
validation_dl_list = [DataLoader(dataset, **params_test) for dataset in validation_ds_list]

## Model Settings

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Seed NumPy and PyTorch (CPU and CUDA) so runs are repeatable.
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

class LSTM(nn.Module):
    """Stacked LSTM followed by a linear head applied at every time step.

    Inputs are batch-first: ``x`` is ``(batch, seq, input_size)`` and the
    output is ``(batch, seq, output_size)``. Callers that want one prediction
    per sequence take the last time step of the output.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob, directions=1):
        """
        Args:
            input_size: number of features per time step.
            hidden_size: LSTM hidden width per direction.
            num_layers: number of stacked LSTM layers.
            output_size: width of the linear head's output.
            dropout_prob: dropout probability between stacked LSTM layers.
            directions: 1 for a unidirectional LSTM, 2 for a bidirectional one.

        Raises:
            ValueError: if ``directions`` is neither 1 nor 2.
        """
        super().__init__()
        if directions not in (1, 2):
            raise ValueError(f"directions must be 1 or 2, got {directions!r}")

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.directions = directions

        # `directions` must reach nn.LSTM as well: init_hidden_states sizes the
        # states with it, so ignoring it here makes directions=2 fail on the
        # first forward pass.
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout_prob,
            bidirectional=(directions == 2),
        )
        # NOTE(review): this module is constructed but forward() never applies
        # it; it is kept so the attribute stays available to existing code.
        self.dropout = nn.Dropout(dropout_prob)
        # A bidirectional LSTM emits hidden_size features per direction.
        self.linear = nn.Linear(hidden_size * directions, output_size)

    def init_hidden_states(self, batch_size):
        """Return zeroed ``(h0, c0)``, each ``(num_layers * directions, batch_size, hidden_size)``.

        The tensors are created on the device the model's own weights live on,
        so this does not depend on any module-level ``device`` variable.
        """
        state_dim = (self.num_layers * self.directions, batch_size, self.hidden_size)
        weights_device = self.linear.weight.device
        return (
            torch.zeros(state_dim, device=weights_device),
            torch.zeros(state_dim, device=weights_device),
        )

    def forward(self, x, states):
        """Run the LSTM from ``states`` and project every time step.

        Returns:
            ``(out, (h, c))`` where ``out`` is the linear head's output for
            each time step and ``(h, c)`` are the LSTM's final states.
        """
        x, (h, c) = self.lstm(x, states)
        out = self.linear(x)
        return out, (h, c)

In [None]:
# One independent model per asset.
models_list = [LSTM(NUM_FEATURES, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE, DROPOUT).to(device) for _ in range(14)]
# One loss object per asset.
criterion_list = [nn.MSELoss() for _ in range(14)]
# One optimizer per asset, over *all* of that model's parameters. Handing the
# optimizer only model.linear.parameters() would leave the LSTM weights at
# their random initial values: gradients are computed for them, but no
# optimizer step ever applies them.
optimizer_list = [optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01) for model in models_list]

## Training

In [None]:
def save_checkpoint(epoch, min_val_loss, model_state, opt_state, asset_id):
    """Write one asset's checkpoint to ./model_state_{asset_id}.pt.

    Args:
        epoch: zero-based epoch index; the value printed and stored is
            ``epoch + 1``.
        min_val_loss: validation loss that triggered the save.
        model_state: model state dict to store.
        opt_state: optimizer state dict to store.
        asset_id: identifier used in the output file name.
    """
    print(f"New minimum reached at epoch #{epoch + 1}, saving model state...")
    destination = f"./model_state_{asset_id}.pt"
    payload = dict(
        epoch=epoch + 1,
        min_val_loss=min_val_loss,
        model_state=model_state,
        opt_state=opt_state,
    )
    torch.save(payload, destination)


def load_checkpoint(path, model, optimizer):
    """Restore `model` and `optimizer` in place from a checkpoint file.

    Args:
        path: file holding a dict with the keys "epoch", "min_val_loss",
            "model_state" and "opt_state".
        model: module whose state dict is overwritten.
        optimizer: optimizer whose state dict is overwritten.

    Returns:
        ``(model, optimizer, epoch, min_val_loss)`` -- the same model and
        optimizer objects that were passed in, plus the stored epoch and loss.
    """
    # Map stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU still loads on a CPU-only machine.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(path, map_location=map_location)
    min_val_loss = checkpoint["min_val_loss"]
    model.load_state_dict(checkpoint["model_state"])
    optimizer.load_state_dict(checkpoint["opt_state"])
    return model, optimizer, checkpoint["epoch"], min_val_loss


def _carry_states(model, states, batch_size):
    """Return LSTM states for the next batch: detached carry-over, or fresh zeros.

    Fresh zero states are created when there is nothing to carry yet or when
    the carried states were sized for a different batch size.
    """
    if states is None or states[0].size(1) != batch_size:
        return model.init_hidden_states(batch_size)
    # Truncated backpropagation: keep the values, cut the autograd history.
    return tuple(state.detach() for state in states)


def training(asset_id, model, criterion, optimizer, epochs, validate_every=2):
    """Train one asset's model, checkpointing whenever validation loss improves.

    Batches come from the module-level ``training_dl_list[asset_id]`` and
    ``validation_dl_list[asset_id]`` and are moved to the module-level
    ``device``. Two loss curves are plotted once training finishes.

    Args:
        asset_id: index into the per-asset dataloader lists; also used in the
            checkpoint file name.
        model: network to train; must provide ``init_hidden_states(batch_size)``
            and accept ``(x, states)``.
        criterion: loss applied to the last time step of the model output.
        optimizer: optimizer stepping the model's parameters.
        epochs: number of passes over the training loader.
        validate_every: run validation on epochs whose zero-based index is a
            multiple of this value.
    """
    train_loader = training_dl_list[asset_id]
    valid_loader = validation_dl_list[asset_id]

    training_losses = []
    validation_losses = []
    validation_epochs = []  # 1-based epoch number of each validation run
    min_validation_loss = float("inf")

    # Set to train mode
    model.train()

    for epoch in tqdm(range(epochs)):

        # Hidden and cell states, (num_layers * num_directions, batch, hidden_size),
        # are created from the first batch and carried across the epoch.
        states = None
        running_training_loss = 0.0

        # Begin training
        for x_batch, y_batch in train_loader:
            # Convert to Tensors
            x_batch = x_batch.float().to(device)
            y_batch = y_batch.float().to(device)

            states = _carry_states(model, states, x_batch.size(0))

            optimizer.zero_grad()

            # Make prediction
            output, states = model(x_batch, states)

            # Loss on the last time step only
            loss = criterion(output[:, -1, :], y_batch)
            loss.backward()
            running_training_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

        # Average loss across batches
        training_losses.append(running_training_loss / len(train_loader))

        if epoch % validate_every == 0:

            # Set to eval mode
            model.eval()

            # Sized from the validation batches themselves: the validation
            # loader's batch size differs from the training one.
            validation_states = None
            running_validation_loss = 0.0

            # No gradients are needed to score the model.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    # Convert to Tensors
                    x_batch = x_batch.float().to(device)
                    y_batch = y_batch.float().to(device)

                    validation_states = _carry_states(model, validation_states, x_batch.size(0))
                    output, validation_states = model(x_batch, validation_states)
                    validation_loss = criterion(output[:, -1, :], y_batch)
                    running_validation_loss += validation_loss.item()

            # Bookkeeping and checkpointing happen only on epochs that were
            # actually validated, so no stale loss is recorded or compared.
            epoch_validation_loss = running_validation_loss / len(valid_loader)
            validation_losses.append(epoch_validation_loss)
            validation_epochs.append(epoch + 1)

            if epoch_validation_loss < min_validation_loss:
                min_validation_loss = epoch_validation_loss
                # save_checkpoint takes the zero-based epoch and adds 1 itself.
                save_checkpoint(epoch, min_validation_loss, model.state_dict(), optimizer.state_dict(), asset_id)

            # Reset to training mode
            model.train()

    # Visualize loss
    epoch_count = range(1, len(training_losses) + 1)
    plt.plot(epoch_count, training_losses, 'r--')
    plt.legend(['Training Loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

    # Validation points are placed at the epochs they were measured on.
    plt.plot(validation_epochs, validation_losses, 'b--')
    plt.legend(['Validation loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [None]:
# Train every asset's model with its own loss object and optimizer.
per_asset = zip(models_list, criterion_list, optimizer_list)
for asset_id, (model, criterion, optimizer) in enumerate(per_asset):
    print(f"Training asset {asset_id}")
    training(asset_id, model, criterion, optimizer, EPOCHS)

In [None]:
# Restore each asset's best checkpoint into its model and optimizer.
saved_model_list = []
for asset_id, model in enumerate(models_list):
    print(f"load_checkpoint asset {asset_id}")
    checkpoint_path = f"./model_state_{asset_id}.pt"
    model, optimizer, start_epoch, valid_loss_min = load_checkpoint(
        checkpoint_path, model, optimizer_list[asset_id]
    )
    saved_model_list.append(model)
    # Raw value first, then rounded to six decimals.
    print("valid_loss_min = ", valid_loss_min)
    print("valid_loss_min = {:.6f}".format(valid_loss_min))

### Submission

In [None]:
# Competition-provided module; not part of this notebook.
import gresearch_crypto
# NOTE(review): presumably make_env() may only be called once per kernel
# session -- confirm against the competition's API notes before re-running.
env = gresearch_crypto.make_env()
# Iterated by the submission loop as (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test()

In [None]:
# Build feature_1 ~ feature_k for test rows from the per-asset target queues.
def test_feature_k(df, k=SHIFT_K, target_variable="Target"):
    """Return a copy of `df` with columns feature_1 .. feature_k filled in.

    For every row, the values come from ``target_dict[Asset_ID]``: the queue is
    read oldest-first, so its first item becomes feature_k and its k-th item
    becomes feature_1. The queues themselves are not modified.

    Args:
        df: test rows; must contain the columns Asset_ID, timestamp, Count and
            row_id (the last three are cast to integer dtypes on return).
        k: number of lagged-target columns to fill.
        target_variable: unused; accepted so the signature matches feature_k.

    Raises:
        IndexError: if an asset's queue holds fewer than `k` values.
    """
    filled_rows = []
    for _, row in df.iterrows():
        # Snapshot the queue's contents (oldest first) without consuming them.
        history = list(target_dict[int(row.Asset_ID)].queue)
        for i in range(k):
            row[f"feature_{k-i}"] = history[i]
        filled_rows.append(row)
    # iterrows() yields each row as a single-dtype Series, so the id/count
    # columns are cast back to integer dtypes here.
    return pd.DataFrame(filled_rows).astype({'timestamp': 'int64', 'Asset_ID':'int8', 'Count':'int32', 'row_id':'int32'})

In [None]:
# (test_df, sample_prediction_df) = next(iter_test)
# test_df['Upper_Shadow'] = upper_shadow(test_df)
# test_df['Lower_Shadow'] = lower_shadow(test_df)
# test_df
# test_df = test_feature_k(test_df)
# test_df

In [None]:
# test_asset_id_list = test_df.Asset_ID.to_list()
# selected_features = test_df[FEATURES]
# x = torch.Tensor(selected_features.values)

In [None]:
# idx = 1
# asset_id = 3

# x = x[idx].unsqueeze(0)
# x = x.float().to(device)
# x = x.view(1, -1, NUM_FEATURES)
# model = saved_model_list[asset_id]
# model.eval()
# validation_states = model.init_hidden_states(1)
# validation_states = [state.detach() for state in validation_states]
# output, _ = model(x, validation_states)
# pred = output[:, -1, :].item()

In [None]:
# Inference only: put every model in eval mode once, up front.
for saved_model in saved_model_list:
    saved_model.eval()

for (test_df, sample_prediction_df) in iter_test:
    pred_list = []
    test_df['Upper_Shadow'] = upper_shadow(test_df) # test Upper_Shadow
    test_df['Lower_Shadow'] = lower_shadow(test_df) # test Lower_Shadow

    test_df = test_feature_k(test_df) # get test data feature_1 ~ feature_k
    test_asset_id_list = test_df.Asset_ID.to_list() # get asset_id_list

    selected_features = test_df[FEATURES]
    x_values = torch.Tensor(selected_features.values)

    # No gradients are needed for prediction; skipping them avoids building an
    # autograd graph for every row.
    with torch.no_grad():
        for idx, asset_id in enumerate(test_asset_id_list):
            # One row becomes a (batch=1, seq=1, NUM_FEATURES) input.
            # NOTE(review): the training windows are SEQ_LENGTH rows long while
            # this feeds a single time step -- confirm that is intended.
            x = x_values[idx].float().to(device).view(1, -1, NUM_FEATURES)
            model = saved_model_list[asset_id]
            validation_states = model.init_hidden_states(1)
            output, _ = model(x, validation_states)
            pred = output[:, -1, :].item()

            # Slide the asset's lag window: drop the oldest value and append
            # this prediction as the newest "target". The non-blocking calls
            # raise on an empty or full queue instead of waiting forever.
            target_dict[asset_id].get_nowait()
            target_dict[asset_id].put_nowait(pred)
            pred_list.append(pred)

    # Predictions are written in test_df row order.
    sample_prediction_df['Target'] = pred_list
    env.predict(sample_prediction_df)