In [1]:
import os
import pandas as pd
import numpy as np 

import torch
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import Subset,Dataset, DataLoader, random_split

import pytorch_lightning as pl 
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import Callback

from pytorch_forecasting.metrics import SMAPE

In [2]:
train_x = pd.read_csv('./Datasets/train_x_df.csv')
train_y = pd.read_csv('./Datasets/train_y_df.csv')

test_x = pd.read_csv('./Datasets/test_x_df.csv')

In [34]:
print(train_x['sample_id'].nunique())
print(train_x['time'].nunique())
print(train_x['coin_index'].nunique())

7362
1380
10


In [6]:
class Encoder(pl.LightningModule):
    def __init__(self, input_size, hidden_size, n_layers, dropout_p):
        super(Encoder, self).__init__()     
        self.rnn = nn.GRU(input_size = input_size, 
                          hidden_size = hidden_size, 
                          num_layers = n_layers, 
                          batch_first = True, 
                          dropout = dropout_p)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_size).zero_()

    def forward(self, x):
        h_0 = self._init_state(batch_size=x.size(0))
        output, hidden = self.rnn(x, h_0)
        return output, hidden

In [7]:
class Decoder(pl.LightningModule):
    def __init__(self, hidden_size, target_size):
        super(Decoder, self).__init__()
        self.out = nn.Linear(hidden_size, target_size)

    def forward(self, enc_output):
        output = enc_output[:, -1, :]
        pred = self.out(output)
        pred = pred.squeeze()
        return pred

In [8]:
class Seq2seqLightningModule(pl.LightningModule):
    def __init__(self, hparams):
        super(Seq2seqLightningModule, self).__init__()
        self.hparams = hparams
        self.id_emb = nn.Embedding(self.hparams.n_id+1, self.hparams.embed_size)
        self.time_emb = nn.Embedding(self.hparams.n_time+1, self.hparams.embed_size)
        self.sec_emb = nn.Embedding(self.hparams.n_sec+1, self.hparams.embed_size)
        self.coin_emb = nn.Embedding(self.hparams.n_coin+1, self.hparams.embed_size)
        
        self.encoder = Encoder(self.hparams.input_size, self.hparams.hidden_size, self.hparams.n_layers, self.hparams.dropout_p)
        self.decoder = Decoder(self.hparams.hidden_size, self.hparams.target_size)

        self.layer_norm = nn.LayerNorm(self.hparams.input_size)
        
    def forward(self, x_id, x_time, x_sec, x_coin, x_values):
        embed_x = self.id_emb(x_id) + self.time_emb(x_time) + self.coin_emb(x_coin) + self.sec_emb(x_sec) # [batch_size, input_dim, embed_size]
        embed_x = torch.mean(embed_x, axis=-1) # [batch_size, input_dim]
        x = embed_x + x_values
        
        x = self.layer_norm(x)
        
        enc_output, _ = self.encoder(x.unsqueeze(1))
        pred = self.decoder(enc_output)
        return pred


    def training_step(self, batch, batch_idx):
        x_id, x_time, x_coin, x_sec = batch['id'], batch['time'], batch['coin'], batch['sec_10']
        x_values = batch['open_val']
        y = batch['labels']
        y_hat = self(x_id, x_time, x_sec, x_coin, x_values)
        loss = F.mse_loss(y_hat, y)
        self.log('trn_loss', loss, on_step=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x_id, x_time, x_coin, x_sec = batch['id'], batch['time'], batch['coin'], batch['sec_10']
        x_values = batch['open_val']
        y = batch['labels']
        y_hat = self(x_id, x_time, x_sec, x_coin, x_values)
        loss = F.mse_loss(y_hat, y)
        self.log('val_loss', loss, on_step=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return optimizer

In [28]:
n_id = 7362
n_time = 1380+119
n_coin = 10
n_sec = 138+11

In [29]:
import easydict 

h_params = easydict.EasyDict({'n_id': n_id,
                              'n_time': n_time,
                              'n_sec': n_sec,
                              'n_coin': n_coin,
                              'input_size': 5,
                              'embed_size': 32,
                              'hidden_size': 64,
                              'target_size': 1,
                              'n_layers': 2,
                              'dropout_p': 0.2
                             })

In [30]:
model = Seq2seqLightningModule(h_params)

In [42]:
class BitcoinDataset(Dataset):
    def __init__(self, df, ws):
        self.df = df
        self.ws = ws
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        start_idx = index
        end_idx = index+self.ws
        
        sample_id = self.df.iloc[start_idx:end_idx, 0].values
        time = self.df.iloc[start_idx:end_idx, 1].values
        coin = self.df.iloc[start_idx:end_idx, 2].values
        open_val = self.df.iloc[start_idx:end_idx, 3].values
        sec_10 = self.df.iloc[start_idx:end_idx, -1].values
        
        labels = self.df.iloc[end_idx, 3]
        return {
            'id': torch.tensor(sample_id, dtype=torch.long),
            'time': torch.tensor(time, dtype=torch.long),
            'sec_10': torch.tensor(sec_10, dtype=torch.long),
            'coin': torch.tensor(coin, dtype=torch.long),
            'open_val': torch.tensor(open_val, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.float32)
        }

In [11]:
class BitcoinDataModule(pl.LightningDataModule):
    def __init__(self, window_size, batch_size):
        super().__init__()
        self.ws = window_size
        self.batch_size = batch_size
        
    def setup(self,stage=None):
        train_x = pd.read_csv('./Datasets/train_x_df.csv')
        train_y = pd.read_csv('./Datasets/train_y_df.csv')
        train_y['time']+=1380
    
        train = pd.concat([train_x, train_y], axis=0)
        train = train.sort_values(by=['sample_id', 'time'])
        train['sec_10'] = train['time'].apply(lambda x: x//10)
        
        trn_subset_idx = train.loc[train['time']<=1439].index # 0~24hour
        val_subset_idx = train.loc[(train['time']>1439) & (train['time']<1500-self.ws)].index #25hour
        
        dataset=BitcoinDataset(train, ws=self.ws)
        
        self.train_dataset = Subset(dataset, trn_subset_idx)
        self.valid_dataset = Subset(dataset, val_subset_idx)
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=4)
    
    def val_dataloader(self):
        return DataLoader(self.valid_dataset,
                          batch_size=self.batch_size,
                          num_workers=4)

In [12]:
bitcoin_dm = BitcoinDataModule(5, 2048)

In [13]:
bitcoin_dm.setup()

In [14]:
model_checkpoint = ModelCheckpoint(monitor = "val_loss",
                                   verbose=True,
                                   filename="{epoch}_{val_loss:.4f}")

early_stop_callback = EarlyStopping(monitor='val_loss', 
                                    patience=2, 
                                    verbose=True, 
                                    mode='min')

In [15]:
SEED = 42
EPOCHS = 10
device = "cuda" if torch.cuda.is_available() else "cpu"
pl.seed_everything(SEED)

Global seed set to 42


42

In [16]:
trainer = pl.Trainer(gpus=1, 
                     max_epochs=EPOCHS,
                     callbacks=[model_checkpoint, early_stop_callback]
                    )

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [17]:
trainer.fit(model, bitcoin_dm)


  | Name       | Type      | Params
-----------------------------------------
0 | id_emb     | Embedding | 235 K 
1 | time_emb   | Embedding | 48.0 K
2 | sec_emb    | Embedding | 4.8 K 
3 | coin_emb   | Embedding | 352   
4 | encoder    | Encoder   | 38.6 K
5 | decoder    | Decoder   | 65    
6 | layer_norm | LayerNorm | 10    
-----------------------------------------
327 K     Trainable params
0         Non-trainable params
327 K     Total params
1.310     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 5176: val_loss reached 0.00095 (best 0.00095), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=0_val_loss=0.0010.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 10353: val_loss reached 0.00074 (best 0.00074), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=1_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, step 15530: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 20707: val_loss reached 0.00074 (best 0.00074), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=3_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 25884: val_loss reached 0.00073 (best 0.00073), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=4_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 31061: val_loss reached 0.00072 (best 0.00072), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=5_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 36238: val_loss reached 0.00071 (best 0.00071), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=6_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 41415: val_loss reached 0.00070 (best 0.00070), saving model to "/home/yyeon/KeepGo/My-Competition-Struggle/Dacon/BitCoin/lightning_logs/version_20/checkpoints/epoch=7_val_loss=0.0007.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, step 46592: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, step 51769: val_loss was not in top 1


1

In [3]:
MODEL_PATH = './lightning_logs/version_20/checkpoints/epoch=7_val_loss=0.0007.ckpt'
HPARAM_PATH = './lightning_logs/version_20/hparams.yaml'

In [46]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [48]:
model = Seq2seqLightningModule.load_from_checkpoint(checkpoint_path=MODEL_PATH,
                                                    hparams_file = HPARAM_PATH)
model = model.to(device)
model.eval()
model.freeze()

RuntimeError: CUDA error: device-side assert triggered

In [42]:
model.hparams['on_gpu']=True

In [10]:
test_x['sec_10'] = test_x['time'].apply(lambda x: x//10)
test_x = test_x.groupby('sample_id').tail(5)

In [11]:
test_x = test_x.reset_index(drop=True)

In [12]:
class BitcoinTestDataset(Dataset):
    def __init__(self, df, ws):
        self.df = df
        self.ws = ws
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        start_idx = index
        end_idx = index+self.ws
        
        sample_id = self.df.iloc[start_idx:end_idx, 0].values
        time = self.df.iloc[start_idx:end_idx, 1].values
        coin = self.df.iloc[start_idx:end_idx, 2].values
        open_val = self.df.iloc[start_idx:end_idx, 3].values
        sec_10 = self.df.iloc[start_idx:end_idx, -1].values
        return {
            'id': torch.tensor(sample_id, dtype=torch.long),
            'time': torch.tensor(time, dtype=torch.long),
            'sec_10': torch.tensor(sec_10, dtype=torch.long),
            'coin': torch.tensor(coin, dtype=torch.long),
            'open_val': torch.tensor(open_val, dtype=torch.float32),
        }

In [17]:
test_ds = BitcoinTestDataset(test_x, 5)
test_subset_idx = [i for i in test_x.index if i%5==0]
        
test_dataset = Subset(test_ds, test_subset_idx)

In [18]:
test_dataloader= DataLoader(test_dataset,
                            batch_size=32,
                            num_workers=4)

In [43]:
result = []
print(device)
for step, batch in enumerate(test_dataloader):
    x_id, x_time, x_coin, x_sec, x_values =  batch['id'].to(device), batch['time'].to(device), batch['coin'].to(device), batch['sec_10'].to(device)
    x_values = batch['open_val'].to(device)
    y_hat = model(x_id, x_time, x_coin, x_sec, x_values)
    result.extend(y_hat.cpu().detach().numpy().tolist())

cuda


RuntimeError: CUDA error: device-side assert triggered

In [38]:
len(set(train_x['sample_id'].values) -  set(test_x['sample_id'].values))

6833

In [39]:
train_x['sample_id'].nunique()

7362

In [40]:
test_x['sample_id'].nunique()

529

In [35]:
test_x.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,sec_10
0,0,1375,7,0.997398,0.999115,0.997371,0.998256,411115.0,152972.4375,324.623047,221016.265625,82244.789062,137
1,0,1376,7,0.998256,0.999624,0.998256,0.998873,656922.5,244636.890625,448.033478,395109.75,147137.734375,137
2,0,1377,7,0.999168,0.999839,0.999168,0.999785,328069.96875,122220.28125,222.675323,215481.84375,80277.742188,137
3,0,1378,7,0.999624,1.00051,0.999598,1.0,301314.875,112317.3125,321.940216,130979.765625,48832.816406,137
4,0,1379,7,1.0,1.000322,0.9989,1.000322,403639.25,150405.3125,313.891724,215691.625,80365.726562,137


In [36]:
len(set(train_x['coin_index'].values) -  set(test_x['coin_index'].values))

0