### 导入需要的库

In [None]:
import glob  # Read File Routes
import os
import sys

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [None]:
# Smoke-test the loader on the stock "0" trade partition of the test split.
# NOTE(review): reorganize_data_by_time is defined in a later cell, so this
# cell only works once that definition has been executed.
reorganize_data_by_time(mode="test", data_type="trade", stock="0")

### 定义常量

In [None]:
# Root directory of the competition data; used as the default `path` of the
# loader functions defined below.
DATA_ROOT = "/kaggle/input/optiver-realized-volatility-prediction"
# NOTE(review): not referenced anywhere in the visible part of the notebook.
DATA_FEATURE_COUNT = 10
# Batch size handed to the training DataLoader.
BATCH_SIZE = 1
# Width of the last axis of the zero-padded per-bucket feature arrays.
FEATURES_COUNT = 14

### 导入数据并进行预处理

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    ``list_stock_prices`` must be an object whose ``np.log`` result exposes
    ``.diff()`` (for example a pandas Series). The first element of the
    result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
def reorganize_data_by_time(mode, data_type, stock="*", path=DATA_ROOT):
    """Load parquet data and split it into one DataFrame per ``time_id``.

    Args:
        mode: Split name used in the directory name ("train" or "test").
        data_type: Data family used in the directory name ("trade" or "book").
        stock: Value substituted into ``stock_id=<stock>``. The default "*"
            is a glob wildcard.
        path: Root directory of the dataset.

    Returns:
        A dict mapping each ``time_id`` to the rows carrying that id,
        re-indexed from 0 and with the ``time_id`` column dropped. The dict
        is empty when no file matches. When several files match, only the
        last file read is reflected in the result.
    """
    file_paths = glob.glob(f'{path}/{data_type}_{mode}.parquet/stock_id={stock}/*')
    # Bound up front so an empty glob returns {} instead of raising
    # UnboundLocalError at the return statement.
    books_by_time = {}
    for file_path in tqdm(file_paths, desc="Reading From parquet:"):
        book_df = pd.read_parquet(file_path)
        # One pass over the frame; sort=False keeps first-appearance order.
        books_by_time = {
            time_id: rows.reset_index(drop=True).drop("time_id", axis=1)
            for time_id, rows in book_df.groupby("time_id", sort=False)
        }

    return books_by_time
    

def load_predict_data(mode, path=DATA_ROOT):
    """Read ``<path>/<mode>.csv`` into a DataFrame.

    Args:
        mode: Either "train" or "test"; selects which CSV is read.
        path: Directory containing the CSV files.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        OSError: If ``mode`` is neither "train" nor "test".
    """
    if mode not in ("train", "test"):
        # Exception type kept as OSError for existing callers; the message
        # now says what was wrong.
        raise OSError(f"mode must be 'train' or 'test', got {mode!r}")
    return pd.read_csv(f'{path}/{mode}.csv')

In [None]:

# Scratch version of the per-time_id feature building.
# Expects `trade_data`, `book_data` (dicts keyed by time_id) and
# `stock_all_time_data` to have been created by other cells.
for idx, time_id in tqdm(enumerate(trade_data), desc="Organizing Data by time", total=len(trade_data)):
    stock_data = (
        pd.merge(trade_data[time_id], book_data[time_id], how='outer')
        .sort_values(by=['seconds_in_bucket'])
        .fillna(0)
    )
    # Weighted average price from the first book level.
    stock_data['wap'] = (
        stock_data['bid_price1'] * stock_data['ask_size1']
        + stock_data['ask_price1'] * stock_data['bid_size1']
    ) / (stock_data['bid_size1'] + stock_data['ask_size1'])
    stock_data["log_return"] = log_return(stock_data['wap'])
    # Zero the NaNs left by the division and by the leading diff element.
    stock_data.fillna(0, inplace=True)
    # Zero-pad the bucket to a fixed number of rows.
    pad_data = np.zeros(shape=(800, FEATURES_COUNT))
    print(stock_data.shape)
    # Copy by row position (not by index label) so the sorted order is what
    # ends up in the array; raises if the bucket has more than 800 rows.
    n_rows = len(stock_data)
    pad_data[:n_rows] = stock_data.to_numpy()
    stock_all_time_data[idx] = pad_data

### 组建Dataset

In [None]:
from torch.utils.data import Dataset
from torch import tensor

class TrainDataset(Dataset):
    """Dataset that yields, per stock_id, the zero-padded features of its time buckets.

    Trade and book parquet data are not preloaded because they are too
    large; they are read for a single stock inside ``__getitem__``.

    Each item is a dict with:
        "data": tensor of shape (max_stock_len, max_seq_len, FEATURES_COUNT),
            one padded block per time_id, in trade-data iteration order.
        "label": tensor built from the ``target`` column (NaN when the
            loaded CSV has no such column).
        "seq_len": tensor with the unpadded row count of each time_id.
    """
    # TODO: Logger

    def __init__(self, mode="train"):
        self.mode = mode
        self.target = load_predict_data(self.mode)
        max_stock_id = self.target["stock_id"].max()
        # Items are addressed by stock_id, so ids 0..max inclusive need
        # max + 1 slots; an empty CSV gives length 0.
        self.length = 0 if pd.isna(max_stock_id) else int(max_stock_id) + 1
        self.max_seq_len = 800
        self.max_stock_len = 8000

    @staticmethod
    def _build_bucket_features(trade_df, book_df):
        """Merge one bucket's trade and book rows and append ``wap`` and ``log_return``."""
        bucket = (
            pd.merge(trade_df, book_df, how='outer')
            .sort_values(by=['seconds_in_bucket'])
            .fillna(0)
        )
        # Weighted average price from the first book level.
        bucket['wap'] = (
            bucket['bid_price1'] * bucket['ask_size1']
            + bucket['ask_price1'] * bucket['bid_size1']
        ) / (bucket['bid_size1'] + bucket['ask_size1'])
        bucket["log_return"] = log_return(bucket['wap'])
        # Zero the NaNs left by the division and by the leading diff element.
        bucket.fillna(0, inplace=True)
        return bucket

    def __getitem__(self, stock_id):
        trade_data = reorganize_data_by_time(self.mode, "trade", str(stock_id))
        book_data = reorganize_data_by_time(self.mode, "book", str(stock_id))

        # NOTE(review): this reads the row whose index label equals stock_id,
        # which is not necessarily a row belonging to that stock -- confirm
        # which target(s) the label is meant to hold.
        if "target" in self.target.columns:
            label = self.target.loc[stock_id, "target"]
        else:
            # No target column in the loaded CSV: use a NaN placeholder
            # instead of failing with KeyError.
            label = float("nan")

        stock_data_lengths = []
        stock_all_time_data = np.zeros(
            shape=(self.max_stock_len, self.max_seq_len, FEATURES_COUNT)
        )
        for idx, time_id in tqdm(enumerate(trade_data), desc="Organizing Data by time", total=len(trade_data)):
            stock_data = self._build_bucket_features(trade_data[time_id], book_data[time_id])
            n_rows = stock_data.shape[0]
            stock_data_lengths.append(n_rows)
            # Copy by row position (not by index label) so the sorted order
            # is preserved; rows past n_rows stay zero as padding. Raises if
            # the bucket has more than max_seq_len rows.
            stock_all_time_data[idx, :n_rows] = stock_data.to_numpy()
        # data : (time, seconds, data)
        return {"data": tensor(stock_all_time_data), "label": tensor(label), 'seq_len': tensor(stock_data_lengths)}

    def __len__(self):
        # Plain int: a fixed-width numpy cast could wrap around silently.
        return int(self.length)

### 组建DataLoader

In [None]:
from torch.utils.data import DataLoader
# Training loader: BATCH_SIZE items per batch, served in index order.
train_data = DataLoader(
    dataset=TrainDataset(mode="train"),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Loader over the test split, one item per batch, served in index order.
aa = DataLoader(
    dataset=TrainDataset(mode="test"),
    batch_size=1,
    shuffle=False,
)

In [None]:
# Pull a single batch from the test loader and display it.
# The loop variable `i` stays bound to that batch after the loop and is
# reused by the cells below.
for i in aa:
    print(i)
    break

### 组建PackedSequence

In [None]:
# Scratch check: indexing an array with its reversed argsort lists the
# values in descending order.
np.array([1, 4,2,13])[np.argsort(np.array([1,4, 2,13]))[::-1]]

In [None]:
# Inspect the first batch entry's sequence lengths reordered by `order_idx`.
# NOTE(review): `order_idx` is assigned in a later cell; run that cell first.
i["seq_len"][0][order_idx[0]]

In [None]:
# Inspect the first entry of `order_idx`.
# NOTE(review): `order_idx` is assigned in a later cell; run that cell first.
order_idx[0]

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence

# `i` is the batch dict left bound by the loader loop above: i['data'] holds
# one padded (time, seconds, features) block per sample and i["seq_len"] the
# matching per-time_id row counts.
for batch, seq_len in zip(i['data'], i["seq_len"]):
    # Order this sample's time buckets by length, longest first. Sorting the
    # 1-D per-sample lengths avoids reversing along the batch axis.
    order_idx = torch.argsort(seq_len, descending=True)
    print('order_idx:', str(order_idx))
    # Selecting by order_idx also drops the all-zero padding buckets beyond
    # the number of recorded lengths.
    order_x = batch[order_idx]
    order_seq = seq_len[order_idx]
    # Pack it
    pack = pack_padded_sequence(order_x, order_seq, batch_first=True, enforce_sorted=False)
    # NOTE(review): the key 'data ' has a trailing space, so the packed
    # sequence is stored next to (not over) i['data'] -- confirm intent.
    i['data '] = pack

### 实例化RNN网络

In [None]:
from torch.nn import RNN

# Stacked RNN over per-row feature vectors; input_size must stay equal to
# FEATURES_COUNT (14).
rnn = RNN(input_size=14, hidden_size=1, num_layers=20)
# The optimiser below and the training call refer to the model as `net`.
net = rnn
criterion = torch.nn.CrossEntropyLoss()
# NOTE: the misspelled name `optimzier` is kept because the training call
# at the end of the notebook uses it.
optimzier = torch.optim.Adadelta(net.parameters(), 1e-1)

def get_acc(output, label):
    """Count the rows of ``output`` whose arg-max along dim 1 equals ``label``.

    Despite the name, the result is the number of correct predictions (a
    tensor), not a ratio; the training loop divides it by the batch size.
    """
    pred_label = output.max(1)[1]
    matches = pred_label == label
    return matches.sum().data

def _run_epoch(net, batches, criterion, use_cuda, optimizer=None):
    """Run one pass over ``batches`` and return the summed (loss, accuracy) terms.

    Each batch contributes ``loss / batch_size`` and ``correct / batch_size``.
    When ``optimizer`` is given, a backward pass and an optimiser step are
    performed for every batch.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    for im, label in batches:
        if use_cuda:
            im = im.cuda()
            label = label.cuda()
        # forward
        output = net(im)
        total = output.shape[0]
        loss = criterion(output, label)
        if optimizer is not None:
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_sum += loss.item() / float(total)
        acc_sum += get_acc(output, label).item() / float(total)
    return loss_sum, acc_sum


def train(net, train_data, valid_data, num_epochs, optimizer, criterion):
    """Train ``net`` for ``num_epochs`` epochs and print per-epoch metrics.

    Args:
        net: Model called as ``net(batch)``; moved to CUDA when available.
        train_data: Sized iterable of ``(input, label)`` batches.
        valid_data: Sized iterable of ``(input, label)`` batches evaluated
            after every epoch, or None to skip validation.
        num_epochs: Number of passes over ``train_data``.
        optimizer: Optimiser stepped once per training batch.
        criterion: Loss called as ``criterion(output, label)``.

    Returns:
        None; metrics are printed.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        net = net.cuda()
    for i in range(num_epochs):
        net.train()
        train_loss, train_acc = _run_epoch(net, train_data, criterion, use_cuda, optimizer)
        if valid_data is not None:
            net.eval()
            # no_grad replaces the removed `volatile=True` flag: validation
            # must not build an autograd graph.
            with torch.no_grad():
                valid_loss, valid_acc = _run_epoch(net, valid_data, criterion, use_cuda)
            print("epoch: %d, train_loss: %f, train_acc: %f, valid_loss: %f, valid_acc:%f"
                  % (i, train_loss/len(train_data),  train_acc/len(train_data),
                  valid_loss/len(valid_data),  valid_acc/len(valid_data)))

        else:
            print("epoch= ", i, "train_loss= ", train_loss/len(train_data), "train_acc= ", train_acc/len(train_data))
# Start training.
# NOTE(review): `test_data` is not assigned anywhere in the visible notebook.
train(
    net,
    train_data,
    valid_data=test_data,
    num_epochs=10,
    optimizer=optimzier,
    criterion=criterion,
)