In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import gc
from libs.utils import try_gpu, QuantileLoss
from libs.tft_model import TFT

split train set and test set

In [2]:
data_tick = pd.read_hdf('../data/data_tick_2months.h5', key='tick_data')
data_stock = data_tick.xs('688303.XSHG', level='order_book_id') #! 取第x只股票
date_list = data_stock.trading_date.unique() # 取出所有交易日的date

df_tick_test = data_stock[data_stock['trading_date'] == date_list[-1]]  # 取最后一天数据为测试集
df_tick_train = data_stock[data_stock['trading_date'] != date_list[-1]] # 其他交易日数据为测试集

data_day = pd.read_hdf('../data/data_20days.h5', key='day_data')
data_stock = data_day.xs('688303.XSHG', level='order_book_id') #! 取第x只股票
day_list = data_stock.index.unique() # 取出所有交易日的date

# df_day_test = data_stock[(data_stock['date'] >= day_list[-21]) & (data_stock['date'] < day_list[-1])]  
# df_day_train = data_stock[data_stock['date'] < day_list[-2]] 
df_day_test = data_stock.loc[day_list[-21]: day_list[-1]]
df_day_train = data_stock.loc[: day_list[-2]]

resample tick data in 3s pieces

In [3]:
df_tick_list = []
for i in range(len(date_list)-1):
    data_piece = df_tick_train[df_tick_train['trading_date'] == date_list[i]]
    data_piece = data_piece.resample('3S', origin=f'{date_list[i].date()} 09:30:00').ffill()
    locs1 = data_piece.index.indexer_between_time('09:30:00', '11:30:00', include_start=True,include_end=True)
    locs2 = data_piece.index.indexer_between_time('13:00:00', '15:00:00', include_start=True,include_end=True)
    data_piece = pd.concat((data_piece.iloc[locs1], data_piece.iloc[locs2]), axis=0)
    data_piece[['volume', 'total_turnover', 'num_trades']] = data_piece[['volume', 'total_turnover', 'num_trades']].diff().bfill()
    df_tick_list.append(data_piece)
df_tick_train = pd.concat(df_tick_list, axis=0)

df_tick_test = df_tick_test.resample('3S', origin=f'{date_list[-1].date()} 09:30:00').ffill()
locs1 = df_tick_test.index.indexer_between_time('09:30:00', '11:30:00', include_start=True,include_end=True)
locs2 = df_tick_test.index.indexer_between_time('13:00:00', '15:00:00', include_start=True,include_end=True)
df_tick_test = pd.concat((df_tick_test.iloc[locs1], df_tick_test.iloc[locs2]), axis=0)
df_tick_test[['volume', 'total_turnover', 'num_trades']] = df_tick_test[['volume', 'total_turnover', 'num_trades']].diff().bfill()

gc.collect()

0

Dataset and DataLoader

In [4]:
class Dataset_train(Dataset):
    def __init__(self, df_tick_train, df_day_train):
        #self.time_steps = time_steps
        self.df_tick = df_tick_train
        self.date_list = df_tick_train.trading_date.unique()
        self.df_day = df_day_train

    def __getitem__(self, index):
        day_num = index // 4802

        static_feats_numeric = self.df_day.iloc[day_num: day_num+20, :]
        static_feats_numeric = static_feats_numeric.values.flatten()
        static_feats_numeric = torch.tensor(static_feats_numeric, dtype=torch.float32)

        historical_ts_numeric = self.df_tick[self.df_tick['trading_date'] == self.date_list[day_num]]
        historical_ts_numeric = historical_ts_numeric.drop(columns=['trading_date'])
        num = index % (4802 - 5*20*3 - 20*3 + 1)
        label = (historical_ts_numeric.iloc[num+5*20*3+20*3-1]['last'] - historical_ts_numeric.iloc[num+5*20*3]['last'])/historical_ts_numeric.iloc[num+5*20*3]['last']
        label = torch.tensor(label, dtype=torch.float32)
        historical_ts_numeric = historical_ts_numeric.iloc[num:num+5*20*3, :]
        historical_ts_numeric = torch.tensor(historical_ts_numeric.values, dtype=torch.float32)
        
        return static_feats_numeric, historical_ts_numeric, label

    def __len__(self):
        return len(self.date_list) * (4802 - 5*20*3 - 20*3 + 1)


def LoadData_train(df_tick_train, df_day_train, batch_size, shuffle=False, num_workers=0):
    dataset = Dataset_train(df_tick_train, df_day_train)
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [5]:
class Dataset_test(Dataset):
    def __init__(self, df_tick_test, df_day_test):
        # self.time_steps = time_steps
        self.df_tick = df_tick_test
        self.df_day = df_day_test       

    def __getitem__(self, index):
        static_feats_numeric = self.df_day.values.flatten()
        static_feats_numeric = torch.tensor(static_feats_numeric, dtype=torch.float32)

        historical_ts_numeric = self.df_tick
        label = (historical_ts_numeric.iloc[index+5*20*3+20*3-1]['last'] - historical_ts_numeric.iloc[index+5*20*3]['last'])/historical_ts_numeric.iloc[index+5*20*3]['last']
        label = torch.tensor(label, dtype=torch.float32)
        historical_ts_numeric = historical_ts_numeric.iloc[index:index+5*20*3, :]
        historical_ts_numeric = torch.tensor(historical_ts_numeric.values, dtype=torch.float32)

        return static_feats_numeric, historical_ts_numeric, label

    def __len__(self):
        return self.df_tick.shape[0] - 5*20*3 - 20*3 + 1


def LoadData_test(df_tick_test, df_day_test, batch_size, shuffle=False, num_workers=0):
    dataset = Dataset_test(df_tick_test, df_day_test)
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

Train the model

In [6]:
config = {
    "data_props": {
        "num_historical_numeric": 31,
        "num_historical_categorical": 0,
        "historical_categorical_cardinalities": [],
        "num_static_numeric": 200,
        "num_static_categorical": 0,
        "static_categorical_cardinalities": [],
        #"num_future_numeric": 0,
        #"num_future_categorical": 0,
        #"future_categorical_cardinalities": [],
    },
    "model": {
        "attention_heads": 1,
        "dropout": 0.3,
        "lstm_layers": 2,
        "output_quantiles": [0.5], # [0.1, 0.5, 0.9],
        "state_size": 256
    },
    "task_type": 'regression',
    "target_window_start": None,
}

model = TFT(config)
device = try_gpu(i=0)
model.to(device)

loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
num_epochs = 30
batch_size = 64 * 3
data_iter = LoadData_train(df_tick_train, df_day_train, batch_size, shuffle=False, num_workers=0)

In [7]:
for epoch in tqdm(range(num_epochs)):
    running_loss = 0.0
    for static_feats_numeric, historical_ts_numeric, label in data_iter:
        batch = {
            'static_feats_numeric': static_feats_numeric,  # 静态数值特征，形状：[num_samples x num_static_numeric]
            'historical_ts_numeric': historical_ts_numeric,  # 历史数值时间序列，形状：[num_samples x num_historical_steps x num_historical_numeric]
            'static_feats_categorical': torch.empty(1),
            'historical_ts_categorical': torch.empty(1),
        }
        for k in batch.keys():
            batch[k] = batch[k].to(device)
        label = label.to(device)

        optimizer.zero_grad()
        output = model(batch)
        output = output['predicted_quantiles']

        l = loss(output, label)

        l.backward()
        optimizer.step()

        running_loss += l.item()
    
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {(running_loss / len(data_iter)):.16f}")

  3%|▎         | 1/30 [28:32<13:47:39, 1712.40s/it]

Epoch [1/30], Loss: nan


  3%|▎         | 1/30 [30:46<14:52:32, 1846.63s/it]


KeyboardInterrupt: 

Test the model / Inference