In [1]:
import sys
from pathlib import Path

main_path = Path('..').resolve()
sys.path.append(str(main_path))

from src.dataset import MetaStockDataset
from src.utils import ARGProcessor
import numpy as np

In [2]:
setting_file = Path('.') / 'kdd.yml'

meta_args = ARGProcessor(setting_file=setting_file)
data_kwargs = meta_args.get_args(cls=MetaStockDataset)

In [3]:
meta_train = MetaStockDataset(meta_type='train', **data_kwargs)
meta_test1 = MetaStockDataset(meta_type='test1', **data_kwargs)
meta_test2 = MetaStockDataset(meta_type='test2', **data_kwargs)
meta_test3 = MetaStockDataset(meta_type='test3', **data_kwargs)


Processing data and candidates for train: 100%|██████████| 40/40 [00:02<00:00, 14.99it/s]
Processing data and candidates for test1: 100%|██████████| 40/40 [00:01<00:00, 34.87it/s]
Processing data and candidates for test2: 100%|██████████| 10/10 [00:00<00:00, 14.33it/s]
Processing data and candidates for test3: 100%|██████████| 10/10 [00:00<00:00, 28.50it/s]


In [4]:
test_tasks = meta_test1.generate_all()

In [5]:
tasks = test_tasks[5]

In [6]:
batch_data = meta_test1.map_to_tensor(tasks, device='cpu')

In [6]:
meta_test1.init_data()

In [9]:
import torch

In [11]:
loader = torch.utils.data.DataLoader(meta_test1, batch_size=32)

In [15]:
a = {1: 2, 3: 4}
a.pop(1)

2

In [16]:
a

{3: 4}

In [12]:
for x in loader:
    break

In [14]:
x['support'].size()

torch.Size([32, 5, 11])

In [8]:
from collections import defaultdict
from src.dataset import flatten

In [9]:
all_tasks = defaultdict()
for window_size in meta_test1.window_sizes:
    tasks = defaultdict(list)
    for symbol in meta_test1.symbols:
        data = meta_test1.generate_support_query(symbol, window_size)
        for k, v in data.items():
            tasks[k].extend(v)

In [11]:
len(tasks['support'])

3702

In [48]:
all_tests = defaultdict()
for window_size in meta_train.window_sizes:
    tasks = defaultdict(list)
    for symbol in meta_train.symbols:
        df_stock = meta_train.data[symbol]
        labels_indices = meta_train.candidates[symbol]
        y_test_end = labels_indices[labels_indices >= window_size]
        y_test_start = y_test_end - window_size
        inputs, labels = meta_train.generate_data(df_stock, y_test_start, y_test_end)

        y_s = np.array(sorted(np.random.choice(labels_candidates, size=(self.n_sample,), replace=False)))
        y_ss = y_s-window_size
        support, support_labels = self.generate_data(df_stock, y_start=y_ss, y_end=y_s)
        
        # code for jumpped tags like [1(support), 0, 0, 1(query)]
        # y_q = labels_indices[np.arange(len(labels_indices))[np.isin(labels_indices, y_s)] + self.n_lag]
        y_q = y_s + self.n_lag
        y_qs = y_s - window_size if self.keep_support_history else y_q - window_size
        query, query_labels = self.generate_data(df_stock, y_start=y_qs, y_end=y_q)
        tasks['inputs'].extend(inputs)
        tasks['labels'].extend(labels)
    tasks['inputs'] = np.array(tasks['inputs'])
    tasks['labels'] = np.array(tasks['labels'])
    all_tests[window_size] = tasks

In [47]:
import torch

In [53]:
a = meta_train.map_to_tensor(tasks)

In [56]:
a['inputs']

torch.Size([35752, 5, 11])

In [70]:
ds = torch.utils.data.TensorDataset(a['inputs'], a['labels'])

In [62]:
loader = torch.utils.data.DataLoader(
    ds, batch_size=8
)

In [63]:
for x, y in loader:
    break

In [65]:
x.shape

torch.Size([8, 5, 11])

In [66]:
y.shape

torch.Size([8])

In [50]:
tasks['inputs'].shape

(35752, 5, 11)

In [46]:
np.array(tasks['labels']).shape

(35752,)

In [18]:
y_ss

array([   3,    6,    7, ..., 1973, 1977, 1978])

In [21]:
inputs, labels = meta_train.generate_data(df_stock, y_test_start, y_test_end)

In [25]:
len(inputs)

1055

In [26]:
len(labels)

1055

In [3]:
data_dir = data_kwargs['data_dir']
dtype = data_kwargs['dtype']
n_train_stock = data_kwargs['n_train_stock']

KeyError: 'n_train_stock'

In [46]:
data_dir = Path(data_dir).resolve()
ds_info = {
    # train: (Jan-01-2007 to Jan-01-2015)
    # val: (Jan-01-2015 to Jan-01-2016)
    # test: (Jan-01-2016 to Jan-01-2017)
    'kdd17': {
        'path': data_dir / 'kdd17/price_long_50',
        'date': data_dir / 'kdd17/trading_dates.csv',
        'train_date': '2015-01-01', 
        'val_date': '2016-01-01', 
        'test_date': '2017-01-01',
    },
    # train: (Jan-01-2014 to Aug-01-2015)
    # vali: (Aug-01-2015 to Oct-01-2015)
    # test: (Oct-01-2015 to Jan-01-2016)
    'acl18': {
        'path': data_dir / 'stocknet-dataset/price/raw',
        'date': data_dir / 'stocknet-dataset/price/trading_dates.csv',
        'train_date': '2015-08-01', 
        'val_date': '2015-10-01', 
        'test_date': '2016-01-01',
    }
}
ds_config = ds_info[dtype]

meta_type = 'train'
window_sizes = [5] # [5, 10, 15, 20]

# get data
data = {}
candidates = {}
ps = list((data_dir / ds_config['path']).glob('*.csv'))
# iterator = ps[:n_train_stock] if (meta_type == 'train') or (meta_type == 'test1') else ps[n_train_stock:]


In [47]:
n_stocks = len(ps)
print(n_stocks)

50


In [48]:
seeds = [3, 7, 11, 69, 81]
all_selected = []
all_not_selected = []
for s in seeds:
    np.random.seed(s)
    selected = np.random.choice(ps, size=int(n_stocks*0.8), replace=False)
    not_selected = np.array(ps)[~np.isin(ps, selected)]
    all_selected.append([s.name.strip('.csv') for s in selected])
    all_not_selected.append([s.name.strip('.csv') for s in not_selected])

In [49]:
import json

d = {}

for i, s in enumerate(seeds):
    d[i] = {
        'seed': s, 'known': all_selected[i], 'unknown': all_not_selected[i]
    }
with Path('stock_universe.json').open('w', encoding='utf-8') as file:
    json.dump(d, file)