In [None]:
# importing libraries

import tensorflow as tf
import numpy as np
import pandas as pd
import glob

In [None]:
# defining params

# W is the number of time steps in each input window and F the number of
# following time steps used as the label (see the dataset-building cell).
# CorrW is the rolling-window length passed to build_corr_list.
# W must be less than or equal to CorrW
W = 20
CorrW = 50
F = 5 

# Placeholders: M (number of markets) and N (number of stocks) are overwritten
# from the array shapes once the data is loaded.
M = 0
N = 0

# Column of each stock array that is correlated against the markets.
# 0-DateTime, 1-Close, 2-Open, 3-High, 4-Low, 5-Vol
# NOTE(review): per the legend above, 0 selects the DateTime column, so the
# correlations are computed on DateTime rather than a price -- confirm whether
# 1 (Close) was intended.
SD = 0

# Column of each market array used for the correlations and as a feature.
# 0-DateTime, 1-Close, 2-Open
MD = 1

In [None]:
# files and dirs

# Inputs: arrays loaded with np.load in the next cell.
raw_stocks = './prepared_data/stocks.npy'
raw_markets = './prepared_data/markets.npy'
# Template for the per-stock feature matrix; '{}' is filled with the 1-based
# stock number (i + 1) when the file is written.
raw_data = './prepared_data/stocks_correlations_markets_{}.npy'
# Template for the dataset chunks; '{}' is filled with 'x' or 'y' followed by
# a number.
dataset_file = './prepared_data/dataset_{}.npy'

In [None]:
# Load the prepared stock and market arrays from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on files from a trusted source.
stocks = np.load(raw_stocks, allow_pickle=True)
markets = np.load(raw_markets, allow_pickle=True)

print(stocks.shape)
print(markets.shape)

# Replace the placeholder counts with the length of each array's first axis.
M = markets.shape[0]
N = stocks.shape[0]

In [None]:
"""
This section holds the definition of some utility functions which we use later.
"""

def build_corr_list(stock, market, CorrW):
    """Rolling Pearson correlation of two 1-D series aligned at their ends.

    The last ``CorrW`` values of ``stock`` are paired with the last ``CorrW``
    values of ``market``; the window then slides one step towards the start
    of both series until either of them runs out of data.

    Args:
        stock: 1-D array of stock values.
        market: 1-D array of market values.
        CorrW: Number of values in each correlation window.

    Returns:
        A list of correlation coefficients ordered from the oldest window to
        the newest one. Empty when either series is shorter than ``CorrW``.
    """
    stock_len = stock.shape[0]
    market_len = market.shape[0]

    # How many full windows fit into the shorter of the two series.
    n_windows = min(stock_len, market_len) - CorrW + 1

    correlations = []
    # `back` is the distance of the window's end from the end of the series;
    # counting it down to zero visits the windows oldest-first.
    for back in range(n_windows - 1, -1, -1):
        s_stop = stock_len - back
        m_stop = market_len - back
        pair = np.corrcoef(
            stock[s_stop - CorrW:s_stop],
            market[m_stop - CorrW:m_stop],
        )
        correlations.append(pair[0, 1])
    return correlations

def trim_2d_list(lst):
    """Trim every inner sequence of ``lst`` in place to the shortest length.

    Each inner sequence keeps only its last ``min_length`` items, where
    ``min_length`` is the length of the shortest inner sequence. An empty
    ``lst`` is left untouched.

    Args:
        lst: List of sliceable sequences; modified in place.

    Returns:
        None.
    """
    min_length = min((len(row) for row in lst), default=0)
    for i, row in enumerate(lst):
        # Slice from an explicit start index: `row[-min_length:]` would return
        # the whole row when min_length is 0, because -0 == 0.
        lst[i] = row[len(row) - min_length:]

In [None]:
all_data = []

# NOTE(review): the loop starts at stock index 257 rather than 0, so a fresh
# run only writes files 258..N. Presumably this is a leftover resume offset --
# confirm before relying on a full re-run.
for i in range(257, N):
    stock = stocks[i]

    # Build a 2D list containing the rolling correlations between this stock
    # and every market, then trim all rows to a common length.
    markets_corr = [
        build_corr_list(stock[:, SD], markets[j][:, MD], CorrW)
        for j in range(M)
    ]
    trim_2d_list(markets_corr)

    # Trim this stock and all the markets to the last X time steps so they
    # line up with the correlation rows. The start index is computed
    # explicitly because `arr[-X:]` returns the whole array when X == 0.
    X = len(markets_corr[0])
    stock_tail = stock[max(stock.shape[0] - X, 0):]
    stock_close = stock_tail[:, 1]
    stock_open = stock_tail[:, 2]
    stock_high = stock_tail[:, 3]
    stock_low = stock_tail[:, 4]
    stock_vol = stock_tail[:, 5]
    trim_markets = [
        markets[j][max(markets[j].shape[0] - X, 0):, MD]
        for j in range(M)
    ]

    # Concatenate all the previous parts: 5 stock rows, M correlation rows and
    # M market rows, each X time steps long.
    data = [stock_close, stock_open, stock_high, stock_low, stock_vol, *markets_corr, *trim_markets]

    print('{} of {} done'.format(i + 1, N))
    # Files are numbered from 1, hence i + 1.
    np.save(raw_data.format(i + 1), data)

In [None]:
# Slice every per-stock matrix into (W-step input, F-step label) samples and
# write the samples to disk in numbered chunks.
STOCKS_PER_CHUNK = 50
LAST_STOCK = 572  # NOTE(review): presumably equal to N -- confirm

data, labels = [], []
for i in range(1, LAST_STOCK + 1):
    raw = np.load(raw_data.format(i))
    num_steps = raw.shape[1]

    # A window starting at `index` needs W input steps plus F label steps, so
    # the last valid start is num_steps - W - F (inclusive). Series that are
    # too short simply produce no samples; they must not skip the flush below.
    for index in range(num_steps - W - F + 1):
        data.append(np.array(raw[:, index:index + W]))
        # Labels are the next F values of row 0 (the stock's close prices).
        labels.append(np.array(raw[0, index + W:index + W + F]))
    print('{} done.'.format(i))

    if (i % STOCKS_PER_CHUNK == 0) or (i == LAST_STOCK):
        # Number the chunks 1, 2, 3, ... so the names match what get_data()
        # loads (dataset_x1 .. dataset_x12 for 572 stocks in chunks of 50).
        chunk = (i + STOCKS_PER_CHUNK - 1) // STOCKS_PER_CHUNK
        np.save(dataset_file.format('x' + str(chunk)), np.array(data))
        np.save(dataset_file.format('y' + str(chunk)), np.array(labels))
        data, labels = [], []

In [None]:
# Shape of one input sample. cols equals W (the window length).
# NOTE(review): rows is presumably 5 stock rows + 2 * M market rows -- confirm
# against the loaded data.
rows, cols = 107, 20
# Number of shuffled samples held out as the test set.
test_num = 12000

def get_data():
    """Yield (input, label) sample pairs from dataset chunks 1..12.

    Each chunk is loaded from ``dataset_file`` as an 'x' array and a 'y'
    array, and their rows are yielded pairwise in order.
    """
    for i in range(1, 13):
        x_batch = np.load(dataset_file.format('x' + str(i)))
        y_batch = np.load(dataset_file.format('y' + str(i)))
        for j in range(x_batch.shape[0]):
            yield (x_batch[j], y_batch[j])

dataset = tf.data.Dataset.from_generator(get_data, output_types=(tf.float32, tf.float32), output_shapes=((rows, cols), (F, )))
# Keep the shuffle order fixed across iterations. With the default
# reshuffle_each_iteration=True, take() and skip() would each see a different
# order, so test samples would leak into the training set.
dataset = dataset.shuffle(1024, reshuffle_each_iteration=False)
test_set = dataset.take(test_num)
train_set = dataset.skip(test_num)

# The training split is reshuffled on every epoch and batched.
train_set = train_set.shuffle(1024).batch(256)