In [2]:
# importing libraries

import tensorflow as tf
import numpy as np
import pandas as pd
import glob

In [2]:
# defining params

# W must be less than or equal to CorrW
W = 20
CorrW = 50
F = 5 

M = 0
N = 0

# 0-DateTime, 1-Close, 2-Open, 3-High, 4-Low, 5-Vol
SD = 0

# 0-DateTime, 1-Close, 2-Open
MD = 1

In [4]:
# files and dirs

raw_stocks = './prepared_data/stocks.npy'
raw_markets = './prepared_data/markets.npy'
raw_data = './prepared_data/stocks_correlations_markets_{}.npy'
dataset_file = './prepared_data/dataset_{}.npy'


In [4]:
stocks = np.load(raw_stocks, allow_pickle=True)
markets = np.load(raw_markets, allow_pickle=True)

print(stocks.shape)
print(markets.shape)

M = markets.shape[0]
N = stocks.shape[0]

(572,)
(51,)


In [5]:
"""
This section holds the definition of some utility functions which we use later.
"""

def build_corr_list(stock, market, CorrW):
    market_corr = []
    send = stock.shape[0] 
    mend = market.shape[0]
    while (mend >= CorrW) and (send >= CorrW):
        s_window = stock[send - CorrW : send]
        m_window = market[mend - CorrW : mend]
        corr_matrix = np.corrcoef(s_window, m_window)
        corr = corr_matrix[0, 1]
        market_corr.append(corr)
        send -= 1
        mend -= 1
    return market_corr[::-1]

def trim_2d_list(lst):
    min_length = min([len(x) for x in lst])
    for i in range(len(lst)):
        lst[i] = lst[i][-min_length:]

def save_to_file(data, labels, filename):
    writer = tf.data.experimental.TFRecordWriter(filename)
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    dataset = dataset.map(tf.io.serialize_tensor)
    print('done')
    print(dataset)
    writer.write(dataset)
    return [], []

In [37]:
all_data = []

for i in range(257, N):
    # Build a 2D list contatining all the correlations between this stock and all the markets.
    markets_corr = [] 
    for j in range(M):
        corr_list = build_corr_list(stocks[i][:, SD], markets[j][:, MD], CorrW)
        markets_corr.append(corr_list)
    trim_2d_list(markets_corr)

    # Trimming this stock and all the markets
    X = len(markets_corr[0])
    stock_close = stocks[i][-X:, 1]
    stock_open = stocks[i][-X:, 2]
    stock_high = stocks[i][-X:, 3]
    stock_low = stocks[i][-X:, 4]
    stock_vol = stocks[i][-X:, 5]
    trim_markets = []
    for j in range(M):
        trim_markets.append(markets[j][-X:, MD])

    # Concatenate all the previous parts
    data = [stock_close, stock_open, stock_high, stock_low, stock_vol, *markets_corr, *trim_markets]

    print('{} of {} done'.format(i + 1, N))
    np.save(raw_data.format(i + 1), data)

258 of 572 done
259 of 572 done
260 of 572 done
261 of 572 done
262 of 572 done
263 of 572 done
264 of 572 done
265 of 572 done
266 of 572 done
267 of 572 done
268 of 572 done
269 of 572 done
270 of 572 done
271 of 572 done
272 of 572 done
273 of 572 done
274 of 572 done
275 of 572 done
276 of 572 done
277 of 572 done
278 of 572 done
279 of 572 done
280 of 572 done
281 of 572 done
282 of 572 done
283 of 572 done
284 of 572 done
285 of 572 done
286 of 572 done
287 of 572 done
288 of 572 done
289 of 572 done
290 of 572 done
291 of 572 done
292 of 572 done
293 of 572 done
294 of 572 done
295 of 572 done
296 of 572 done
297 of 572 done
298 of 572 done
299 of 572 done
300 of 572 done
301 of 572 done
302 of 572 done
303 of 572 done
304 of 572 done
305 of 572 done
306 of 572 done
307 of 572 done
308 of 572 done
309 of 572 done
310 of 572 done
311 of 572 done
312 of 572 done
313 of 572 done
314 of 572 done
315 of 572 done
316 of 572 done
317 of 572 done
318 of 572 done
319 of 572 done
320 of 5

In [7]:
data, labels = [], []

for i in range(1, 573):
    index = 0
    raw = np.load(raw_data.format(i))
    if len(raw) < W:
        continue
    while index + W + F < raw.shape[1]:
        x = np.array(raw[:, index:index+W])
        y = np.array(raw[0, index+W:index+W+F])
        data.append(x)
        labels.append(y)
        index += 1

    print('{} done.'.format(i))
    if (i % 50 == 0) or (i == 572):
        data = np.array(data)
        labels = np.array(labels)
        print(data.shape)
        print(labels.shape)
        np.save(dataset_file.format('x' + str(i)), data)
        np.save(dataset_file.format('y' + str(i)), labels)
        data, labels = [], []

1 done.
2 done.
3 done.
4 done.
5 done.
6 done.
7 done.
8 done.
9 done.
10 done.
11 done.
12 done.
13 done.
14 done.
15 done.
16 done.
17 done.
18 done.
19 done.
20 done.
21 done.
22 done.
23 done.
24 done.
25 done.
26 done.
27 done.
28 done.
29 done.
30 done.
31 done.
32 done.
33 done.
34 done.
35 done.
36 done.
37 done.
38 done.
39 done.
40 done.
41 done.
42 done.
43 done.
44 done.
45 done.
46 done.
47 done.
48 done.
49 done.
50 done.
(53509, 107, 20)
(53509, 5)
51 done.
52 done.
53 done.
54 done.
55 done.
56 done.
57 done.
58 done.
59 done.
60 done.
61 done.
62 done.
63 done.
64 done.
65 done.
66 done.
67 done.
68 done.
69 done.
70 done.
71 done.
72 done.
73 done.
74 done.
75 done.
76 done.
77 done.
78 done.
79 done.
80 done.
81 done.
82 done.
83 done.
84 done.
85 done.
86 done.
87 done.
88 done.
89 done.
90 done.
91 done.
92 done.
93 done.
94 done.
95 done.
96 done.
97 done.
98 done.
99 done.
100 done.
(54496, 107, 20)
(54496, 5)
101 done.
102 done.
103 done.
104 done.
105 done.
10