In [1]:
import pandas as pd
import numpy as np

from lib.data import get_all_csv_files, large_df, make_dataset, scale, inverse_scale

In [2]:
files = get_all_csv_files('./stock_market_data/')

In [3]:
df = large_df(files)

100%|██████████| 4280/4280 [00:36<00:00, 118.50it/s]


In [4]:
# Drop the 'Date' column and every row containing a NaN.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution 'Date' is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Date', errors='ignore').dropna(axis=0)

In [32]:
df = df[['Low', 'Open', 'High', 'Close', 'Volume']]

In [33]:
df.describe()

Unnamed: 0,Low,Open,High,Close,Volume
count,24351240.0,24351240.0,24351240.0,24351240.0,24351240.0
mean,79061490.0,81006080.0,82549500.0,80678510.0,2150915.0
std,13511290000.0,13828970000.0,14065610000.0,13770280000.0,19066900.0
min,0.0,0.0,0.0,0.0,0.0
25%,6.965,6.71875,7.19,7.07,18000.0
50%,16.2,16.24,16.63,16.41675,173000.0
75%,36.01,36.32,37.0,36.51,1052600.0
max,8324640000000.0,8324640000000.0,8324640000000.0,8324640000000.0,7421641000.0


In [34]:
# Keep only the rows whose 'Low' is below 5e3. The other price columns are not
# filtered: the data.max() output of the next cell still shows Open/High/Close
# values above 5e3 (up to 4.45e4 for 'High').
data = df[df['Low'] < 5e3]

In [35]:
data.max()

Low       4.999460e+03
Open      2.174989e+04
High      4.450000e+04
Close     2.624987e+04
Volume    7.421641e+09
dtype: float64

In [38]:
import torch

def find_ranges(data: torch.Tensor):
    """Return the (min, max) extent of the price columns and of the volume column.

    Every column except the last is treated as a price column and the extent is
    taken over all of them together; the last column is treated as volume.

    Args:
        data: 2-D tensor whose last column is the volume.

    Returns:
        ``(money_range, volume_range)``, each a ``(min, max)`` tuple of floats.
    """
    def _extent(block: torch.Tensor):
        # Global minimum / maximum over every element of the slice.
        return (float(block.min()), float(block.max()))

    prices, traded = data[:, :-1], data[:, -1:]
    return _extent(prices), _extent(traded)

money_range, volume_range = find_ranges(torch.tensor(data.to_numpy()))
print(money_range, volume_range)

(0.0, 44500.0) (0.0, 7421640800.0)


In [40]:
def scale(data: torch.Tensor, new_min=0, new_max=1):
    """Min-max scale price and volume columns into ``[new_min, new_max]``.

    The first four columns are scaled with the module-level ``money_range`` and
    the last column with the module-level ``volume_range``; both are read as
    ``(min, max)`` pairs and must be defined before this function is called.

    Args:
        data: 2-D tensor; columns 0-3 are prices, the last column is volume.
        new_min: lower bound of the target interval.
        new_max: upper bound of the target interval.

    Returns:
        Tensor with the four scaled price columns followed by the scaled
        volume column.
    """
    target_span = new_max - new_min
    money_lo, money_hi = money_range[0], money_range[1]
    volume_lo, volume_hi = volume_range[0], volume_range[1]

    scaled_money = (data[:, :4] - money_lo) / (money_hi - money_lo) * target_span + new_min
    scaled_volume = (data[:, -1:] - volume_lo) / (volume_hi - volume_lo) * target_span + new_min
    return torch.hstack((scaled_money, scaled_volume))

In [45]:
def make_dataset(df: torch.Tensor, time_d: int = 10):
    """Cut a 2-D series tensor into sliding input windows and their targets.

    For every start index ``i`` the input window is ``df[i:i + time_d]`` and the
    target is row ``i + time_d + 1`` without its last (volume) column. Pairs in
    which the sum of the window or of the target is NaN are skipped.

    Args:
        df: 2-D tensor of shape ``(rows, columns)``; the last column is volume.
        time_d: number of consecutive rows per input window.

    Returns:
        ``(Xd, yd)`` where ``Xd`` stacks all windows along dim 0, giving shape
        ``(n_windows * time_d, columns)``, and ``yd`` is the 1-D concatenation
        of all targets, giving length ``n_windows * (columns - 1)``.
        ``(None, None)`` when no valid window exists.
    """
    windows, targets = [], []

    for i in range(df.shape[0] - time_d - 1):
        idx = i + time_d
        # NOTE(review): the target is row idx + 1, so row idx itself is neither
        # in the window nor the target -- confirm this one-row gap is intended.
        row = df[idx + 1][:-1]  # drop volume, as it doesn't need to be predicted
        x = df[i:idx]
        if torch.isnan(torch.sum(x)) or torch.isnan(torch.sum(row)):
            continue  # skip pairs whose window (x) or target (y) contains NaNs

        windows.append(x)
        targets.append(row)

    if not windows:
        return None, None

    # Concatenate once at the end: concatenating inside the loop re-copies the
    # whole accumulated tensor on every iteration (quadratic in window count).
    return torch.concat(windows), torch.concat(targets)

In [54]:
from tqdm import tqdm

# Build the full training set: for every CSV file, keep the five price/volume
# columns, drop NaN rows and rows with 'Low' >= 5e3, scale, and window the
# result with make_dataset.
X, y = None, None
X_parts, y_parts = [], []  # per-file chunks, concatenated once after the loop

loop = tqdm(files, total=len(files))
for file in loop:
    tmp = pd.read_csv(file)
    tmp = tmp[['Low', 'Open', 'High', 'Close', 'Volume']].dropna()
    tmp = tmp[tmp['Low'] < 5e3]

    scaled = scale(torch.Tensor(tmp.to_numpy()))
    dataset = make_dataset(scaled)

    # make_dataset yields None when the file produced no usable window.
    if dataset[0] is None or dataset[1] is None:
        print(f"file {file} is useless")
        continue

    X_parts.append(dataset[0])
    y_parts.append(dataset[1])

# A single concatenation avoids re-copying the ever-growing X / y tensors on
# every file. X and y stay None when no file produced any data.
if X_parts:
    X = torch.concat(X_parts)
    y = torch.concat(y_parts)

# Release the per-file chunks so they do not double the memory footprint.
del X_parts, y_parts

 32%|███▏      | 1363/4280 [24:10<42:59,  1.13it/s]  

file ./stock_market_data/nasdaq\csv\CCUR.csv is useless


 36%|███▌      | 1551/4280 [27:57<1:08:53,  1.51s/it]

file ./stock_market_data/nasdaq\csv\DXM.csv is useless


 40%|███▉      | 1698/4280 [31:10<36:06,  1.19it/s]  

file ./stock_market_data/nasdaq\csv\FSFF.csv is useless


 63%|██████▎   | 2713/4280 [54:44<51:25,  1.97s/it]  

file ./stock_market_data/nyse\csv\ACG.csv is useless


 65%|██████▌   | 2796/4280 [56:57<48:14,  1.95s/it]  

file ./stock_market_data/nyse\csv\AVG.csv is useless


 73%|███████▎  | 3108/4280 [1:06:03<19:29,  1.00it/s]  

file ./stock_market_data/nyse\csv\FGL.csv is useless


 73%|███████▎  | 3127/4280 [1:06:32<19:06,  1.01it/s]

file ./stock_market_data/nyse\csv\FPT.csv is useless


100%|██████████| 4280/4280 [1:55:26<00:00,  1.62s/it]  


In [55]:
X.shape

torch.Size([242100200, 5])

In [56]:
import pickle

In [57]:
# Persist the window tensor; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('./data/Xlarge.bin', 'wb') as fh:
    pickle.dump(X, fh)

In [58]:
# Persist the target tensor; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('./data/ylarge.bin', 'wb') as fh:
    pickle.dump(y, fh)

In [16]:
import torch, pickle

In [17]:
# Load a previously pickled window tensor and close the file handle afterwards.
# pickle.load can execute arbitrary code: only load files you produced yourself.
# NOTE(review): this reads ./data/X.bin, whereas the dump cells in this notebook
# write ./data/Xlarge.bin and ./data/ylarge.bin -- confirm the intended file.
with open('./data/X.bin', 'rb') as fh:
    x = pickle.load(fh)

In [21]:
# Regroup the flat (n_windows * 10, 5) rows into (n_windows, 10, 5) windows.
# -1 lets torch infer the window count instead of hard-coding it for one dataset.
x.reshape((-1, 10, 5)).shape

torch.Size([6906777, 10, 5])

In [13]:
c = torch.Tensor(2, 7, 5)

In [15]:
# NOTE(review): `a` and `b` are not defined in any cell visible in this
# notebook, so this cell depends on leftover kernel state and will fail on
# Restart & Run All -- define them here or remove this scratch cell.
torch.cat((a, b))

tensor([[[0.5939, 0.6351, 0.9802, 0.4510, 0.4155],
         [0.1423, 0.9028, 0.1143, 0.7449, 0.7464],
         [0.3748, 0.6898, 0.1778, 0.4913, 0.3977],
         [0.8381, 0.5423, 0.1430, 0.4897, 0.9605],
         [0.5049, 0.6050, 0.2317, 0.6488, 0.9490],
         [0.9069, 0.8218, 0.7441, 0.8825, 0.8529],
         [0.0739, 0.9540, 0.8932, 0.2987, 0.5890]],

        [[0.1823, 0.8595, 0.1832, 0.9272, 0.9944],
         [0.7184, 0.2080, 0.2378, 0.2978, 0.4247],
         [0.2342, 0.6028, 0.7682, 0.6944, 0.3655],
         [0.6381, 0.3703, 0.3003, 0.3945, 0.7956],
         [0.4893, 0.5456, 0.0970, 0.7302, 0.4601],
         [0.1616, 0.1483, 0.7895, 0.0596, 0.5900],
         [0.3478, 0.8016, 0.8224, 0.1190, 0.4755]]])

In [11]:
c.shape

torch.Size([14, 5])