In [36]:
import os
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
from tqdm import tqdm

In [2]:
# Directory holding the per-ticker CSV files (the `sp500/csv` folder of the dataset).
FORBES = './stock_market_data/sp500/csv/'
# Ticker symbol used to build the CSV file name.
TICKER = 'A'
# Full path to the CSV for TICKER inside FORBES.
PATH = os.path.join(FORBES, f"{TICKER}.csv")

In [50]:
# Load the CSV at PATH and parse its 'Date' column into datetimes.
# NOTE(review): `df` is rebound further down to the frame returned by
# large_df(...), so cells that read `df` depend on which cell ran last.
df = pd.read_csv(PATH)
df['Date'] = pd.to_datetime(df['Date'])

In [6]:
# Remove the 'Date' column before building the windowed dataset.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because 'Date' has already been dropped.
df = df.drop(columns=['Date'], errors='ignore')

In [7]:
# Summary statistics (count, mean, std, quartiles) for the columns of `df`.
df.describe()

Unnamed: 0,Low,Open,Volume,High,Close,Adjusted Close
count,5693.0,5693.0,5693.0,5693.0,5693.0,5693.0
mean,42.660654,43.217349,3495960.0,43.774675,43.226476,40.779416
std,33.196632,33.561343,2439198.0,33.895923,33.542048,33.884099
min,7.51073,7.653791,222150.0,7.961373,7.761087,6.652129
25%,21.67382,21.959944,1992800.0,22.281832,22.002861,19.392338
50%,29.713877,30.16452,2983100.0,30.579399,30.171675,27.384373
75%,52.259998,52.790001,4319400.0,53.419998,52.709999,48.43486
max,177.169998,179.279999,62546380.0,179.570007,179.279999,178.225525


In [29]:
def make_dataset(df: pd.DataFrame, time_d: int = 10):
    """Build sliding-window samples from a price frame.

    For every start offset ``i`` the features are the ``time_d`` consecutive
    rows ``i .. i + time_d - 1`` and the target is the row at position
    ``i + time_d + 1`` with ``Volume`` removed. Samples whose target row
    contains a NaN are skipped. NaNs inside a feature window are replaced by
    that window's per-column mean.

    Args:
        df: Frame containing at least the columns ``Low``, ``Open``, ``High``,
            ``Close``, ``Adjusted Close`` and ``Volume``. It is never modified.
        time_d: Number of rows in each feature window.

    Returns:
        ``(X, y)`` as NumPy arrays. ``X`` stacks the windows with columns
        ordered ``Low, Open, High, Close, Adjusted Close, Volume``; ``y``
        stacks the target rows (every column of ``df`` except ``Volume``, in
        ``df``'s own column order).
    """
    feature_cols = ['Low', 'Open', 'High', 'Close', 'Adjusted Close', 'Volume']
    X, y = [], []

    for i in range(df.shape[0] - time_d - 1):
        idx = i + time_d
        # NOTE(review): the target is row idx + 1, so row idx is neither part
        # of the window nor the target. Confirm this one-row gap is intended.
        row = df.iloc[idx + 1].drop(['Volume'])  # volume is not predicted
        if row.hasnans:
            continue  # unusable target, skip the sample

        # Selecting the feature columns yields a new frame, and fillna without
        # inplace returns another one, so the caller's `df` is left untouched
        # and no SettingWithCopyWarning is raised.
        window = df.iloc[i:idx][feature_cols]
        window = window.fillna(window.mean())

        X.append(window.values)
        y.append(row.values)

    return np.array(X), np.array(y)

In [30]:
# Build windowed features and targets from `df` with the default window length (time_d=10).
X, y = make_dataset(df)

In [31]:
# Shape of the feature array: (samples, rows per window, feature columns).
X.shape

(5682, 10, 6)

In [32]:
def get_all_csv_files(dir: str):
    """Collect the CSV files stored in every ``<dir>/<subdir>/csv`` folder.

    Only immediate sub-directories of ``dir`` that contain a ``csv`` directory
    are considered; files are matched by the ``.csv`` suffix (case-sensitive).

    Args:
        dir: Root directory to scan.

    Returns:
        List of file paths, ordered by sub-directory name and then by file
        name. ``os.listdir`` returns entries in arbitrary order, so both
        listings are sorted to make the result reproducible across runs.
    """
    files = []
    for entry in sorted(os.listdir(dir)):
        csv_dir = os.path.join(dir, entry, 'csv')
        # A missing `csv` folder (or a plain file at `entry`) is simply skipped.
        if not os.path.isdir(csv_dir):
            continue
        files.extend(
            os.path.join(csv_dir, name)
            for name in sorted(os.listdir(csv_dir))
            if name.endswith('.csv')
        )

    return files

In [40]:
def large_df(files):
    """Read every CSV in ``files`` and stack the results into one DataFrame.

    A tqdm progress bar tracks the reads. The returned frame gets a fresh
    0..n-1 index (``ignore_index=True``).

    Args:
        files: Sized collection of CSV file paths.

    Returns:
        The row-wise concatenation of all the loaded frames.
    """
    progress = tqdm(files, total=len(files))
    frames = [pd.read_csv(path) for path in progress]
    return pd.concat(frames, ignore_index=True)

In [41]:
def scale(X, y):
    """Rescale ``X`` and ``y`` by one shared factor so that ``max(X)`` becomes 255.

    Both arguments are multiplied in place by ``255 / np.max(X)`` and then
    returned, so the caller's objects are modified as well.

    Args:
        X: Feature values; its global maximum defines the factor.
            Assumes a float NumPy array - TODO confirm.
        y: Target values, scaled by the same factor as ``X``.
            Assumes a float NumPy array - TODO confirm.

    Returns:
        The tuple ``(X, y)`` after scaling.
    """
    peak = np.max(X)
    factor = 255 / peak
    X *= factor
    y *= factor
    return X, y

In [42]:
# Read every CSV found under 'stock_market_data/<subdir>/csv' into one frame.
# NOTE(review): this rebinds `df`, replacing the single-ticker frame loaded from PATH.
df = large_df(get_all_csv_files('stock_market_data'))

100%|██████████| 4281/4281 [00:34<00:00, 124.14it/s]


In [43]:
# (rows, columns) of the concatenated frame.
df.shape

(24603272, 7)

In [49]:
from sklearn.preprocessing import MinMaxScaler
import pickle  # NOTE(review): not used anywhere in the visible part of the notebook

# Fit a single min-max scaler on every column of `df` except 'Date'.
# NOTE(review): the fit uses whatever `df` is bound to when this cell runs;
# `df` is assigned in more than one cell, so the result depends on run order.
scl = MinMaxScaler()
scl.fit(df.drop(['Date'], axis=1))

MinMaxScaler()

In [52]:
# Apply the fitted scaler to every column of `df` except 'Date'.
# NOTE(review): the saved describe() output of the scaled data reports 5,693
# rows, while the concatenated frame has 24,603,272, so `df` was presumably
# the single-ticker frame when this ran. Confirm and make that explicit.
x = scl.transform(df.drop(['Date'], axis=1))

In [58]:
# Wrap the scaled array in a DataFrame with named columns.
# NOTE(review): the names are hard-coded and must match the column order of
# the frame that was passed to scl.transform. Verify against the CSV schema.
ddf = pd.DataFrame(x, columns=['Low', 'Open', 'Volume', 'High', 'Close', 'Adjusted Close'])

In [59]:
# Summary statistics of the scaled values.
# NOTE(review): in the saved output the price columns are around 1e-12 and
# 'Adjusted Close' is constant (std 0.0). The scaler's fitted range looks
# dominated by extreme values - verify the data used for the fit.
ddf.describe()

Unnamed: 0,Low,Open,Volume,High,Close,Adjusted Close
count,5693.0,5693.0,5693.0,5693.0,5693.0,5693.0
mean,5.124624e-12,5.191498e-12,0.000471,5.258447e-12,5.192594e-12,0.998474
std,3.987756e-12,4.031567e-12,0.000329,4.071758e-12,4.029249e-12,0.0
min,9.022288e-13,9.19414e-13,3e-05,9.563624e-13,9.32303e-13,0.998474
25%,2.603574e-12,2.637945e-12,0.000269,2.676612e-12,2.643101e-12,0.998474
50%,3.569389e-12,3.623522e-12,0.000402,3.67336e-12,3.624382e-12,0.998474
75%,6.277749e-12,6.341415e-12,0.000582,6.417094e-12,6.331805e-12,0.998474
max,2.12826e-11,2.153607e-11,0.008428,2.15709e-11,2.153607e-11,0.998474


In [61]:
# Same summary with the columns in the feature order used by make_dataset
# (Volume last).
ddf[['Low', 'Open', 'High', 'Close', 'Adjusted Close', 'Volume']].describe()

Unnamed: 0,Low,Open,High,Close,Adjusted Close,Volume
count,5693.0,5693.0,5693.0,5693.0,5693.0,5693.0
mean,5.124624e-12,5.191498e-12,5.258447e-12,5.192594e-12,0.998474,0.000471
std,3.987756e-12,4.031567e-12,4.071758e-12,4.029249e-12,0.0,0.000329
min,9.022288e-13,9.19414e-13,9.563624e-13,9.32303e-13,0.998474,3e-05
25%,2.603574e-12,2.637945e-12,2.676612e-12,2.643101e-12,0.998474,0.000269
50%,3.569389e-12,3.623522e-12,3.67336e-12,3.624382e-12,0.998474,0.000402
75%,6.277749e-12,6.341415e-12,6.417094e-12,6.331805e-12,0.998474,0.000582
max,2.12826e-11,2.153607e-11,2.15709e-11,2.153607e-11,0.998474,0.008428
