In [40]:
import os
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
from tqdm import tqdm

In [45]:
FORBES = './stock_market_data/forbes2000/csv/'
TICKER = 'ABTZY'
PATH = os.path.join(FORBES, f"{TICKER}.csv")

In [46]:
df = pd.read_csv(PATH)
df['Date'] = pd.to_datetime(df['Date'])

In [47]:
df = df.drop(['Date'], axis=1)

In [48]:
df.describe()

Unnamed: 0,Low,Open,Volume,High,Close,Adjusted Close
count,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0
mean,12.077596,12.077886,30.567777,12.080235,12.079991,11.113821
std,2.47394,2.473336,364.174032,2.473555,2.474034,1.995029
min,5.79,5.9,0.0,5.9,5.9,5.761958
25%,10.27,10.27,0.0,10.27,10.27,10.029713
50%,12.17,12.17,0.0,12.17,12.17,10.77667
75%,14.22,14.22,0.0,14.22,14.22,12.86145
max,17.82,17.82,13100.0,17.82,17.82,16.033495


In [49]:
df = df[['Low', 'Open', 'High', 'Close', 'Adjusted Close', 'Volume']]

In [75]:
def make_dataset(df: np.ndarray, time_d: int = 10):
    X, y = [], []

    for i in range(df.shape[0] - time_d - 1):
        idx = i + time_d
        row = df[idx + 1]
        row = row[:-1] # remove volume is it doesn't need to be predicted
        if np.isnan(np.sum(row)):
            continue

        x = df[i:idx]
        if np.isnan(np.sum(x)):
            continue # if the row (y) has nans, skip

        print(i, x, row)
        X.append(x)
        y.append(row)

        break
        
    return np.array(X), np.array(y)

In [76]:
X, y = make_dataset(df.to_numpy())

141 [[ 11.5         11.5         11.5         11.5          9.69760513
  100.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]
 [ 11.5         11.5         11.5         11.5          9.69760513
    0.        ]] [11.5        11.5        11.5        11.5         9.69760513]


In [66]:
np.isnan(np.sum(X[0]))

True

In [55]:
X[0, 0] = 0.3343
np.isnan(np.sum(X))

True

In [57]:
X[0, :-1]

array([[2.86123028e+01, 3.25464935e+01, 6.25463800e+07, 3.57653809e+01,
        3.14735336e+01, 2.69763737e+01],
       [2.84781837e+01, 3.07135181e+01, 1.52341460e+07, 3.07582264e+01,
        2.88805447e+01, 2.47538853e+01],
       [2.86570091e+01, 2.95511436e+01, 6.57787000e+06, 3.14735336e+01,
        3.14735336e+01, 2.69763737e+01],
       [2.86123028e+01, 3.04005718e+01, 5.97561100e+06, 3.12052937e+01,
        2.86123028e+01, 2.45239677e+01],
       [2.86123028e+01, 2.87017174e+01, 4.84323100e+06, 2.99982128e+01,
        2.93723183e+01, 2.51753941e+01],
       [2.91487846e+01, 2.92381973e+01, 1.72946600e+06, 2.96852646e+01,
        2.94617310e+01, 2.52520294e+01],
       [2.90146637e+01, 2.93276119e+01, 4.07475100e+06, 3.03558655e+01,
        3.01323318e+01, 2.58268089e+01],
       [2.92829037e+01, 3.00429173e+01, 4.31003400e+06, 3.07135181e+01,
        3.01770382e+01, 2.58651333e+01],
       [2.99535046e+01, 3.01770382e+01, 2.95732900e+06, 3.10711727e+01,
        3.07135181e+01, 

In [12]:
def get_all_csv_files(dir: str):
    files = []
    a = os.listdir(dir)
    for b in a:
        if os.path.isdir(os.path.join(dir, b)):
            c = os.path.join(dir, b, 'csv')
            if os.path.isdir(c):
                files += [os.path.join(c, f) for f in os.listdir(c) if f[-4:] == '.csv']

    return files

In [13]:
def large_df(files):
    return pd.concat((pd.read_csv(f) for i, f in tqdm(enumerate(files), total=len(files))), ignore_index=True)

In [14]:
def scale(X, y):
    div = 255 / np.max(X)
    X *= div
    y *= div
    return X, y

In [16]:
files = get_all_csv_files('stock_market_data')

In [18]:
for file in files:
    if pd.read_csv(file).isnull().values.any():
        print(file)

stock_market_data\forbes2000\csv\ABTZY.csv
stock_market_data\forbes2000\csv\ADERY.csv
stock_market_data\forbes2000\csv\AET.csv
stock_market_data\forbes2000\csv\AFSI.csv
stock_market_data\forbes2000\csv\AUNB.csv
stock_market_data\forbes2000\csv\BHI.csv
stock_market_data\forbes2000\csv\BKRL.csv
stock_market_data\forbes2000\csv\BKUT.csv
stock_market_data\forbes2000\csv\CAA.csv
stock_market_data\forbes2000\csv\CBI.csv
stock_market_data\forbes2000\csv\CBUMF.csv
stock_market_data\forbes2000\csv\CCE.csv
stock_market_data\forbes2000\csv\CCGY.csv
stock_market_data\forbes2000\csv\CGYV.csv
stock_market_data\forbes2000\csv\CHNGQ.csv
stock_market_data\forbes2000\csv\COL.csv
stock_market_data\forbes2000\csv\CPFH.csv
stock_market_data\forbes2000\csv\CPICQ.csv
stock_market_data\forbes2000\csv\CTBK.csv
stock_market_data\forbes2000\csv\CTC.csv
stock_market_data\forbes2000\csv\ELROF.csv
stock_market_data\forbes2000\csv\ESRX.csv
stock_market_data\forbes2000\csv\GBNK.csv
stock_market_data\forbes2000\csv\GK

KeyboardInterrupt: 

In [8]:
df = large_df()

 15%|█▌        | 653/4280 [00:08<00:50, 71.37it/s] 

KeyboardInterrupt: 

In [8]:
df.shape

(24603272, 7)

In [9]:
from sklearn.preprocessing import MinMaxScaler
import pickle

scl = MinMaxScaler()
scl.fit(df.drop(['Date'], axis=1))

In [10]:
pickle.dump(scl, open('minmax_scaler.bin', 'wb'))

In [52]:
x = scl.transform(df.drop(['Date'], axis=1))

In [58]:
ddf = pd.DataFrame(x, columns=['Low', 'Open', 'Volume', 'High', 'Close', 'Adjusted Close'])

In [59]:
ddf.describe()

Unnamed: 0,Low,Open,Volume,High,Close,Adjusted Close
count,5693.0,5693.0,5693.0,5693.0,5693.0,5693.0
mean,5.124624e-12,5.191498e-12,0.000471,5.258447e-12,5.192594e-12,0.998474
std,3.987756e-12,4.031567e-12,0.000329,4.071758e-12,4.029249e-12,0.0
min,9.022288e-13,9.19414e-13,3e-05,9.563624e-13,9.32303e-13,0.998474
25%,2.603574e-12,2.637945e-12,0.000269,2.676612e-12,2.643101e-12,0.998474
50%,3.569389e-12,3.623522e-12,0.000402,3.67336e-12,3.624382e-12,0.998474
75%,6.277749e-12,6.341415e-12,0.000582,6.417094e-12,6.331805e-12,0.998474
max,2.12826e-11,2.153607e-11,0.008428,2.15709e-11,2.153607e-11,0.998474


In [61]:
ddf[['Low', 'Open', 'High', 'Close', 'Adjusted Close', 'Volume']].describe()

Unnamed: 0,Low,Open,High,Close,Adjusted Close,Volume
count,5693.0,5693.0,5693.0,5693.0,5693.0,5693.0
mean,5.124624e-12,5.191498e-12,5.258447e-12,5.192594e-12,0.998474,0.000471
std,3.987756e-12,4.031567e-12,4.071758e-12,4.029249e-12,0.0,0.000329
min,9.022288e-13,9.19414e-13,9.563624e-13,9.32303e-13,0.998474,3e-05
25%,2.603574e-12,2.637945e-12,2.676612e-12,2.643101e-12,0.998474,0.000269
50%,3.569389e-12,3.623522e-12,3.67336e-12,3.624382e-12,0.998474,0.000402
75%,6.277749e-12,6.341415e-12,6.417094e-12,6.331805e-12,0.998474,0.000582
max,2.12826e-11,2.153607e-11,2.15709e-11,2.153607e-11,0.998474,0.008428


In [18]:
df.iloc[:10].values[:, :5]

array([['18-11-1999', 28.612302780151367, 32.54649353027344, 62546380.0,
        35.765380859375],
       ['19-11-1999', 28.47818374633789, 30.713518142700195, 15234146.0,
        30.75822639465332],
       ['22-11-1999', 28.65700912475586, 29.551143646240234, 6577870.0,
        31.473533630371094],
       ['23-11-1999', 28.612302780151367, 30.40057182312012, 5975611.0,
        31.205293655395508],
       ['24-11-1999', 28.612302780151367, 28.701717376708984, 4843231.0,
        29.99821281433105],
       ['26-11-1999', 29.14878463745117, 29.23819732666016, 1729466.0,
        29.685264587402344],
       ['29-11-1999', 29.014663696289062, 29.327611923217773, 4074751.0,
        30.355865478515625],
       ['30-11-1999', 29.28290367126465, 30.042917251586918, 4310034.0,
        30.713518142700195],
       ['01-12-1999', 29.95350456237793, 30.177038192749023, 2957329.0,
        31.0711727142334],
       ['02-12-1999', 30.8923454284668, 31.29470634460449, 3069868.0,
        32.1888427734375]