In [1]:
import pandas as pd
import data_util
from tqdm.notebook import tqdm
from data_generator import DataGenerator
from tec_an import TecAn
import numpy as np
from data_util import *
import threading
import multiprocessing
from data_agent import DataAgent, TacProcess
from sklearn.model_selection import train_test_split

In [2]:
def save(data_set, prefix = "", base_dir = "data/"):
    trainX = data_set[0]
    trainY = data_set[1]
    final_path = path + base_dir
    train_path = "{}{}trainX.npy".format(final_path, prefix)
    np.save(train_path, trainX)
    np.save("{}{}trainY.npy".format(final_path, prefix), trainY)
    if (len(data_set) > 2):
        valX = data_set[2]
        valY = data_set[3]
        np.save("{}{}valX.npy".format(final_path, prefix), valX)
        np.save("{}{}valY.npy".format(final_path, prefix), valY)
    print("Saving {} with {}".format(train_path, trainX.shape))

def get_y_data(ohlc, shift = -1):
    combined_data = ohlc.copy()
    #combined_data['return'] = np.log(combined_data / combined_data.shift(1))
    returns = (ohlc / ohlc.shift(shift))
    combined_data['return'] = returns
    combined_data['direction'] = np.where(combined_data['return'] < 1, 1, 0)
    #print(combined_data)
    #combined_data.dropna(inplace=True)
    #print(combined_data[20:40])
    #
    return combined_data['direction'].to_numpy()

def split(x, y, split, shuffle=False):
    trainX, valX, trainY, valY = train_test_split(np.array(x), np.array(y), test_size=split, shuffle=shuffle)
    print("Completed: {} {} {} {}".format(trainX.shape, trainY.shape, valX.shape, valY.shape))
    return trainX, trainY, valX, valY


def get_full_database(resample, raw_dir, base_dir = "data/"):

    full_data = base_dir + raw_dir + "/"
    data_gen = DataGenerator(random = False, base_dir = full_data)
    data_gen.rewind()
    data_count = (data_gen.steps - 100)
    #data_count = 200000

    final_x = []

    closed_prices = []

    on_new_data = lambda x: final_x.append(x)
    on_closed_price = lambda price: closed_prices.append(price)
    
    agent = DataAgent(
        resample = resample,
        on_new_data = on_new_data,
        on_closed_price = on_closed_price
    )

    print("Processing {}".format(raw_dir))

    for i in tqdm(range(data_count)):
        agent.on_new_raw_data(data_gen.next())


    closes = pd.DataFrame(closed_prices, columns = ['Close'])

    final_y = get_y_data(closes, -1)
    
    #print(agent.ohlc)
    
    return final_x, final_y, closed_prices

In [3]:
path = "./" 

import data_util
from data_generator import DataGenerator

def load_dataset(dir):
    load_datasets([dir])

def load_datasets(dirs, resample, base_dir = "data/"):
    print(dirs)
    sets = []  
    for raw_dir in dirs:

        x, y, closed_prices = get_full_database(resample = resample,
                                             raw_dir = raw_dir, 
                                             base_dir = base_dir)

        final_data = split(x, y, 0.1, shuffle=False)

        save(final_data, raw_dir)
        sets.append((x, y))
    return sets

def conc_sets(sets):
    trainX = sets[0][0]
    trainY = sets[0][1]
    for i in range(1,  len(sets)):
        data_set = sets[i]
        trainX = np.append(data_set[0], trainX, axis = 0)
        trainY = np.append(data_set[1], trainY, axis = 0)
    
    trainX, trainY, valX, valY = split(trainX, trainY, 0.1, shuffle=True)
    
    return trainX, trainY, valX, valY


def load_simple_datasets(dirs, resample, base_dir = "data/"):
    print(f"Simple: {dirs}")
    sets = []  
    for raw_dir in dirs:

        x, y, closed_prices = get_full_database(resample = resample,
                                             raw_dir = raw_dir, 
                                             base_dir = base_dir)

        save((np.array(x), closed_prices), f"simple_{raw_dir}")
        sets.append((x, closed_prices))
    return sets

def conc_simple_sets(sets):
    trainX = sets[0][0]
    trainY = sets[0][1]
    for i in range(1,  len(sets)):
        data_set = sets[i]
        trainX = np.append(data_set[0], trainX, axis = 0)
        trainY = np.append(data_set[1], trainY, axis = 0)
  
    return trainX, trainY



In [4]:
%%time

#dirs = ["omgusd", "btceur", "btcusd", "ethusd", "ltcusd", "btcusd17", "btcusdAug19"]
dirs = ["omgusd", "btceur", "btcusd", "ethusd", "ltcusd"]
#dirs = ["omgusd"]

#sets = load_datasets(dirs, '2Min')
#save(conc_sets(sets))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs


In [8]:
%%time

dirs = ["omgusd", "btceur", "btcusd17", "btcusd", "ethusd", "ltcusd", "btcusdAug19"]
#dirs = ["omgusd", "btceur", "btcusd", "ethusd", "ltcusd"]
#dirs = ["omgusd", "4jul21"]

sets = load_simple_datasets(dirs, '2Min')


Simple: ['omgusd', 'btceur', 'btcusd17', 'btcusd', 'ethusd', 'ltcusd', 'btcusdAug19']
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing omgusd


  0%|          | 0/213310 [00:00<?, ?it/s]

Saving ./data/simple_omgusdtrainX.npy with (330, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing btceur


  0%|          | 0/656764 [00:00<?, ?it/s]

Saving ./data/simple_btceurtrainX.npy with (1905, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing btcusd17


  0%|          | 0/223969 [00:00<?, ?it/s]

Saving ./data/simple_btcusd17trainX.npy with (11214, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing btcusd


  0%|          | 0/388103 [00:00<?, ?it/s]

Saving ./data/simple_btcusdtrainX.npy with (381, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing ethusd


  0%|          | 0/639540 [00:00<?, ?it/s]

Saving ./data/simple_ethusdtrainX.npy with (1110, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing ltcusd


  0%|          | 0/543146 [00:00<?, ?it/s]

Saving ./data/simple_ltcusdtrainX.npy with (860, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing btcusdAug19


  0%|          | 0/888060 [00:00<?, ?it/s]

Saving ./data/simple_btcusdAug19trainX.npy with (17975, 16)
CPU times: user 1h 17min 8s, sys: 10min 12s, total: 1h 27min 21s
Wall time: 1h 39min 35s


In [9]:
save(conc_simple_sets(sets), prefix = "simple_full_")

Saving ./data/simple_full_trainX.npy with (33775, 16)


In [7]:
vals = ["backtest", "4jul21"]

#dirs = ["omgusd", "4jul21"]

load_simple_datasets(vals, '2Min')


Simple: ['backtest', '4jul21']
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing backtest


  0%|          | 0/1585895 [00:00<?, ?it/s]

Saving ./data/simple_backtesttrainX.npy with (2028, 16)
Resample 2Min - TecAn ( windows 20, windows_limit 100 )
Processing 4jul21


  0%|          | 0/301794 [00:00<?, ?it/s]

Saving ./data/simple_4jul21trainX.npy with (301, 16)


[([[2022.9571072390909,
    0.999978202704917,
    1562.9362986842507,
    -1584.8905,
    -792.1917500000001,
    1.0,
    0.08897150526610824,
    -7.192195226421886,
    -5.76507887078174,
    99.9707349240262,
    50.0021934544698,
    99.40423744784738,
    19.992563923376032,
    99.40423744784738,
    16172.023262932353,
    16355.873703761026],
   [3182.491617162593,
    0.9999782280731092,
    4679.683409532196,
    -3171.7720000000004,
    -1585.7115,
    1.0,
    0.1716970365842048,
    -7.375591561227257,
    -5.766153648793156,
    99.98380496821088,
    75.00109672753166,
    94.9843502409936,
    34.99088459146125,
    94.9843502409936,
    16200.331818877945,
    16365.075027071623],
   [3719.404122522542,
    0.993627238284921,
    9346.417483521562,
    -4749.733,
    -2374.8920000000003,
    0.9866878875153647,
    0.2554614432639077,
    -7.593241942562969,
    -5.801482473334914,
    99.52564208768744,
    87.50054836364079,
    88.50887878633887,
    45.6945086295