In [1]:
import pandas as pd
import data_util
from tqdm import tqdm_notebook as tqdm
from data_generator import DataGenerator
from state_util import StateUtil
from tec_an import TecAn
import numpy as np
from data_util import *
import threading
import multiprocessing
from data_agent import DataAgent, TacProcess


In [2]:
# Look-ahead horizon handed to StateUtil (as `future=`) inside get_tecs().
# NOTE(review): the unit (raw steps vs. resampled bars) is not visible here — confirm in state_util.
future = 10

def get_tecs(raw_dir = "stock_data/"):
    """Replay all records under ``raw_dir`` and evaluate every TecAn indicator on them.

    The generator is rewound, ``max_steps()`` records are read, and the price and
    amount of each record (looked up with StateUtil's PRICE_KEY / AMOUNT_KEY) are
    gathered into a two-column frame ('Close', 'Volume').

    Args:
        raw_dir: directory passed to DataGenerator as ``base_dir``.

    Returns:
        A list with one entry per callable in ``TecAn().tas``, each being the
        result of ``ta(close, volume, 0.0, 0.0)``.
        NOTE(review): the meaning of the two trailing 0.0 arguments is defined
        in tec_an — confirm there.
    """
    generator = DataGenerator(random=False, base_dir = raw_dir)
    generator.rewind()
    state_util = StateUtil(generator, future = future)
    total_steps = generator.max_steps()
    analyser = TecAn()

    # One [price, amount] row per raw record, in generator order.
    rows = []
    for _ in tqdm(range(total_steps)):
        record = generator.next()
        rows.append([record[state_util.PRICE_KEY], record[state_util.AMOUNT_KEY]])

    print("Data {}".format(len(rows)))
    frame = pd.DataFrame(rows, columns = ['Close', 'Volume'])
    close, volume = frame['Close'], frame['Volume']
    return [indicator(close, volume, 0.0, 0.0) for indicator in analyser.tas]


In [12]:
def save(data_set, prefix = "", base_dir = "data/"):
    """Write a (trainX, trainY, valX, valY) data set to four ``.npy`` files.

    The files are named ``<path><base_dir><prefix>trainX.npy`` (and likewise
    for trainY, valX, valY). ``path`` is a notebook-level global that is
    assigned in a later cell, so that cell must have been run before this
    function is called.

    The directory that will hold the files is created when it does not exist
    yet; ``np.save`` on its own raises FileNotFoundError in that situation.

    Args:
        data_set: indexable whose items 0..3 are trainX, trainY, valX, valY.
            trainX must expose ``.shape`` (it is reported in the log line).
        prefix: text placed directly in front of each file name.
        base_dir: directory (with trailing separator) appended to ``path``.
    """
    import os  # local import: the notebook's import cell does not provide os

    trainX, trainY, valX, valY = data_set[0], data_set[1], data_set[2], data_set[3]
    final_path = path + base_dir
    train_path = "{}{}trainX.npy".format(final_path, prefix)

    # All four files share one parent directory; make sure it exists.
    target_dir = os.path.dirname(train_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    np.save(train_path, trainX)
    np.save("{}{}trainY.npy".format(final_path, prefix), trainY)
    np.save("{}{}valX.npy".format(final_path, prefix), valX)
    np.save("{}{}valY.npy".format(final_path, prefix), valY)
    print("Saving {} with {}".format(train_path, trainX.shape))

def get_y_data(ohlc, shift = -1):
    """Derive a binary direction label for every row of a single-column price frame.

    For each row the value is divided by the value ``shift`` rows away
    (``DataFrame.shift(shift)``; a negative shift looks ahead). The label is 1
    where that ratio is below 1, i.e. for a negative shift where the later
    value is higher than the current one, and 0 otherwise. Rows whose shifted
    partner does not exist yield a NaN ratio and therefore the label 0, and a
    ratio of exactly 1 is also labelled 0.

    The input frame is left untouched: the helper columns are added to a copy,
    so the function can be called repeatedly on the same frame.

    Args:
        ohlc: DataFrame with one price column.
        shift: row offset passed to ``DataFrame.shift``.

    Returns:
        The 'direction' column (a Series of 0/1) aligned with ``ohlc``'s index.
    """
    # Copy first: writing 'return'/'direction' into the caller's frame would
    # change what a later call on that same frame computes.
    combined_data = ohlc.copy()
    returns = (ohlc / ohlc.shift(shift))
    combined_data['return'] = returns
    combined_data['direction'] = np.where(combined_data['return'] < 1, 1, 0)
    return combined_data['direction']

def split(x, y, split):
    """Cut features and labels into a validation head and a training tail.

    The first ``int(len(y) * split)`` items of both sequences become the
    validation part; everything after that becomes the training part.

    Args:
        x: sliceable feature sequence whose slices support ``.copy()``.
        y: sliceable label sequence whose slices support ``.copy()``.
        split: fraction of ``len(y)`` assigned to validation.

    Returns:
        ``(trainX, trainY, valX, valY)`` as numpy arrays.
    """
    cut = int(len(y) * split)
    head, tail = slice(None, cut), slice(cut, None)

    trainX, trainY = (np.array(part[tail].copy()) for part in (x, y))
    valX, valY = (np.array(part[head].copy()) for part in (x, y))

    shapes = (trainX.shape, trainY.shape, valX.shape, valY.shape)
    print("Completed: {} {} {} {}".format(*shapes))
    return trainX, trainY, valX, valY


# String keys / column names for this notebook.
# Only CLOSE is referenced in the code visible here (the column name of the
# closing-price frame built in load_datasets). The other names presumably
# mirror keys of the raw records produced by DataGenerator — TODO confirm
# against data_generator / state_util, or remove if unused.
TIMESTAMP_KEY = "timestamp"
MICROTIMESTAMP_KEY = "microtimestamp"
ASKS_KEY = "asks"
BIDS_KEY = "bids"
PRICE_KEY = "price"
AMOUNT_KEY = "amount"
CLOSE = 'close'

In [15]:
# Root directory that save() prepends to its `base_dir` when building output paths.
# save() is defined in an earlier cell but reads this global at call time, so this
# cell has to run before save() is called.
path = "./" 

# NOTE(review): `windows` is not referenced by any code visible in this notebook.
windows = 30

# These two imports repeat imports already made in the notebook's first cell.
import data_util
from data_generator import DataGenerator

def load_dataset(dir, resample = '2Min'):
    """Load, label, split and save the data of a single raw-data directory.

    Convenience wrapper around load_datasets() for exactly one directory.
    Previously the required ``resample`` argument was not forwarded (so the
    call always raised TypeError) and the result was discarded.

    Args:
        dir: directory name below load_datasets' ``base_dir``. The name
            shadows the builtin ``dir`` but is kept for existing callers.
        resample: resampling rule forwarded to load_datasets; the default is
            the '2Min' value used by the notebook's driver cell.

    Returns:
        The ``(trainX, trainY, valX, valY)`` tuple built for that directory.
    """
    return load_datasets([dir], resample)[0]

def load_datasets(dirs, resample, base_dir = "data/"):
    """Build, save and return one train/validation data set per raw-data directory.

    For every entry of ``dirs`` the records under ``base_dir + entry + "/"`` are
    replayed through a DataAgent (all but the last 100 generator steps). The
    agent's ``on_new_data`` callback collects the feature rows and its
    ``on_closed_price`` callback collects the closing prices. Labels come from
    get_y_data(closes, -2), the data is divided with split(..., 0.1), and the
    result is written with save() using the directory name as file prefix
    (save() uses its own default ``base_dir``).

    Args:
        dirs: iterable of directory names below ``base_dir``.
        resample: value forwarded to DataAgent as ``resample``.
        base_dir: parent directory of the raw-data directories.

    Returns:
        A list of ``(trainX, trainY, valX, valY)`` tuples, in ``dirs`` order.
    """
    print(dirs)
    collected = []
    for raw_dir in dirs:
        source_dir = base_dir + raw_dir + "/"
        generator = DataGenerator(random = False, base_dir = source_dir)
        generator.rewind()
        # The final 100 generator steps are never replayed.
        step_total = generator.steps - 100

        features = []
        close_prices = []

        def on_new_data(x):
            return features.append(x)

        def on_closed_price(price):
            return close_prices.append(price)

        agent = DataAgent(
            resample = resample,
            on_new_data = on_new_data,
            on_closed_price = on_closed_price
        )

        print("Processing {}".format(raw_dir))

        for _ in tqdm(range(step_total)):
            agent.on_new_raw_data(generator.next())

        # Not used further down; the attribute read is kept as in the original flow.
        ohlc = agent.ohlc

        close_frame = pd.DataFrame(close_prices, columns = [CLOSE])
        labels = get_y_data(close_frame, -2)

        dataset = split(features, labels, 0.1)
        save(dataset, raw_dir)
        collected.append(dataset)
    return collected

def conc_sets(sets):
    """Merge several (trainX, trainY, valX, valY) data sets into a single one.

    Each of the four parts is concatenated along axis 0. Later data sets are
    placed in front of earlier ones (the last entry of ``sets`` comes first),
    which is the order the previous pairwise ``np.append`` loop produced. Every
    part is now joined with one ``np.concatenate`` call instead of recopying
    the accumulated arrays on each iteration.

    Args:
        sets: indexable collection of data sets; each data set is indexable
            with items 0..3 being trainX, trainY, valX, valY.

    Returns:
        ``(trainX, trainY, valX, valY)``. With a single data set its four
        parts are returned as they are, without copying.

    Raises:
        IndexError: if ``sets`` is empty.
    """
    count = len(sets)
    if count == 0:
        # Same exception type that indexing the empty input used to raise.
        raise IndexError("conc_sets() needs at least one data set")
    if count == 1:
        only = sets[0]
        return only[0], only[1], only[2], only[3]

    # Newest first, to reproduce the ordering of the former prepend loop.
    newest_first = [sets[i] for i in range(count - 1, -1, -1)]
    trainX, trainY, valX, valY = (
        np.concatenate([data_set[part] for data_set in newest_first], axis = 0)
        for part in range(4)
    )
    return trainX, trainY, valX, valY



In [None]:
# Markets to process. Each name is a sub-directory of load_datasets' base_dir
# ("data/" by default) and doubles as the file prefix used by save().
# Alternative selection (full list):
#dirs = ["omgusd", "btceur", "btcusd", "ethusd", "ltcusd", "btcusd17", "btcusdAug19"]
dirs = ["omgusd", "btceur", "btcusd", "ethusd", "ltcusd"]
# Alternative selection (single market):
#dirs = ["omgusd"]

# Build and save one data set per market ('2Min' is forwarded to DataAgent as
# `resample`), then save the concatenation of all sets with an empty prefix.
sets = load_datasets(dirs, '2Min')
save(conc_sets(sets))

['omgusd', 'btceur', 'btcusd', 'ethusd', 'ltcusd']
Processing omgusd


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(data_count)):


  0%|          | 0/213310 [00:00<?, ?it/s]

Completed: (675, 26) (675,) (75, 26) (75,)
Saving ./data/omgusdtrainX.npy with (675, 26)
Processing btceur


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(data_count)):


  0%|          | 0/656764 [00:00<?, ?it/s]

In [9]:
# trainX of the first data set returned by load_datasets (i.e. for dirs[0]).
# NOTE(review): this cell's execution count (9) is lower than that of the cells
# above (12, 15), and the single-column output shown below does not match the
# (675, 26) shape logged for omgusd — the displayed output looks stale; re-run.
sets[0][0]

array([[5.95],
       [5.95],
       [5.92],
       [5.92],
       [5.9 ],
       [5.92],
       [5.97],
       [6.01],
       [5.98],
       [6.01],
       [6.  ],
       [5.99],
       [5.99],
       [5.99],
       [6.03],
       [6.03],
       [6.03],
       [5.97]])