In [1]:
from fastai.tabular.all import *
from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Input directory holding the CSV sections, and output directory for exported models.
training_path = 'E:/BitBot/training_data_sections/'
models_path = 'E:/BitBot/models'
# exist_ok=True creates the directory when it is missing and is a no-op otherwise,
# avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(models_path, exist_ok=True)

In [39]:
# Read symbols and timestamps
# The slicing below treats the first 10 characters of every file name as its date
# prefix. Files whose name contains 'train' or 'val' contribute a train/validation
# date; every other CSV is a per-symbol data file named "<date>_<SYMBOL>.csv".
base_timestamps = set()
symbols, timestamps_train, timestamps_valid = set(), set(), set()
for filename in os.listdir(training_path):
    if not filename.lower().endswith('.csv'):
        # Skip stray entries (sub-directories, checkpoints, ...): they would otherwise
        # be turned into a bogus symbol by the slicing in the else-branch.
        continue
    date_prefix = filename[:10]
    if 'train' in filename:
        timestamps_train.add(date_prefix)
    elif 'val' in filename:
        timestamps_valid.add(date_prefix)
    else:
        symbols.add(filename[11:-4])
        base_timestamps.add(date_prefix)

if not base_timestamps:
    raise FileNotFoundError(f"No per-symbol data files found in {training_path}")
# set.pop() returns an arbitrary element; min() gives the same result when there is a
# single base date and a deterministic one (the earliest) when there are several.
first_timestamp = min(base_timestamps)

symbols = sorted(symbols)
# Pair the i-th training date with the i-th validation date (both sorted ascending).
# zip() stops at the shorter sequence, so unpaired dates are dropped.
timestamps_valid = dict(zip(sorted(timestamps_train), sorted(timestamps_valid)))
timestamps_train = list(timestamps_valid)

print(symbols)
# Slicing already clamps to the string length, so no explicit min() is needed.
print(str(timestamps_valid)[:200])

['ADAUSDT', 'ATOMUSDT', 'BCHUSDT', 'BNBUSDT', 'BTCUSDT', 'BTTUSDT', 'CHZUSDT', 'DOGEUSDT', 'EOSUSDT', 'ETCUSDT', 'ETHUSDT', 'FTMUSDT', 'LINKUSDT', 'LTCUSDT', 'MATICUSDT', 'NEOUSDT', 'THETAUSDT', 'TRXUSDT', 'VETUSDT', 'XLMUSDT', 'XRPUSDT']
{'2020-01-03': '2020-07-03', '2020-01-04': '2020-07-04', '2020-01-05': '2020-07-05', '2020-01-06': '2020-07-06', '2020-01-07': '2020-07-07', '2020-01-08': '2020-07-08', '2020-01-09': '2020-07-09', '20


In [40]:
# Read training data
# For a quick smoke test, shrink `symbols` / `timestamps_train` to a few entries
# before running this cell.

# One full data frame per symbol, indexed by its 'ind_idx' column.
training_data = {}
for symbol in tqdm(symbols, desc='Read data'):
    filename = first_timestamp + "_" + symbol + ".csv"
    training_data[symbol] = pd.read_csv(training_path + filename, index_col='ind_idx')


def _read_indices(path):
    """Return the 'ind_idx' column of the CSV at `path` as a plain Python list."""
    # usecols makes pandas parse only the one column that is actually used.
    return pd.read_csv(path, usecols=['ind_idx'])['ind_idx'].tolist()


# train_idc[timestamp][symbol] / val_idc[timestamp][symbol] -> list of 'ind_idx' values.
# Both dictionaries are keyed by the *training* timestamp; the validation file is
# located through timestamps_valid[timestamp].
train_idc, val_idc = {}, {}
for timestamp in tqdm(timestamps_train, desc='Read indices'):
    train_idc[timestamp] = {
        symbol: _read_indices(training_path + timestamp + "_train_" + symbol + ".csv")
        for symbol in symbols
    }
    val_idc[timestamp] = {
        symbol: _read_indices(training_path + timestamps_valid[timestamp] + "_val_" + symbol + ".csv")
        for symbol in symbols
    }

HBox(children=(FloatProgress(value=0.0, description='Read data', max=21.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Read indices', max=456.0, style=ProgressStyle(description…




In [None]:

    # NOTE(review): this cell is an orphaned function body. Its `def` line and the
    # initialisation of `dfs_train` / `dfs_valid` are not present, so the cell cannot
    # run as written (`return` at cell level, lists appended to before assignment).
    # It looks like an earlier, file-scanning way of building the train/valid frames
    # -- TODO confirm, then either restore the header or delete the cell.
    # Column count of the first matching file; later files must have the same count.
    column_count = 0
    for filename in os.listdir(training_path):
        # Only files whose name contains one of the two timestamps are read.
        is_train = timestamp_train in filename
        is_valid = timestamp_valid in filename
        if not is_train and not is_valid:
            continue
        df = pd.read_csv(training_path + filename)
        if column_count == 0:
            column_count = len(df.columns)
        elif column_count != len(df.columns):
            # A file with a different number of columns aborts the whole load.
            print("Bad file", training_path + filename)
            print(df.columns)
            return None, None
        # NOTE(review): this branch tests for 'valid' in the file name, whereas the
        # timestamp-discovery cell tests for 'val' -- verify which naming is current.
        if timestamp_train in filename and 'train' in filename:            
            dfs_train.append(df)
        elif timestamp_valid in filename and 'valid' in filename:
            dfs_valid.append(df)
    dfs_train, dfs_valid = pd.concat(dfs_train), pd.concat(dfs_valid)
    return dfs_train, dfs_valid

In [63]:
def make_training_data(timestamp):
    """Assemble the training and validation frames for one training timestamp.

    For every symbol in the notebook-level `symbols` list, rows of
    `training_data[symbol]` are selected with the integer lists
    `train_idc[timestamp][symbol]` and `val_idc[timestamp][symbol]`. The per-symbol
    selections are concatenated in symbol order.

    Args:
        timestamp: training timestamp; key into `train_idc` and `val_idc`.

    Returns:
        (dfs_train, dfs_valid): two DataFrames, each with a fresh 0..n-1 index.
    """
    train_parts, valid_parts = [], []
    for symbol in symbols:
        frame = training_data[symbol]
        # The stored integers are used as positions into the frame's index; the
        # labels found there are then looked up with .loc (kept exactly as before).
        train_parts.append(frame.loc[frame.index[train_idc[timestamp][symbol]]])
        valid_parts.append(frame.loc[frame.index[val_idc[timestamp][symbol]]])
    # ignore_index=True renumbers the rows 0..n-1 during the concat, replacing the
    # separate in-place reset_index(drop=True) calls.
    dfs_train = pd.concat(train_parts, ignore_index=True)
    dfs_valid = pd.concat(valid_parts, ignore_index=True)
    return dfs_train, dfs_valid

#timestamp = timestamps_train[0]
#df_train, df_valid = make_training_data(timestamp)
#print(df_valid[['ADAUSDT', 'ATOMUSDT', 'BTCUSDT']])

In [64]:
def make_splits(dfs_train, dfs_valid):
    """Stack the train and validation frames and return positional split indices.

    Args:
        dfs_train: DataFrame holding the training rows.
        dfs_valid: DataFrame holding the validation rows.

    Returns:
        (df, splits): `df` is `dfs_train` followed by `dfs_valid`, indexed 0..n-1.
        `splits` is `[train_positions, valid_positions]`, two lists of ints that
        address the rows of `df`.
    """
    len_train, len_valid = len(dfs_train), len(dfs_valid)
    splits = [
        list(range(len_train)),
        list(range(len_train, len_train + len_valid)),
    ]
    # Without ignore_index the stacked frame keeps both inputs' labels (e.g. 0..a-1
    # followed by 0..b-1), so the labels are duplicated and disagree with the positions
    # in `splits`. Renumbering makes label- and position-based access equivalent.
    df = pd.concat([dfs_train, dfs_valid], ignore_index=True)
    return df, splits

In [81]:
def make_dataloader(df, splits, y_count=13, bs=2**10):
    """Wrap `df` in a fastai `TabularPandas` and return its dataloaders.

    The column roles are derived purely from column position, read right to left:
    the last `y_count` columns are the targets, the `len(symbols)` columns before
    them are categorical, and everything between column 1 and the categorical block
    is continuous. Column 0 is skipped by the continuous slice.

    Args:
        df: combined train + validation frame.
        splits: `[train_positions, valid_positions]` as produced by `make_splits`.
        y_count: number of trailing target columns (default 13, the previous
            hard-coded value).
        bs: batch size passed to `dataloaders` (default 2**10, as before).

    Returns:
        The object returned by `TabularPandas.dataloaders`.

    Raises:
        ValueError: if `y_count` is not positive or `df` has too few columns for the
            assumed layout.
    """
    if y_count < 1:
        raise ValueError("y_count must be at least 1")
    columns = list(df.columns)
    # Positive boundaries are equivalent to the former negative slices whenever the
    # frame is wide enough, and let a too-narrow frame be reported explicitly.
    y_start = len(columns) - y_count
    cat_start = y_start - len(symbols)
    if cat_start < 0:
        raise ValueError(
            f"df has {len(columns)} columns but needs at least {len(symbols) + y_count} "
            f"({len(symbols)} categorical + {y_count} target columns)"
        )
    cat_names = columns[cat_start:y_start]
    cont_names = columns[1:cat_start]
    y_names = columns[y_start:]
    to = TabularPandas(df, procs=[Categorify], cat_names=cat_names, cont_names=cont_names, y_names=y_names, splits=splits)
    return to.dataloaders(bs=bs)

In [82]:
def train(dataloader, layers=(500, 400, 300, 200, 150, 100, 50), epochs=5, lr_max=5e-5):
    """Create a fastai tabular learner on `dataloader` and fit it with one-cycle.

    Args:
        dataloader: dataloaders object as returned by `make_dataloader`.
        layers: hidden layer sizes passed to `tabular_learner` (default: the
            previously hard-coded architecture).
        epochs: number of epochs passed to `fit_one_cycle` (default 5).
        lr_max: `lr_max` passed to `fit_one_cycle` (default 5e-5).

    Returns:
        The fitted learner.
    """
    # The default is an immutable tuple; it is converted so `tabular_learner` still
    # receives a list exactly as before.
    learn = tabular_learner(dataloader, layers=list(layers), metrics=rmse)
    learn.fit_one_cycle(epochs, lr_max=lr_max)
    return learn

In [83]:
def make_predictions(timestamp, df, learn, splits):
    dl_train = DataLoader(dataset=df.iloc[splits[0]])
    df_val = DataLoader(dataset=df.iloc[splits[1]])
    df_train, df_val = df.iloc[splits[0]], df.iloc[splits[1]]
    dl_train = learn.dls.test_dl(df_train)
    dl_val = learn.dls.test_dl(df_val)
    pred_train, gt_train = learn.get_preds(dl=dl_train)
    pred_val, gt_val = learn.get_preds(dl=dl_val)
    os.makedirs('preds', exist_ok=True)
    with open(f'preds/preds_{timestamp}.pickle', 'wb') as f:
        pickle.dump({
            'pred_train': pred_train.squeeze(),
            'gt_train': gt_train.squeeze(),
            'pred_val': pred_val.squeeze(),
            'gt_val': gt_val.squeeze()
        }, f)

In [None]:
#started = False
for timestamp_train in timestamps_train:
    #if timestamp_train == "2020-07-13":
    #    started = True
    #if not started:
    #    continue
    
    timestamp_valid = timestamps_valid[timestamp_train]
    print(f"{timestamp_train} - {timestamp_valid}")
    dfs_train, dfs_valid = make_training_data(timestamp_train)
    df, splits = make_splits(dfs_train, dfs_valid)
    dataloader = make_dataloader(df, splits)
    learn = train(dataloader)
    make_predictions(timestamp_train, df, learn, splits)
    learn.export(models_path + f"/model_{timestamp_train}_{timestamp_valid}.pickle")
    
    break

2020-01-03 - 2020-07-03


In [None]:
# Display `df`; when the cells are run top to bottom this is the combined
# train + validation frame left behind by the training loop.
df

In [None]:
# Ask the learner left by the training loop to display results, with max_n=10.
learn.show_results(max_n=10)

In [None]:
# Build a test dataloader from the current `df` using the learner's dataloaders.
test_dl = learn.dls.test_dl(df)

In [None]:
# Load one stored validation section for an ad-hoc prediction check. The path is
# built from `training_path` so it follows the configured data directory.
# NOTE: this rebinds `df`, replacing the frame built by the training loop.
df = pd.read_csv(training_path + "2021-07-21_XRPUSDT_valid.csv")

In [None]:
# Rebuild the test dataloader, now from whatever `df` currently holds (the same
# statement as the earlier test_dl cell).
test_dl = learn.dls.test_dl(df)

In [None]:
# Run the learner on the test dataloader. Elsewhere in this notebook the result of
# get_preds is unpacked as a (predictions, targets) pair.
preds = learn.get_preds(dl=test_dl)

In [None]:
# Show the first element of `preds` converted with .numpy().
preds[0].numpy()