In [53]:
import numpy as np 
import pandas as pd 
import json 
import ccxt 
import seaborn as sns
import os 
import pandas_ta as ta 
import time
from datetime import datetime, timedelta
import math
from tqdm.auto import tqdm 
import matplotlib.pyplot as plt 
from transformers import * 
import torch 
from torch import Tensor 
from torch.utils.data import * 
import torch.nn as nn 
import torch.nn.functional as F 
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from pytorch_metric_learning import miners, losses
from pytorch_metric_learning.distances import CosineSimilarity
from scipy.spatial.distance import cdist 
import pickle
import warnings
warnings.filterwarnings('ignore')

from ts2vec import TS2Vec

In [54]:
# Build the TS2Vec encoder: one input feature per timestep, 128-dimensional output
# representations. `device=0` is passed straight through to TS2Vec (presumably CUDA index 0).
chart_embedder = TS2Vec(input_dims=1, device=0, output_dims=128) 
# Later cells call `model.fit` / `model.save` / `model.encode`, but no visible cell ever binds
# `model`, so a fresh-kernel "Run All" stops with NameError. Bind it to the same instance.
model = chart_embedder

In [55]:
# Echo the embedder object as the cell output to confirm it was constructed.
chart_embedder

<ts2vec.TS2Vec at 0x7f15ec5f43d0>

In [56]:
# Read the raw chart rows from disk; the rows are addressed positionally below.
with open("BTC_USDT-1h-12.json") as f:
    d = json.load(f)

# Map the positional column indices 0..5 onto their OHLCV names.
ohlcv_names = ["timestamp", "open", "high", "low", "close", "volume"]
chart_df = pd.DataFrame(d).rename(columns=dict(enumerate(ohlcv_names)))

def process(df):
    """Replace the epoch-millisecond ``timestamp`` column with a ``datetime`` string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``timestamp`` column of epoch milliseconds.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``timestamp`` and with a trailing ``datetime`` column of
        UTC strings formatted ``YYYY-MM-DD HH:MM:SS``. The input frame is not modified.
    """
    # The int64 cast mirrors the former per-value int() conversion.
    epoch_ms = df["timestamp"].astype("int64")
    # Vectorised conversion: no exchange client and no per-row Python loop are needed
    # just to format timestamps. strftime drops any sub-second part.
    datetime_strings = pd.to_datetime(epoch_ms, unit="ms").dt.strftime("%Y-%m-%d %H:%M:%S")
    # drop() returns a copy and assign() appends the new column last, so the caller's frame
    # is left untouched while the resulting column order stays the same as before.
    return df.drop(columns=["timestamp"]).assign(datetime=datetime_strings)

chart_df = process(chart_df)

# Parse the "YYYY-MM-DD HH:MM:SS" strings once, vectorised, instead of calling
# pd.to_datetime separately for every row.
parsed_datetimes = pd.to_datetime(chart_df["datetime"], format="%Y-%m-%d %H:%M:%S")

# Calendar components as integer columns. The explicit int64 cast keeps the dtype that
# assigning lists of Python ints produced, independent of the pandas version.
chart_df["hours"] = parsed_datetimes.dt.hour.astype("int64")
chart_df["days"] = parsed_datetimes.dt.day.astype("int64")
chart_df["months"] = parsed_datetimes.dt.month.astype("int64")
chart_df["years"] = parsed_datetimes.dt.year.astype("int64")

# Plain arrays used by the windowing cells below.
close = chart_df["close"].values
datetimes = chart_df["datetime"].values

  0%|          | 0/47346 [00:00<?, ?it/s]

In [57]:
# Window sizes are counted in rows of the chart; the threshold is a percentage.
lookback_window = 24
forecast_window = 6
threshold = 1

# A row is "anomalous" when its close moved by at least `threshold` percent (up or down)
# relative to the previous row's close. Rows without a full lookback before them or a full
# forecast window after them are skipped.
anomalous_datetimes = []
for idx in range(lookback_window, close.shape[0] - forecast_window):
    prev_close, last_close = close[idx - 1], close[idx]
    pct_change = (last_close - prev_close) / prev_close * 100
    if abs(pct_change) >= threshold:
        anomalous_datetimes.append(
            datetime.strptime(str(datetimes[idx]), "%Y-%m-%d %H:%M:%S")
        )

In [58]:
# Index every usable row by its parsed timestamp. Each value is a pair:
#   (the last `lookback_window` closes ending at that row,
#    the next `forecast_window` closes after it).
date_chart_df = {}
for idx in tqdm(range(lookback_window, len(datetimes) - forecast_window)):
    row_time = datetime.strptime(str(datetimes[idx]), "%Y-%m-%d %H:%M:%S")
    history = close[idx - lookback_window + 1 : idx + 1]
    horizon = close[idx + 1 : idx + 1 + forecast_window]
    date_chart_df[row_time] = (history, horizon)
    
# for fairness of comparison     

# Load the news table and pull out the text columns and the date-part columns.
news = pd.read_csv("full_news_22_01_16.csv")
titles = news["titles"].values
contents = news["contents"].values

years, months, days, hours = (
    news[part].values for part in ("year", "month", "day", "hour")
)

# One datetime per news row, at hour resolution (minutes and seconds fixed to zero).
news_dtobjects = [
    datetime.strptime(f"{yr}-{mon}-{dy} {hr}:00:00", "%Y-%m-%d %H:%M:%S")
    for yr, mon, dy, hr in tqdm(
        zip(years, months, days, hours), total=len(years), position=0, leave=True
    )
]

# We cannot consider sections where news data is not available.
# Keep only anomalies with at least one news item in the 24 hours up to and including the
# anomaly time; each entry of `searchable` is (anomaly datetime, news count).
#
# The news times are sorted once and binary-searched. This replaces a rescan of the news
# list from its start for every anomaly, and it no longer depends on the CSV rows already
# being in ascending time order (the previous early `break` undercounted otherwise).
news_times_sorted = np.array(sorted(news_dtobjects), dtype="datetime64[s]")
window_ends = np.array(anomalous_datetimes, dtype="datetime64[s]")
window_starts = window_ends - np.timedelta64(24, "h")

# side="left" on the start and side="right" on the end make both window bounds inclusive.
first_in_window = np.searchsorted(news_times_sorted, window_starts, side="left")
first_after_window = np.searchsorted(news_times_sorted, window_ends, side="right")
news_counts = first_after_window - first_in_window

searchable = [
    (anomaly_dt, int(n_news))
    for anomaly_dt, n_news in zip(anomalous_datetimes, news_counts)
    if n_news > 0
]

# Label each searchable anomaly by the direction of its last one-step move:
#   0 -> close rose by at least `threshold` percent, 1 -> close fell by at least that much.
# The shared `threshold` is used (instead of a repeated literal 1.0) so the labels cannot
# drift from the anomaly definition and leave some searchable dates unlabelled.
past_input_dir = {}
for entry in tqdm(searchable, position=0, leave=True):
    cur_date = entry[0]
    past_input = date_chart_df[cur_date][0]
    delta = (past_input[-1] - past_input[-2]) / past_input[-2] * 100.0
    if delta >= threshold:
        past_input_dir[cur_date] = 0
    elif delta <= -threshold:
        past_input_dir[cur_date] = 1

  0%|          | 0/47316 [00:00<?, ?it/s]

  0%|          | 0/113954 [00:00<?, ?it/s]

  0%|          | 0/6452 [00:00<?, ?it/s]

  0%|          | 0/4453 [00:00<?, ?it/s]

In [59]:
# Hold out the last TEST_SIZE entries of `searchable` as the test set; everything before
# them is training data. An explicit split index (rather than the `[:-n]` / `[-n:]` pair)
# keeps the two slices complementary even when TEST_SIZE is 0 or exceeds the list length.
TEST_SIZE = 871
split_index = max(len(searchable) - TEST_SIZE, 0)
train_data = searchable[:split_index]
test_data = searchable[split_index:]

In [65]:
# For every training anomaly, turn its lookback window of closes into step-over-step
# price ratios (close[t] / close[t-1]); a window of k closes yields k-1 ratios.
train_seqs = []
for entry in tqdm(train_data, position=0, leave=True):
    window_closes = np.asarray(date_chart_df[entry[0]][0])
    train_seqs.append(window_closes[1:] / window_closes[:-1])

  0%|          | 0/3582 [00:00<?, ?it/s]

In [69]:
# Stack into (n_samples, timesteps, 1), adding a trailing single-feature axis.
# Each window holds `lookback_window` closes and therefore `lookback_window - 1` ratios, so
# the timestep count is derived from the config instead of being hard-coded as 23.
train_seqs = np.array(train_seqs).reshape((-1, lookback_window - 1, 1))

train_seqs.shape

(3582, 23, 1)

In [71]:
# Fit the model on the training return sequences, requesting 200 epochs with verbose logging.
# `model` must already be bound when this cell runs (presumably to the TS2Vec instance built
# near the top of the notebook; confirm).
# NOTE(review): the recorded run below was stopped by KeyboardInterrupt after epoch 56; when
# the call is interrupted like that, the assignment to `loss_log` does not happen.
loss_log = model.fit(train_seqs, n_epochs=200, verbose=True) 

Epoch #28: loss=2.567473692744302
Epoch #29: loss=2.4774455649970357
Epoch #30: loss=2.4859955989726457
Epoch #31: loss=2.449990903315523
Epoch #32: loss=2.446621531328278
Epoch #33: loss=2.4728212597124246
Epoch #34: loss=2.4471188115432123
Epoch #35: loss=2.4480880391971946
Epoch #36: loss=2.4788530231056725
Epoch #37: loss=2.4300374541047445
Epoch #38: loss=2.4286659047208023
Epoch #39: loss=2.4356579053562317
Epoch #40: loss=2.428581115376254
Epoch #41: loss=2.4518371857869785
Epoch #42: loss=2.4394399670742017
Epoch #43: loss=2.4545609218657285
Epoch #44: loss=2.425500627590402
Epoch #45: loss=2.430259756443212
Epoch #46: loss=2.4145699285070994
Epoch #47: loss=2.441505257324253
Epoch #48: loss=2.4070021111868956
Epoch #49: loss=2.4143002439507453
Epoch #50: loss=2.4267249390683365
Epoch #51: loss=2.4373203002818498
Epoch #52: loss=2.4248271426812416
Epoch #53: loss=2.415366542178954
Epoch #54: loss=2.412693006040804
Epoch #55: loss=2.426825944618259
Epoch #56: loss=2.382252016944

KeyboardInterrupt: 

In [72]:
# Save the model through its own `save` method under the name "ts2vec_test", in whatever
# state training had reached when this cell ran.
model.save("ts2vec_test") 

In [73]:
# For every test anomaly, turn its lookback window of closes into step-over-step
# price ratios (close[t] / close[t-1]); a window of k closes yields k-1 ratios.
test_seqs = []
for entry in tqdm(test_data, position=0, leave=True):
    window_closes = np.asarray(date_chart_df[entry[0]][0])
    test_seqs.append(window_closes[1:] / window_closes[:-1])

  0%|          | 0/871 [00:00<?, ?it/s]

In [75]:
# Stack into (n_samples, timesteps, 1), adding a trailing single-feature axis.
# Each window holds `lookback_window` closes and therefore `lookback_window - 1` ratios, so
# the timestep count is derived from the config instead of being hard-coded as 23.
test_seqs = np.array(test_seqs).reshape((-1, lookback_window - 1, 1))
test_seqs.shape

(871, 23, 1)

In [77]:
# Encode the test return sequences with the model and keep the result for saving below.
outputs = model.encode(test_seqs)

In [78]:
# Show the shape of the encoded test array.
outputs.shape

(871, 23, 128)

In [80]:
# Write the encoded test array to disk. np.save appends ".npy" when the given name has no
# such extension, so the file is expected to be "ts2vec_embeddings.npy".
np.save("ts2vec_embeddings", outputs) 

In [81]:
# Show both input shapes side by side before concatenating them.
train_seqs.shape, test_seqs.shape

((3582, 23, 1), (871, 23, 1))

In [87]:
# Stack the train and test return sequences along the sample axis (train rows first).
# NOTE(review): despite the `_emb` suffix, this array holds the model *inputs*
# (return sequences), not encoded outputs.
full_chart_emb = np.concatenate([train_seqs, test_seqs], axis=0)


In [89]:
# Encode every sequence (train + test) and display only the shape of the result; the encoded
# array itself is not assigned to a variable here.
model.encode(full_chart_emb).shape

(4453, 23, 128)