In [1]:
import numpy as np 
import pandas as pd 
import json 
import ccxt 
import seaborn as sns
import os 
import pandas_ta as ta 
import time
from datetime import datetime, timedelta
import math
from tqdm.auto import tqdm 
import matplotlib.pyplot as plt 
from transformers import * 
import torch 
from torch import Tensor 
from torch.utils.data import * 
import torch.nn as nn 
import torch.nn.functional as F 
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from pytorch_metric_learning import miners, losses
from pytorch_metric_learning.distances import CosineSimilarity
from scipy.spatial.distance import cdist 
import pickle



In [2]:
# Plotting utility

def plot_series(x_series, y_series):
    """Draw an input series and an output series as one continuous line plot.

    `x_series` is plotted first; `y_series` is plotted at x positions that
    start at ``len(x_series)``, so it appears as the continuation of the input.
    Both curves go onto the current matplotlib axes together with a legend.
    """
    n_past = len(x_series)
    future_positions = np.arange(n_past, n_past + len(y_series))

    plt.plot(x_series, label = 'past_series')
    plt.plot(future_positions, y_series, label = 'future_series')
    plt.legend()

### Augmented data

In [3]:
# Load the saved augmentation lookup. Later cells index it with a datetime
# and read back a list of datetimes.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load "simcse_aug.pkl" if it comes from a trusted source.
with open("simcse_aug.pkl", "rb") as handle: 
    simcse_aug_saved = pickle.load(handle) 

In [6]:
# A few hand-picked datetimes; the next cell uses the first one to look up
# an entry of simcse_aug_saved.
samples = [datetime(2017, 9, 29, 11, 0), datetime(2017, 10, 3, 9, 0), datetime(2018, 2, 28, 3, 0)]

In [7]:
# Display the lookup entry stored for the first sample datetime.
simcse_aug_saved[samples[0]]

[datetime.datetime(2017, 8, 31, 0, 0),
 datetime.datetime(2017, 8, 23, 14, 0),
 datetime.datetime(2017, 9, 17, 11, 0)]

In [8]:
# Read the raw chart rows from disk.
with open("BTC_USDT-1h-12.json") as f:
    d = json.load(f)

# The rows are positional, so give columns 0-5 their names.
column_names = {0:"timestamp", 1:"open", 2:"high", 3:"low", 4:"close", 5:"volume"}
chart_df = pd.DataFrame(d).rename(columns=column_names)

def process(df):
    """Replace the millisecond ``timestamp`` column with a ``datetime`` string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``timestamp`` column of epoch milliseconds.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``timestamp`` and with a trailing ``datetime``
        column formatted as ``"YYYY-MM-DD HH:MM:SS"`` (UTC). The input frame
        is left unmodified.

    Notes
    -----
    The conversion is done with pandas in one vectorised step, so no exchange
    client has to be instantiated just to format timestamps.
    """
    # Truncate to whole milliseconds, mirroring the previous int(...) cast.
    epoch_ms = df["timestamp"].astype("int64")
    stamps = pd.to_datetime(epoch_ms, unit="ms")

    # drop() returns a new frame, so adding the column below does not touch `df`.
    out = df.drop(columns=["timestamp"])
    out["datetime"] = stamps.dt.strftime("%Y-%m-%d %H:%M:%S")
    return out

chart_df = process(chart_df)

# Parse the "datetime" strings once, vectorised, instead of calling
# pd.to_datetime on every row inside a Python loop.
parsed_datetimes = pd.to_datetime(chart_df["datetime"], format="%Y-%m-%d %H:%M:%S")

# Keep plain Python int lists so the new columns get the same integer dtype
# they had when the values were collected one by one.
hours = parsed_datetimes.dt.hour.tolist()
days = parsed_datetimes.dt.day.tolist()
months = parsed_datetimes.dt.month.tolist()
years = parsed_datetimes.dt.year.tolist()

chart_df["hours"] = hours
chart_df["days"] = days
chart_df["months"] = months
chart_df["years"] = years

  0%|          | 0/47346 [00:00<?, ?it/s]

In [9]:
# Array of the "datetime" strings, in row order; indexed by position below.
datetimes = chart_df["datetime"].values 

In [10]:
seq_len = 24           # number of consecutive closes in each input window
forecast_horizon = 6   # number of closes that follow the window and are predicted
date_chart = {}        # window-start datetime -> (past closes, future closes)

# Look the close-price array up once instead of twice per iteration.
close_prices = chart_df["close"].values

# NOTE(review): the window starting at index `n_windows` would still fit in
# the data; the count is kept as it was so the key set does not change.
# Confirm whether dropping that last window is intentional.
n_windows = len(datetimes) - seq_len - forecast_horizon

for i in tqdm(range(n_windows), position=0, leave=True):
    dt_obj = datetime.strptime(str(datetimes[i]), "%Y-%m-%d %H:%M:%S")
    split = i + seq_len
    date_chart[dt_obj] = (close_prices[i:split], close_prices[split:split + forecast_horizon])
    


  0%|          | 0/47316 [00:00<?, ?it/s]

In [62]:
past_cont_inputs, past_dates, future_cont_inputs, future_dates, targets = [], [], [], [], []

# Build one sample per window start that has an entry in the augmentation lookup.
for key, (past_closes, future_closes) in tqdm(date_chart.items(), position=0, leave=True):
    if key not in simcse_aug_saved:
        continue

    similar_dates = simcse_aug_saved[key]

    # Past inputs: column 0 is this window's own closes, the remaining columns
    # are the past closes of the similar windows. Transposed to (time, series).
    past_input = np.array([past_closes] + [date_chart[dt][0] for dt in similar_dates]).T
    # Future inputs: only the similar windows' future closes; this window's own
    # future closes are the prediction target.
    future_input = np.array([date_chart[dt][1] for dt in similar_dates]).T

    past_cont_inputs.append(past_input)
    future_cont_inputs.append(future_input)
    targets.append(future_closes)

    # Calendar features, one [month, day, hour] row per time step.
    # The previous code computed `key + timedelta(hours=1)` on every iteration,
    # so every past step after the first carried the same timestamp and the
    # future steps started at key+2h. Each step now advances by one hour from
    # the window start, and the future steps continue right after the past ones.
    n_past = len(past_closes)
    past_stamps = [key + timedelta(hours=step) for step in range(n_past)]
    past_dates.append(np.array([[ts.month, ts.day, ts.hour] for ts in past_stamps]))

    future_stamps = [key + timedelta(hours=n_past + step) for step in range(len(future_closes))]
    future_dates.append(np.array([[ts.month, ts.day, ts.hour] for ts in future_stamps]))


  0%|          | 0/47316 [00:00<?, ?it/s]

In [65]:
# Stack the per-sample lists built above into arrays whose first axis is the sample.
past_cont_inputs = np.array(past_cont_inputs)
past_dates = np.array(past_dates) 
future_cont_inputs = np.array(future_cont_inputs)  
future_dates = np.array(future_dates) 
targets = np.array(targets) 

# Show the shape of each array as a sanity check.
past_cont_inputs.shape, past_dates.shape, future_cont_inputs.shape, future_dates.shape, targets.shape

((46316, 24, 4), (46316, 24, 3), (46316, 6, 3), (46316, 6, 3), (46316, 6))

In [66]:
# Write the augmented arrays to the working directory. np.save appends the
# ".npy" extension, which is the name a later cell loads them back under.
np.save("aug_top3_simcse_past_cont_inputs", past_cont_inputs)
np.save("aug_top3_simcse_past_dates", past_dates) 
np.save("aug_top3_simcse_future_cont_inputs", future_cont_inputs) 
np.save("aug_top3_simcse_future_dates", future_dates) 
np.save("aug_top3_simcse_targets", targets) 

### Non-augmented data

In [79]:
# Read the raw chart rows from disk.
with open("BTC_USDT-1h-12.json") as f:
    d = json.load(f)

# The rows are positional, so give columns 0-5 their names.
column_names = {0:"timestamp", 1:"open", 2:"high", 3:"low", 4:"close", 5:"volume"}
chart_df = pd.DataFrame(d).rename(columns=column_names)

def process(df):
    """Replace the millisecond ``timestamp`` column with a ``datetime`` string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``timestamp`` column of epoch milliseconds.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``timestamp`` and with a trailing ``datetime``
        column formatted as ``"YYYY-MM-DD HH:MM:SS"`` (UTC). The input frame
        is left unmodified.

    Notes
    -----
    The conversion is done with pandas in one vectorised step, so no exchange
    client has to be instantiated just to format timestamps.
    """
    # Truncate to whole milliseconds, mirroring the previous int(...) cast.
    epoch_ms = df["timestamp"].astype("int64")
    stamps = pd.to_datetime(epoch_ms, unit="ms")

    # drop() returns a new frame, so adding the column below does not touch `df`.
    out = df.drop(columns=["timestamp"])
    out["datetime"] = stamps.dt.strftime("%Y-%m-%d %H:%M:%S")
    return out

chart_df = process(chart_df)

# Parse the "datetime" strings once, vectorised, instead of calling
# pd.to_datetime on every row inside a Python loop.
parsed_datetimes = pd.to_datetime(chart_df["datetime"], format="%Y-%m-%d %H:%M:%S")

# Keep plain Python int lists so the new columns get the same integer dtype
# they had when the values were collected one by one.
hours = parsed_datetimes.dt.hour.tolist()
days = parsed_datetimes.dt.day.tolist()
months = parsed_datetimes.dt.month.tolist()
years = parsed_datetimes.dt.year.tolist()

chart_df["hours"] = hours
chart_df["days"] = days
chart_df["months"] = months
chart_df["years"] = years

# Array of the "datetime" strings, in row order; indexed by position below.
datetimes = chart_df["datetime"].values

  0%|          | 0/47346 [00:00<?, ?it/s]

In [80]:
seq_len = 24           # number of consecutive closes in each input window
forecast_horizon = 6   # number of closes that follow the window and are predicted
date_chart = {}        # window-start datetime -> (past closes, future closes)

# Look the close-price array up once instead of twice per iteration.
close_prices = chart_df["close"].values

# NOTE(review): the window starting at index `n_windows` would still fit in
# the data; the count is kept as it was so the key set does not change.
# Confirm whether dropping that last window is intentional.
n_windows = len(datetimes) - seq_len - forecast_horizon

for i in tqdm(range(n_windows), position=0, leave=True):
    dt_obj = datetime.strptime(str(datetimes[i]), "%Y-%m-%d %H:%M:%S")
    split = i + seq_len
    date_chart[dt_obj] = (close_prices[i:split], close_prices[split:split + forecast_horizon])
    


  0%|          | 0/47316 [00:00<?, ?it/s]

In [85]:
past_cont_inputs, past_dates, future_dates, targets = [], [], [], []

# Build one sample per window start that has an entry in the augmentation
# lookup, so this set covers the same window starts as the augmented one.
for key, (past_closes, future_closes) in tqdm(date_chart.items(), position=0, leave=True):
    if key not in simcse_aug_saved:
        continue

    past_cont_inputs.append(past_closes)
    targets.append(future_closes)

    # Calendar features, one [month, day, hour] row per time step.
    # The previous code computed `key + timedelta(hours=1)` on every iteration,
    # so every past step after the first carried the same timestamp and the
    # future steps started at key+2h. Each step now advances by one hour from
    # the window start, and the future steps continue right after the past ones.
    n_past = len(past_closes)
    past_stamps = [key + timedelta(hours=step) for step in range(n_past)]
    past_dates.append(np.array([[ts.month, ts.day, ts.hour] for ts in past_stamps]))

    future_stamps = [key + timedelta(hours=n_past + step) for step in range(len(future_closes))]
    future_dates.append(np.array([[ts.month, ts.day, ts.hour] for ts in future_stamps]))
                

  0%|          | 0/47316 [00:00<?, ?it/s]

In [86]:
# Stack the per-sample lists into arrays. The past closes get an explicit
# trailing axis of size 1 via the reshape to (-1, 24, 1).
past_cont_inputs = np.array(past_cont_inputs).reshape((-1, 24, 1)) 
past_dates = np.array(past_dates) 
future_dates = np.array(future_dates) 
targets = np.array(targets) 

# Show the shape of each array as a sanity check.
past_cont_inputs.shape, past_dates.shape, future_dates.shape, targets.shape

((46316, 24, 1), (46316, 24, 3), (46316, 6, 3), (46316, 6))

In [102]:
# Write the non-augmented arrays to the working directory (np.save appends ".npy").
np.save("non_augmented_past_cont_inputs", past_cont_inputs) 
np.save("non_augmented_past_dates", past_dates) 
np.save("non_augmented_future_dates", future_dates) 
np.save("non_augmented_targets", targets) 

In [101]:
# Spot-check: show the past inputs and the targets of sample 11.
past_cont_inputs[11], targets[11]

(array([[4150.05],
        [4128.99],
        [4168.  ],
        [4154.  ],
        [4187.  ],
        [4147.01],
        [4149.99],
        [4129.99],
        [4184.98],
        [4150.  ],
        [4174.5 ],
        [4195.98],
        [4150.  ],
        [4008.01],
        [4032.87],
        [4100.  ],
        [4030.  ],
        [4053.15],
        [4090.1 ],
        [4129.98],
        [4063.01],
        [4145.3 ],
        [4220.01],
        [4247.  ]]),
 array([4158.  , 4146.01, 4143.02, 4200.01, 4158.41, 4140.38]))

In [99]:
# Load the saved augmented past inputs and targets back from disk for inspection.
xx = np.load("aug_top3_simcse_past_cont_inputs.npy")
xx_targets = np.load("aug_top3_simcse_targets.npy") 

In [100]:
# Spot-check: show the reloaded augmented past inputs and targets of sample 11.
xx[11], xx_targets[11]

(array([[4150.05, 3969.11, 4794.11, 3753.08],
        [4128.99, 3961.  , 4473.35, 3480.  ],
        [4168.  , 3971.02, 4696.44, 3630.06],
        [4154.  , 3915.52, 4697.71, 3630.  ],
        [4187.  , 3860.  , 4628.86, 3674.98],
        [4147.01, 3964.92, 4604.02, 3630.  ],
        [4149.99, 3936.28, 4625.56, 3567.11],
        [4129.99, 3932.02, 4630.01, 3650.  ],
        [4184.98, 3940.2 , 4604.97, 3617.01],
        [4150.  , 4000.  , 4618.39, 3681.  ],
        [4174.5 , 3969.01, 4483.94, 3664.  ],
        [4195.98, 4016.26, 4460.19, 3662.13],
        [4150.  , 3930.  , 4439.  , 3675.  ],
        [4008.01, 3946.37, 4530.01, 3710.  ],
        [4032.87, 3965.01, 4486.06, 3714.95],
        [4100.  , 3961.9 , 4509.77, 3670.  ],
        [4030.  , 3960.01, 4351.19, 3602.  ],
        [4053.15, 3960.  , 4299.53, 3640.02],
        [4090.1 , 3890.01, 4405.  , 3580.  ],
        [4129.98, 3842.04, 4472.14, 3560.01],
        [4063.01, 3848.01, 4600.53, 3557.75],
        [4145.3 , 3882.71, 4560.  