In [8]:
import numpy as np 
import pandas as pd 
import random 
import os 
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt 
from tqdm import tqdm
import time
import datetime
import pybit
import ccxt
import telegram 
import math
import seaborn as sns
import torch 
import torch.nn as nn 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

### for tabnet ### 
from pytorch_tabnet.tab_model import TabNetClassifier

### optuna ### 
import optuna
from optuna import Trial, visualization

In [19]:
eth2019 = pd.read_csv("2019_ETHUSDT.csv") 
eth2020 = pd.read_csv("2020_ETHUSDT.csv")  
eth2021 = pd.read_csv("2021_ETHUSDT.csv") 
eth2022 = pd.read_csv("2022_ETHUSDT.csv") 

df = pd.concat([eth2019,eth2020,eth2021,eth2022], axis=0) 
df.index = np.arange(df.shape[0]) 

df = df.drop(columns={'0'})
df = df[df['Volume'] > 0] # get rid of rows with zero volume 

In [20]:
def create_timestamps(df, ccxt_bybit): 
    dates = df['Open Time'].values 
    ccxt_bybit = ccxt.bybit() 
    timestamp = [] 
    for i in range(len(dates)): 
        date_string = ccxt_bybit.iso8601(int(dates[i]))  
        date_string = date_string[:10] + " " + date_string[11:-5] 
        timestamp.append(date_string)  
    df['timestamp'] = timestamp 
    df['timestamp'] = pd.to_datetime(df['timestamp']) 
    return df 

In [21]:
def rsi_calc(df, period): 
    df = df['Close'].astype(float) 
    delta = df.diff() 
    gains, declines = delta.copy(), delta.copy() 
    gains[gains < 0] = 0 
    declines[declines > 0] = 0 
    _gains = gains.ewm(com=(period-1), min_periods=period).mean() 
    _loss = declines.abs().ewm(com=(period-1), min_periods=period).mean() 
    RS = _gains / _loss
    return pd.Series((100 - (100 / (1+RS)))/100, name="Scaled_RSI")

In [22]:
def preprocess_df(df, ccxt_bybit): 
    df = create_timestamps(df, ccxt_bybit)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['Hour'] = df['timestamp'].apply(lambda x: x.hour)
    df['Weekday'] = df['timestamp'].apply(lambda x: x.weekday())
    df['Day'] = df['timestamp'].apply(lambda x: x.day)
    df['Month'] = df['timestamp'].apply(lambda x: x.month)
    
    df['vwap'] = (df['Volume'] * (df['High'] + df['Low'])/2).cumsum() / df['Volume'].cumsum() 
    vwap = df['vwap'].values 
    vwap_ratio = [None] 
    for i in range(1, len(vwap)): 
        ratio = vwap[i] / vwap[i-1] 
        vwap_ratio.append(ratio) 
    df['vwap_ratio'] = vwap_ratio 

    df['RSI'] = rsi_calc(df, period=14)
    
    eps = 1e-10 ### to avoid division by zero for Volume calculations ### 
    
    for window in [5,10,20,30,50,100]: 
        df['close_ma{}'.format(window)] = df['Close'].rolling(window).mean() 
        df['volume_ma{}'.format(window)] = df['Volume'].rolling(window).mean()
        df['close_ma{}_ratio'.format(window)] = (df['Close'] - df['close_ma{}'.format(window)]) / df['close_ma{}'.format(window)] 
        df['volume_ma{}_ratio'.format(window)] = (df['Volume'] - df['volume_ma{}'.format(window)]) / (df['volume_ma{}'.format(window)]+eps)
    
    open_prices = df['Open'].values 
    close_prices = df['Close'].values 
    volumes = df['Volume'].values 
    
    ### strides and differencing ###
    stride_colnames = [] 
    strides = 121
    for i in range(2, strides): 
        df['close_{}apartclose_ratio'.format(i)] = np.zeros((df.shape[0])) 
        stride_colnames.append('close_{}apartclose_ratio'.format(i)) 
    for i in tqdm(range(strides,df.shape[0]), position=0, leave=True):
        ret = [] 
        for j in range(2,strides):
            ratio = close_prices[i] / close_prices[i-j] 
            df['close_{}apartclose_ratio'.format(j)].iloc[i] = ratio  
            
    for i in range(2, strides):  
        df['volume_{}apartvolume_ratio'.format(i)] = np.zeros((df.shape[0])) 
        stride_colnames.append('volume_{}apartvolume_ratio'.format(i)) 
    for i in tqdm(range(strides,df.shape[0]), position=0, leave=True):
        ret = [] 
        for j in range(2,strides): 
            ratio = volumes[i] / volumes[i-j] 
            df['volume_{}apartvolume_ratio'.format(j)].iloc[i] = ratio 
        
    labels = [] 
    for i in range(len(close_prices)-1): 
        ret = close_prices[i+1] / close_prices[i] 
        if ret > 1.0:  
            labels.append(1) 
        elif ret <= 1.0: 
            labels.append(0) 
    labels.append(None)
    df['Labels'] = labels 
    
    df['high_close_ratio'] = (df['High'].values - df['Close'].values) / df['Close'].values 
    df['low_close_ratio'] = (df['Low'].values - df['Close'].values) / df['Close'].values  
    
    close_lastclose_ratio = [None] 
    for i in range(1, len(close_prices)): 
        ratio = close_prices[i] / close_prices[i-1] 
        close_lastclose_ratio.append(ratio) 
    df['close_lastclose_ratio'] = close_lastclose_ratio 
    
    volume_lastvolume_ratio = [None] 
    for i in range(1, len(volumes)):
        ratio = volumes[i] / (volumes[i-1]+eps) 
        volume_lastvolume_ratio.append(ratio) 
    df['volume_lastvolume_ratio'] = volume_lastvolume_ratio
    
    cols = ["Hour",
            "Day", 
            "Month",
            "Weekday", 
            "close_ma5_ratio", 
            "volume_ma5_ratio",
            "close_ma10_ratio", 
            "volume_ma10_ratio",
            "close_ma20_ratio", 
            "volume_ma20_ratio", 
            "close_ma30_ratio", 
            "volume_ma30_ratio", 
            "close_ma50_ratio", 
            "volume_ma50_ratio",
            "close_ma100_ratio",
            "volume_ma100_ratio", 
            "high_close_ratio",
            "low_close_ratio", 
            "close_lastclose_ratio", 
            "volume_lastvolume_ratio",
            "vwap_ratio", 
            "Labels"] 
    
    cols = cols + stride_colnames 
    df = df[cols] # specify feature columns  
    return df

ccxt_bybit = ccxt.bybit() 

df = preprocess_df(df, ccxt_bybit) 

df = df.dropna() 

df.to_csv("feature_engineered_larger_ethusdt.csv",index=False)

df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 26857/26857 [10:19<00:00, 43.33it/s]
100%|██████████| 26857/26857 [11:27<00:00, 39.09it/s]


Unnamed: 0,Hour,Day,Month,Weekday,close_ma5_ratio,volume_ma5_ratio,close_ma10_ratio,volume_ma10_ratio,close_ma20_ratio,volume_ma20_ratio,...,volume_111apartvolume_ratio,volume_112apartvolume_ratio,volume_113apartvolume_ratio,volume_114apartvolume_ratio,volume_115apartvolume_ratio,volume_116apartvolume_ratio,volume_117apartvolume_ratio,volume_118apartvolume_ratio,volume_119apartvolume_ratio,volume_120apartvolume_ratio
99,2,5,1,5,0.01317,-0.311798,0.033473,-0.090164,0.042926,-0.044944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,3,5,1,5,0.006966,-0.523102,0.027567,-0.400415,0.042274,-0.352542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,4,5,1,5,-0.003023,-0.478648,0.017314,-0.371902,0.03657,-0.274155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,4,5,1,5,-0.002034,-0.286171,0.011617,-0.385466,0.034279,-0.273873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,5,5,1,5,0.002237,-0.170133,0.011619,-0.470016,0.036758,-0.368733,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
df = pd.read_csv("feature_engineered_larger_ethusdt.csv")

In [3]:
df = df.iloc[22:,:]

In [4]:
cols = df.columns 
train_cols, targets = [], [] 
for col in cols: 
    if col == "Labels": 
        targets.append(col)
    else: 
        train_cols.append(col) 

In [5]:
X = df[train_cols] 
Y = df[targets] 

X = X.values
Y = Y.values.reshape((-1)) 

X.shape, Y.shape

((26856, 259), (26856,))

In [11]:
def Objective(trial): 
    mask_type = trial.suggest_categorical('mask_type', ["entmax", "sparsemax"]) 
    n_da = trial.suggest_int("n_da", 8, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 3, 10, step=1) 
    gamma = trial.suggest_float("gamma", 1.0, 2.0, step = 0.2) 
    n_shared = trial.suggest_int("n_shared", 1, 5) 
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True) 
    
    tabnet_params = dict(n_d = n_da, 
                         n_a = n_da, 
                         n_steps = n_steps, 
                         gamma = gamma, 
                         lambda_sparse = lambda_sparse, 
                         optimizer_fn = torch.optim.Adam,
                         optimizer_params = dict(lr=2e-2, weight_decay=1e-5), 
                         mask_type = mask_type,
                         n_shared = n_shared, 
                         scheduler_params = dict(mode="min", 
                                                 patience = trial.suggest_int("patienceScheduler", low = 3, high = 10),
                                                 min_lr = 1e-5,
                                                 factor = 0.5),
                         scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau, 
                         verbose = 1) 
    
    
    kf = KFold(n_splits = 5, random_state = 42, shuffle = True)
    cv_score_array = [] 
    for (train_idx, val_idx) in kf.split(X): 
        X_train, X_val = X[train_idx], X[val_idx] 
        Y_train, Y_val = Y[train_idx], Y[val_idx] 
        clf = TabNetClassifier(**tabnet_params) 
        clf.fit(X_train = X_train, 
                y_train = Y_train, 
                eval_set = [(X_val, Y_val)], 
                patience = trial.suggest_int("patience", low=20, high=60), max_epochs = trial.suggest_int('epochs',100,1000), 
                eval_metric = ["auc", "accuracy"], 
                drop_last = True) 
        cv_score_array.append(clf.best_cost) 
    avg = np.mean(cv_score_array) 
    return avg 

In [12]:
study = optuna.create_study(direction = "maximize", study_name = "TabNet_optimization_clf_2") 

study.optimize(Objective, timeout = 6*60)

[32m[I 2022-01-10 14:28:08,038][0m A new study created in memory with name: TabNet_optimization_clf_2[0m


Device used : cuda
epoch 0  | loss: 0.92402 | val_0_auc: 0.48995 | val_0_accuracy: 0.50242 |  0:00:08s
epoch 1  | loss: 0.71754 | val_0_auc: 0.497   | val_0_accuracy: 0.49721 |  0:00:16s
epoch 2  | loss: 0.69343 | val_0_auc: 0.50164 | val_0_accuracy: 0.50354 |  0:00:24s
epoch 3  | loss: 0.69425 | val_0_auc: 0.49786 | val_0_accuracy: 0.50112 |  0:00:32s
epoch 4  | loss: 0.69174 | val_0_auc: 0.50469 | val_0_accuracy: 0.50335 |  0:00:40s
epoch 5  | loss: 0.68944 | val_0_auc: 0.5034  | val_0_accuracy: 0.49814 |  0:00:48s
epoch 6  | loss: 0.68927 | val_0_auc: 0.50415 | val_0_accuracy: 0.50819 |  0:00:56s
epoch 7  | loss: 0.68867 | val_0_auc: 0.49842 | val_0_accuracy: 0.49684 |  0:01:03s
epoch 8  | loss: 0.68765 | val_0_auc: 0.50264 | val_0_accuracy: 0.49274 |  0:01:11s
epoch 9  | loss: 0.68707 | val_0_auc: 0.50098 | val_0_accuracy: 0.4959  |  0:01:19s
epoch 10 | loss: 0.68687 | val_0_auc: 0.50611 | val_0_accuracy: 0.50633 |  0:01:26s
epoch 11 | loss: 0.68622 | val_0_auc: 0.50378 | val_0_acc

epoch 98 | loss: 0.68108 | val_0_auc: 0.54937 | val_0_accuracy: 0.53816 |  0:12:42s


KeyboardInterrupt: 

In [None]:
TabNet_params = study.best_params 
TabNet_params 

In [6]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.1, random_state = 888) 

X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

((24170, 259), (2686, 259), (24170,), (2686,))