In [46]:
import numpy as np 
import pandas as pd 
import json 
import ccxt 
import seaborn as sns
import os 
import pandas_ta as ta 
import time
from datetime import datetime, timedelta
import math
from tqdm.auto import tqdm 
import matplotlib.pyplot as plt 
from transformers import * 
import torch 
from torch import Tensor 
from torch.utils.data import * 
import torch.nn as nn 
import torch.nn.functional as F 
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from pytorch_metric_learning import miners, losses
from pytorch_metric_learning.distances import CosineSimilarity
from scipy.spatial.distance import cdist 
import random 
import warnings 
warnings.filterwarnings("ignore") 
import pickle 
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier  
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder
from ts2vec import TS2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
def preprocess_seq_data(chart_df, threshold=0.0075, lookback=16): 
    targets = [] 
    openv = chart_df["open"].values 
    close = chart_df["close"].values 
    high = chart_df["high"].values 
    low = chart_df["low"].values  
    volume = chart_df["volume"].values 
    
    for i in range(close.shape[0]-1):
        high_vol = (high[i+1] - close[i]) / close[i] 
        low_vol = (low[i+1] - close[i]) / close[i] 
        if high_vol >= threshold: 
            targets.append(0) 
        elif low_vol <= -threshold:
            targets.append(1) 
        else:
            targets.append(2) 
    targets.append(None) 
    chart_df["Targets"] = targets 
    
    chart_df.set_index(pd.DatetimeIndex(chart_df["datetime"]), inplace=True)
    chart_df["bop"] = chart_df.ta.bop(lookahead=False) 
    chart_df["cmf"] = chart_df.ta.cmf(lookahead=False) 
    
    chart_df["high/low"] = chart_df["high"] / chart_df["low"] 
    chart_df["high/open"] = chart_df["high"] / chart_df["open"] 
    chart_df["low/open"] = chart_df["low"] / chart_df["open"] 
    chart_df["close/open"] = chart_df["close"] / chart_df["open"] 
    chart_df["high/close"] = chart_df["high"] / chart_df["close"] 
    chart_df["low/close"] = chart_df["low"] / chart_df["close"]     
    
    for l in range(1, lookback): 
        for col in ["open", "high", "low", "close", "volume"]:
            val = chart_df[col].values 
            val_ret = [None for _ in range(l)]
            for i in range(l, len(val)):
                if val[i-l] == 0: 
                    ret = 1 
                else:
                    ret = val[i] / val[i-l]  
                val_ret.append(ret) 
            chart_df["{}_change_{}".format(col, l)] = val_ret 

    chart_df.dropna(inplace=True) 
    chart_df.drop(columns={"open", "high", "low", "close", "volume"}, inplace=True) 
    return chart_df 


In [3]:
accuracies, f1s = [], [] 

for lookback in tqdm(range(16, 17)): 
    df = pd.read_csv("updated.csv") 
    df = preprocess_seq_data(df, threshold=0.0075, lookback=lookback)
    train_columns = []
    for col in df.columns:
        if col not in ["Targets", "datetime", "years"]:
            train_columns.append(col)  

    X = df[train_columns] 
    Y = df["Targets"] 
    
    train_size = int(df.shape[0] * 0.8) 
    val_size = int(df.shape[0] * 0.1) 

    X_train = X.iloc[:train_size] 
    Y_train = Y.iloc[:train_size] 

    X_val = X.iloc[train_size:train_size+val_size] 
    Y_val = Y.iloc[train_size:train_size+val_size] 

    X_test = X.iloc[train_size+val_size:] 
    Y_test = Y.iloc[train_size+val_size:] 
    
    print(X_train.shape, X_val.shape, X_test.shape)
        
    d = compute_class_weight(class_weight="balanced", classes=np.unique(Y_train), y=Y_train) 
    clf = XGBClassifier(silent=False, 
                        n_estimators=200,
                        class_weight=d, 
                        metric="logloss",
                        tree_method="gpu_hist", 
                        max_depth=3)
    clf.fit(X_train, 
            Y_train, 
            eval_set=[(X_val, Y_val)],
            eval_metric="auc",
            verbose=0)

    Y_pred = clf.predict(X_test)
    cnt = 0 
    for i in range(len(Y_pred)): 
        if Y_pred[i] == Y_test[i]: 
            cnt += 1 

    accuracy = cnt / len(Y_pred) * 100
    f1 = f1_score(Y_test, Y_pred, average="macro") 

    print(f"lookback: {lookback}")
    print(f"accuracy : {accuracy}% | Macro F1 : {f1}")  
    print("="*100) 

  0%|          | 0/1 [00:00<?, ?it/s]

(9608, 86) (1201, 86) (1202, 86)
Parameters: { "class_weight", "metric", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


lookback: 16
accuracy : 54.0765391014975% | Macro F1 : 0.4915711345122425


In [4]:
clf.save_model("XGBoost_54_49") 

In [5]:
# example load 

clf_load = XGBClassifier() 
clf_load.load_model("XGBoost_54_49") 