In [1]:
!pip install pyupbit
!pip install pyjwt

Collecting pyupbit
  Downloading pyupbit-0.2.33-py3-none-any.whl (24 kB)
Collecting websockets (from pyupbit)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: websockets, pyupbit
Successfully installed pyupbit-0.2.33 websockets-12.0


In [3]:
import yfinance as yf
from datetime import datetime, timedelta, timezone
import pyupbit
import pandas as pd
from xgboost import XGBClassifier
from tqdm.auto import tqdm
import json
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

In [4]:
# Request up to 3000 daily KRW-BTC candles.
chart_df = pyupbit.get_ohlcv("KRW-BTC", count=3000, interval="day")

# Stop here with a clear message if the request produced nothing, instead of
# failing later with an unrelated-looking AttributeError.
if chart_df is None or chart_df.empty:
  raise RuntimeError("pyupbit.get_ohlcv returned no data for KRW-BTC")

chart_df.head()

Unnamed: 0,open,high,low,close,volume,value
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,560214600.0
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.78834,99507240.0
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,144827600.0
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,372186000.0
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,272455800.0


In [5]:
# (rows, columns) of the downloaded candles; may be fewer rows than requested.
chart_df.shape

(2353, 6)

In [6]:
# Label each day except the last one by comparing it with the following close:
# 1 when the next close is greater than or equal to today's, 0 when it is lower.
close = chart_df["close"].values
targets = []

for today, tomorrow in zip(close[:-1], close[1:]):
  change = tomorrow - today
  if change < 0:
    targets.append(0)
  elif change >= 0:
    targets.append(1)

In [7]:
# One label fewer than there are candles: the last day has no next close.
len(targets)

2352

In [8]:
# Pad the label list with a missing value so it has one entry per candle.
# NOTE(review): not idempotent — running this cell twice appends a second None.
targets.append(None) # for the last row.

In [9]:
# Attach the labels as a new column (the list must match the frame's length).
chart_df["targets"] = targets

In [10]:
# Remove, in place, every row that has a missing value — at minimum the final
# row, whose label is the None padding.
chart_df.dropna(inplace=True)

In [11]:
# Display the labelled frame.
chart_df

Unnamed: 0,open,high,low,close,volume,value,targets
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,5.602146e+08,0.0
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.788340,9.950724e+07,1.0
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,1.448276e+08,0.0
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,3.721860e+08,1.0
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,2.724558e+08,1.0
...,...,...,...,...,...,...,...
2024-02-28 09:00:00,78621000.0,88424000.0,78082000.0,87634000.0,19254.097473,1.605842e+12,0.0
2024-02-29 09:00:00,87695000.0,90000000.0,85244000.0,85910000.0,14269.126844,1.247892e+12,1.0
2024-03-01 09:00:00,85911000.0,88500000.0,85910000.0,87397000.0,6256.971936,5.457934e+11,0.0
2024-03-02 09:00:00,87397000.0,87724000.0,86090000.0,86383000.0,5481.662532,4.758362e+11,1.0


In [12]:
# data preprocessing
# Calendar features are read off the index in one vectorized step instead of
# converting every timestamp individually inside a Python loop.
timestamps = pd.to_datetime(chart_df.index)
days = timestamps.day.astype("int64").values      # day of month
months = timestamps.month.astype("int64").values  # month number

chart_df["day"] = days
chart_df["month"] = months

chart_df.head()

  0%|          | 0/2352 [00:00<?, ?it/s]

Unnamed: 0,open,high,low,close,volume,value,targets,day,month
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,560214600.0,0.0,25,9
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.78834,99507240.0,1.0,26,9
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,144827600.0,0.0,27,9
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,372186000.0,1.0,28,9
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,272455800.0,1.0,29,9


In [13]:
# add technical indicators
# Columns are added in a fixed order (RSI, Bollinger, MACD, stochastic) because
# the feature matrix built later keeps the frame's column order.
close_series = chart_df["close"]
RSI_WINDOW, BAND_WINDOW, STOCH_WINDOW = 14, 20, 14

# --- RSI: average gain versus average loss over a simple rolling window ---
delta = close_series.diff()
gain = delta.where(delta > 0, 0).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=RSI_WINDOW).mean()
avg_loss = loss.rolling(window=RSI_WINDOW).mean()
rs = avg_gain / avg_loss
chart_df["RSI"] = 100 - (100 / (1 + rs))

# --- Bollinger bands: rolling mean +/- two rolling standard deviations ---
band_roll = close_series.rolling(window=BAND_WINDOW)
chart_df["SMA_20"] = band_roll.mean()
chart_df["STD_20"] = band_roll.std()
band_width = chart_df["STD_20"] * 2
chart_df["Upper_Bollinger"] = chart_df["SMA_20"] + band_width
chart_df["Lower_Bollinger"] = chart_df["SMA_20"] - band_width

# --- MACD: fast EMA minus slow EMA, plus an EMA of that difference ---
short_ema = close_series.ewm(span=12, adjust=False).mean()
long_ema = close_series.ewm(span=26, adjust=False).mean()
chart_df["MACD"] = short_ema - long_ema
chart_df["Signal"] = chart_df["MACD"].ewm(span=9, adjust=False).mean()

# --- Stochastic oscillator: close position inside the recent low/high range ---
low_14 = chart_df["low"].rolling(window=STOCH_WINDOW).min()
high_14 = chart_df["high"].rolling(window=STOCH_WINDOW).max()
chart_df["%K"] = 100 * ((close_series - low_14) / (high_14 - low_14))
chart_df["%D"] = chart_df["%K"].rolling(window=3).mean()

In [15]:
### addition of recent differenced features ###
# For lags of 1, 2 and 3 rows, store each value divided by the value `lag` rows
# earlier. Computed with a vectorized shift rather than element-by-element.
for lag in range(1, 4):
  for col in ["high", "low", "volume"]:
    current = chart_df[col]
    previous = current.shift(lag)
    # A zero denominator maps to a neutral ratio of 1. The first `lag` rows
    # have no earlier value, so they stay NaN and are dropped later.
    ratio = (current / previous).where(previous != 0, 1.0)
    chart_df["{}_change_{}".format(col, lag)] = ratio

  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Remove, in place, the rows where any feature is still missing (the rolling
# windows and lagged ratios are undefined for the earliest rows).
chart_df.dropna(inplace=True)

In [17]:
# Display the frame with all engineered feature columns.
chart_df

Unnamed: 0,open,high,low,close,volume,value,targets,day,month,RSI,...,%D,high_change_1,low_change_1,volume_change_1,high_change_2,low_change_2,volume_change_2,high_change_3,low_change_3,volume_change_3
2017-10-14 09:00:00,6427000.0,6664000.0,6322000.0,6664000.0,3.352825,2.171336e+07,0.0,14,10,93.592903,...,87.584014,0.958849,1.028135,0.124984,1.045005,1.146328,0.160282,1.188938,1.159787,0.795894
2017-10-15 09:00:00,6610000.0,6700000.0,6142000.0,6381000.0,7.630783,4.860637e+07,1.0,15,10,81.603563,...,79.506001,1.005402,0.971528,2.275927,0.964029,0.998862,0.284454,1.050651,1.113690,0.364790
2017-10-16 09:00:00,6403000.0,6527000.0,6282000.0,6491000.0,22.926635,1.462712e+08,0.0,16,10,81.980803,...,79.776643,0.974179,1.022794,3.004493,0.979442,0.993673,6.838006,0.939137,1.021630,0.854640
2017-10-17 09:00:00,6495000.0,6510000.0,6299000.0,6457000.0,4.241337,2.726041e+07,0.0,17,10,82.958057,...,76.297335,0.997395,1.002706,0.184996,0.971642,1.025562,0.555819,0.976891,0.996362,1.265004
2017-10-18 09:00:00,6413000.0,6489000.0,5876000.0,6391000.0,36.001119,2.192088e+08,1.0,18,10,83.068081,...,76.453171,0.996774,0.932846,8.488153,0.994178,0.935371,1.570275,0.968507,0.956692,4.717880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-28 09:00:00,78621000.0,88424000.0,78082000.0,87634000.0,19254.097473,1.605842e+12,0.0,28,2,89.209733,...,96.148455,1.116704,1.047321,1.893473,1.178987,1.115457,3.129461,1.235058,1.101189,10.310489
2024-02-29 09:00:00,87695000.0,90000000.0,85244000.0,85910000.0,14269.126844,1.247892e+12,1.0,29,2,81.997992,...,90.401869,1.017823,1.091724,0.741096,1.136608,1.143386,1.403244,1.200000,1.217771,2.319230
2024-03-01 09:00:00,85911000.0,88500000.0,85910000.0,87397000.0,6256.971936,5.457934e+11,0.0,1,3,82.466511,...,87.667853,0.983333,1.007813,0.438497,1.000859,1.100254,0.324968,1.117664,1.152319,0.615319
2024-03-02 09:00:00,87397000.0,87724000.0,86090000.0,86383000.0,5481.662532,4.758362e+11,1.0,2,3,81.032970,...,83.038072,0.991232,1.002095,0.876089,0.974711,1.009924,0.384162,0.992084,1.102559,0.284701


In [18]:
# Every column except the label is used as a model input, in frame order.
columns = chart_df.columns
train_cols = [name for name in columns if name != "targets"]

In [19]:
# Feature matrix (all non-label columns) and label vector.
X = chart_df[train_cols]
y = chart_df["targets"]

In [20]:
# Gradient-boosted classifier evaluated with log loss.
xgb_clf = XGBClassifier(eval_metric="logloss")

# Fit on every available row; no hold-out split is made in the cells shown here.
xgb_clf.fit(X, y)

In [22]:
# Predictions on the same rows the model was fitted on (in-sample), so this is
# a sanity check rather than a measure of predictive quality.
xgb_clf.predict(X)

array([0, 1, 0, ..., 0, 1, 1])

In [24]:
# save model
# Written to a relative path, i.e. into the current working directory.
xgb_clf.save_model("xgb_clf_mainlanding")



In [25]:
# load and infer
# Read the model back from the same relative path it was saved to. A hard-coded
# absolute "/content/..." path only resolves when the notebook happens to run
# with that directory as its working directory.
loaded_model = XGBClassifier()
loaded_model.load_model("xgb_clf_mainlanding")

In [27]:
# Round-trip check: the reloaded model is run on the same feature matrix as the
# in-memory one so the two prediction arrays can be compared.
loaded_model.predict(X)

array([0, 1, 0, ..., 0, 1, 1])

In [35]:
# inference example
def preprocess_function(chart_df):
  """Build the model's feature columns from an OHLCV frame.

  The input is expected to have a datetime-like index and the columns
  "high", "low", "close" and "volume" (other columns are passed through).
  The following columns are appended, in this order:
  day, month, RSI, SMA_20, STD_20, Upper_Bollinger, Lower_Bollinger, MACD,
  Signal, %K, %D, then {high,low,volume}_change_{1,2,3}.

  Rows containing any missing value (the warm-up rows of the rolling windows
  and lagged ratios) are removed from the result.

  Args:
    chart_df: pandas DataFrame of candles. It is NOT modified; the work is
      done on a copy.

  Returns:
    A new DataFrame holding the original columns plus the feature columns.
  """
  # Work on a copy so the caller's frame keeps its columns and rows.
  features = chart_df.copy()

  # Calendar features, read off the index in one vectorized step.
  timestamps = pd.to_datetime(features.index)
  features["day"] = timestamps.day.astype("int64").values
  features["month"] = timestamps.month.astype("int64").values

  # RSI from simple 14-row rolling means of the gains and losses.
  delta = features["close"].diff()
  gain = (delta.where(delta > 0, 0)).fillna(0)
  loss = (-delta.where(delta < 0, 0)).fillna(0)
  avg_gain = gain.rolling(window=14).mean()
  avg_loss = loss.rolling(window=14).mean()
  rs = avg_gain / avg_loss
  features['RSI'] = 100 - (100 / (1 + rs))

  # Bollinger bands: 20-row mean +/- two standard deviations.
  features['SMA_20'] = features['close'].rolling(window=20).mean()
  features['STD_20'] = features['close'].rolling(window=20).std()
  features['Upper_Bollinger'] = features['SMA_20'] + (features['STD_20'] * 2)
  features['Lower_Bollinger'] = features['SMA_20'] - (features['STD_20'] * 2)

  # MACD (12/26 EMA difference) and its 9-span signal line.
  short_ema = features['close'].ewm(span=12, adjust=False).mean()
  long_ema = features['close'].ewm(span=26, adjust=False).mean()
  features['MACD'] = short_ema - long_ema
  features['Signal'] = features['MACD'].ewm(span=9, adjust=False).mean()

  # Stochastic oscillator over the last 14 rows, smoothed over 3.
  low_14 = features['low'].rolling(window=14).min()
  high_14 = features['high'].rolling(window=14).max()
  features['%K'] = 100 * ((features['close'] - low_14) / (high_14 - low_14))
  features['%D'] = features['%K'].rolling(window=3).mean()

  # Ratio of each value to the value `lag` rows earlier.
  for lag in range(1, 4):
    for col in ["high", "low", "volume"]:
      current = features[col]
      previous = current.shift(lag)
      # Zero denominator -> neutral ratio of 1; the first `lag` rows have no
      # earlier value and stay NaN until the final dropna.
      ratio = (current / previous).where(previous != 0, 1.0)
      features["{}_change_{}".format(col, lag)] = ratio

  return features.dropna()

In [36]:
# Fetch 50 recent daily candles; the longest feature window used by
# preprocess_function is 20 rows, so this leaves rows after the warm-up.
test_df = pyupbit.get_ohlcv("KRW-BTC", count=50, interval="day")

In [37]:
# Replace the raw candles with the feature frame returned by the helper.
test_df = preprocess_function(test_df)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [38]:
# Take the second-to-last row as a single-sample (1, n_features) array.
# NOTE(review): presumably -2 skips the most recent, possibly still-forming
# candle — confirm this is the intended row.
test_input = test_df.iloc[[-2]].to_numpy()

In [41]:
# Class probabilities for the single input row; [0] unwraps the one-row result.
# Presumably ordered as (class 0, class 1), where class 1 was labelled as
# "next close >= current close" — verify against the model's classes_.
xgb_prob = loaded_model.predict_proba(test_input)[0]
xgb_prob

array([0.07067788, 0.9293221 ], dtype=float32)