In [1]:
!pip install pyupbit
!pip install pyjwt

Collecting pyupbit
  Downloading pyupbit-0.2.33-py3-none-any.whl (24 kB)
Collecting websockets (from pyupbit)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: websockets, pyupbit
Successfully installed pyupbit-0.2.33 websockets-12.0


In [12]:
import yfinance as yf
from datetime import datetime, timedelta, timezone
import pyupbit
import pandas as pd
from xgboost import XGBClassifier
from tqdm.auto import tqdm
import json
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import lightgbm as lgb

In [13]:
chart_df = pyupbit.get_ohlcv("KRW-BTC", count=3000, interval="day")
chart_df.head()

Unnamed: 0,open,high,low,close,volume,value
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,560214600.0
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.78834,99507240.0
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,144827600.0
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,372186000.0
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,272455800.0


In [14]:
targets = []
close = chart_df["close"].values

for i in range(0, len(close)-1):
  diff = close[i+1] - close[i]
  if diff >= 0:
    targets.append(1)
  elif diff < 0:
    targets.append(0)

targets.append(None) # for the last row.
chart_df["targets"] = targets
chart_df.dropna(inplace=True)


In [15]:
# inference example
def preprocess_function(chart_df):
  days, months = [], []
  for dt in tqdm(chart_df.index):
    day = pd.to_datetime(dt).day
    month = pd.to_datetime(dt).month
    days.append(day)
    months.append(month)
  chart_df["day"] = days
  chart_df["month"] = months

  delta = chart_df["close"].diff()
  gain = (delta.where(delta > 0, 0)).fillna(0)
  loss = (-delta.where(delta < 0, 0)).fillna(0)
  avg_gain = gain.rolling(window=14).mean()
  avg_loss = loss.rolling(window=14).mean()
  rs = avg_gain / avg_loss
  chart_df['RSI'] = 100 - (100 / (1 + rs))

  chart_df['SMA_20'] = chart_df['close'].rolling(window=20).mean()
  chart_df['STD_20'] = chart_df['close'].rolling(window=20).std()
  chart_df['Upper_Bollinger'] = chart_df['SMA_20'] + (chart_df['STD_20'] * 2)
  chart_df['Lower_Bollinger'] = chart_df['SMA_20'] - (chart_df['STD_20'] * 2)
  short_ema = chart_df['close'].ewm(span=12, adjust=False).mean()
  long_ema = chart_df['close'].ewm(span=26, adjust=False).mean()
  chart_df['MACD'] = short_ema - long_ema
  chart_df['Signal'] = chart_df['MACD'].ewm(span=9, adjust=False).mean()
  low_14 = chart_df['low'].rolling(window=14).min()
  high_14 = chart_df['high'].rolling(window=14).max()
  chart_df['%K'] = 100 * ((chart_df['close'] - low_14) / (high_14 - low_14))
  chart_df['%D'] = chart_df['%K'].rolling(window=3).mean()

  for l in tqdm(range(1, 4), position=0, leave=True):
    for col in ["high", "low", "volume"]:
      val = chart_df[col].values
      val_ret = [None for _ in range(l)]
      for i in range(l, len(val)):
        if val[i-l] == 0:
          ret = 1
        else:
          ret = val[i] / val[i-l]
        val_ret.append(ret)
      chart_df["{}_change_{}".format(col, l)] = val_ret

  chart_df.dropna(inplace=True)
  return chart_df


In [16]:
processed_df = preprocess_function(chart_df)

  0%|          | 0/2352 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
columns = processed_df.columns

train_cols = []

for col in columns:
  if col != "targets":
    train_cols.append(col)

In [19]:
X = processed_df[train_cols]
y = processed_df["targets"]

In [20]:
# Create the LightGBM dataset
train_data = lgb.Dataset(X, label=y)

# Define the parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
gbm = lgb.train(params, train_data, num_boost_round=100)

# Save the model
gbm.save_model('lightgbm_model.txt')

[LightGBM] [Info] Number of positive: 1217, number of negative: 1116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6165
[LightGBM] [Info] Number of data points in the train set: 2333, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521646 -> initscore=0.086638
[LightGBM] [Info] Start training from score 0.086638


<lightgbm.basic.Booster at 0x7a717bbca5c0>

In [25]:
test_lgb = lgb.Booster(model_file="/content/lightgbm_model.txt")

y_pred = test_lgb.predict(X, num_iteration=test_lgb.best_iteration)

In [26]:
y_pred

array([0.48690783, 0.72025352, 0.37030449, ..., 0.25786104, 0.73564255,
       0.70140505])

In [24]:
# inference
1 - y_pred[-1], y_pred[-1]

(0.29859495289369065, 0.7014050471063094)

In [29]:
test_df = pyupbit.get_ohlcv("KRW-BTC", count=50, interval="day")

test_df = preprocess_function(test_df)

test_input = test_df.iloc[-2].values.reshape((1,-1))

test_lgb.predict(test_input, num_iteration=test_lgb.best_iteration)[0]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0.7014050471063094