TA-Lib is not available in this environment, so the technical indicators are implemented by hand in the functions below.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import gresearch_crypto

## Load Data

In [None]:
# Locations of the competition input files on Kaggle.
# Per-asset metadata table:
asset_details_csv = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
# Training data:
train_csv = '/kaggle/input/g-research-crypto-forecasting/train.csv'

In [None]:
# Read the whole training CSV into a single DataFrame, then preview its first rows.
df_train = pd.read_csv(train_csv)
df_train.head()

In [None]:
# Print a concise summary (columns, dtypes, memory usage) of the training frame.
df_train.info()

In [None]:
# Read the asset metadata table and print a concise summary of it.
df_asset_details = pd.read_csv(asset_details_csv)
df_asset_details.info()

In [None]:
# Order the metadata by Asset_ID; later cells iterate over this column,
# so the per-asset dictionaries are built in ascending Asset_ID order.
df_asset_details = df_asset_details.sort_values("Asset_ID")
df_asset_details

## Missing Data



In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Split the training data into one frame per asset, keyed by Asset_ID.
# Rows containing any missing value are discarded for each asset.
asset_dict = {}
for asset_id in df_asset_details['Asset_ID']:
    is_this_asset = df_train['Asset_ID'] == asset_id
    asset_dict[asset_id] = df_train.loc[is_this_asset].dropna()

In [None]:
# Sanity check: after dropna() every per-asset frame should report zero missing values.
for key, frame in asset_dict.items():
    n_missing = frame.isna().sum().sum()
    print('Missing Values for Asset '+str(key)+':', n_missing)

In [None]:
# Report how many rows remain for each asset after dropping incomplete rows.
for key, frame in asset_dict.items():
    n_rows = frame.shape[0]
    print('Observations for Asset '+str(key)+':', n_rows)

## Feature Engineering

The tutorial notebook has two suggested engineered features: upper_shadow and lower_shadow

The upper shadow is the distance from the high to the maximum of the open and close prices, and the lower shadow is the distance from the minimum of the open and close prices to the low, as in a candlestick chart.

We are also going to engineer a daily_range feature, which is the difference between the high and low.

In [None]:
def upper_shadow(df):
    """Return the upper candlestick shadow: High minus the larger of Open and Close.

    `df` must provide 'High', 'Open' and 'Close' columns; the result is
    computed element-wise, one value per row.
    """
    body_top = np.maximum(df['Open'], df['Close'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the lower candlestick shadow: the smaller of Open and Close minus Low.

    `df` must provide 'Low', 'Open' and 'Close' columns; the result is
    computed element-wise, one value per row.
    """
    body_bottom = np.minimum(df['Open'], df['Close'])
    return body_bottom - df['Low']

def daily_range(df):
    """Return the bar range: High minus Low, one value per row of `df`."""
    high, low = df['High'], df['Low']
    return high - low

We are going to use technical indicators, and I am going to start with Stochastic Indicator, RSI, ATR, and MACD. These are commonly used indicators, because they have been shown to be reliably useful. The group has two momentum indicators, a volatility indicator, and a trend indicator. Each of these has a hyperparameter that will need to be optimized.

In [None]:
# Take the frame stored under key 0 of asset_dict as a scratch frame for
# trying out the indicator functions, and preview its first rows.
test_df = asset_dict[0]
test_df.head()

In [None]:
def stochastic_indicator(high, low, close, window=14, smooth_window=3):
    """Compute the stochastic oscillator (%K) and its smoothed signal (%D).

    Args:
        high, low, close: pandas Series of prices sharing one index.
        window: look-back length for the rolling highest high / lowest low.
        smooth_window: length of the simple moving average applied to %K.

    Returns:
        Tuple ``(k, d)``: ``k`` is %K rounded to whole numbers, ``d`` is the
        rolling mean of ``k``. Leading entries are NaN until the respective
        rolling windows are full.
    """
    highest_high = high.rolling(window).max()
    lowest_low = low.rolling(window).min()

    # Position of the close inside the recent high/low range, as a percentage.
    percent_k = round((close - lowest_low) / (highest_high - lowest_low) * 100)
    percent_d = percent_k.rolling(smooth_window).mean()
    return percent_k, percent_d
        

def add_stochastic_to_df(df, window=14, smooth_window=3):
    """Add 'STOCH' and 'STOCH_signal' columns computed from High/Low/Close.

    The columns are written into `df` itself (the argument is modified in
    place) and a copy of the resulting frame is returned.
    """
    stoch, stoch_signal = stochastic_indicator(
        df['High'], df['Low'], df['Close'],
        window=window, smooth_window=smooth_window,
    )
    df['STOCH'], df['STOCH_signal'] = stoch, stoch_signal
    return df.copy()

In [None]:
def rsi(close, window=14):
    """Relative Strength Index of a price series (simple-moving-average form).

    RSI = 100 * avg_gain / (avg_gain + avg_loss), which is algebraically the
    same as 100 - 100 / (1 + avg_gain / avg_loss).

    Gains and losses are zero-filled, so each average is taken over every bar
    in the window. NaN-filling them would average only over the up-bars (or
    down-bars), and would yield NaN instead of 100 whenever a window
    contained no losing bar.

    Args:
        close: pandas Series of closing prices.
        window: number of bars in the rolling averages.

    Returns:
        pandas Series aligned with `close`. The first value is NaN (no price
        change is defined yet) and a window with no price movement at all
        gives NaN (0 / 0). Averages use ``min_periods=1``, so values are
        produced before the window is completely filled.
    """
    delta = close.diff()

    # clip() leaves the leading NaN of diff() untouched, so the first bar
    # contributes to neither average.
    gains = delta.clip(lower=0)
    losses = (-delta).clip(lower=0)

    avg_gain = gains.rolling(window, min_periods=1).mean()
    avg_loss = losses.rolling(window, min_periods=1).mean()

    return 100 * avg_gain / (avg_gain + avg_loss)
    

def add_rsi_to_df(df, window=14):
    """Add an 'RSI' column computed from the 'Close' column.

    The column is written into `df` itself (the argument is modified in
    place) and a copy of the resulting frame is returned.
    """
    rsi_values = rsi(df['Close'], window=window)
    df['RSI'] = rsi_values
    return df.copy()

In [None]:
def atr(high, low, close, window=14):
    """Average True Range with Wilder's smoothing.

    The true range of a bar is the largest of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|``; the first bar
    has no previous close and uses ``high - low``. The true range is then
    smoothed recursively:

        ATR[0] = TR[0]
        ATR[t] = (ATR[t-1] * (window - 1) + TR[t]) / window

    The recursion is evaluated as an exponentially weighted mean with
    ``alpha = 1 / window`` and ``adjust=False``, which is the same formula.
    Each step therefore uses the previous *smoothed* value; building the
    result in a list comprehension reads the previous *raw* true range
    instead and never applies the recursion.

    Args:
        high, low, close: equal-length pandas Series (or array-likes) of prices.
        window: smoothing length.

    Returns:
        1-D float numpy array with one ATR value per input bar; an empty
        array when the inputs are empty.
    """
    high = np.asarray(high, dtype=float)
    low = np.asarray(low, dtype=float)
    close = np.asarray(close, dtype=float)

    if high.size == 0:
        return np.array([], dtype=float)

    # `high - low` allocates a new array, so updating it in place below does
    # not touch the caller's data.
    true_range = high - low
    prev_close = close[:-1]
    true_range[1:] = np.maximum(
        true_range[1:],
        np.maximum(np.abs(high[1:] - prev_close), np.abs(low[1:] - prev_close)),
    )

    smoothed = pd.Series(true_range).ewm(alpha=1.0 / window, adjust=False).mean()
    return smoothed.to_numpy()

def add_atr_to_df(df, window=14):
    """Add an 'ATR' column computed from the High/Low/Close columns.

    The column is written into `df` itself (the argument is modified in
    place) and a copy of the resulting frame is returned.
    """
    atr_values = atr(df['High'], df['Low'], df['Close'], window=window)
    df['ATR'] = atr_values
    return df.copy()

In [None]:
def macd(close, window_slow=26, window_fast=12, window_signal=9):
    """Moving Average Convergence/Divergence line and its signal line.

    The MACD line is the fast EMA minus the slow EMA, so it is positive while
    price is trending up (the fast average sits above the slow one).
    Subtracting in the opposite order gives the same magnitude with the sign
    inverted relative to the conventional indicator.

    Args:
        close: pandas Series of closing prices.
        window_slow: span of the slow EMA.
        window_fast: span of the fast EMA.
        window_signal: span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line)`` of pandas Series aligned with `close`.
    """
    slow_ema = close.ewm(span=window_slow, adjust=False).mean()
    fast_ema = close.ewm(span=window_fast, adjust=False).mean()
    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=window_signal, adjust=False).mean()
    return macd_line, signal_line

def add_macd_to_df(df, window_slow=26, window_fast=12, window_signal=9):
    """Add 'MACD' and 'MACD_signal' columns computed from the 'Close' column.

    The columns are written into `df` itself (the argument is modified in
    place) and a copy of the resulting frame is returned.
    """
    macd_line, macd_signal = macd(df['Close'], window_slow, window_fast, window_signal)
    df['MACD'], df['MACD_signal'] = macd_line, macd_signal
    return df.copy()

Now we need a function that processes the data to add the features and separate the targets.

In [None]:
def get_features(df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
                 MACD_window_slow, MACD_window_fast, MACD_window_sign):
    """Build the model feature matrix (no target column) from raw OHLCV data.

    Starts from the Open/High/Low/Close/Count/Volume/VWAP columns of `df`,
    appends the candlestick features and the four technical indicators, and
    drops every row that still contains a NaN. `df` itself is not modified;
    a new frame is returned.
    """
    base_columns = ['Open', 'High', 'Low', 'Close', 'Count', 'Volume', 'VWAP']
    frame = df[base_columns].copy()

    # Candlestick-shape features.
    frame['upper_shadow'] = upper_shadow(frame)
    frame['lower_shadow'] = lower_shadow(frame)
    frame['daily_range'] = daily_range(frame)

    # Technical indicators, each parameterised by its window hyperparameter(s).
    frame = (
        frame
        .pipe(add_stochastic_to_df, STOCH_window, STOCH_smooth_window)
        .pipe(add_rsi_to_df, RSI_window)
        .pipe(add_atr_to_df, ATR_window)
        .pipe(add_macd_to_df, MACD_window_slow, MACD_window_fast, MACD_window_sign)
    )
    return frame.dropna().copy()

In [None]:
def get_features_and_target(df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
                            MACD_window_slow, MACD_window_fast, MACD_window_sign):
    """Build the feature matrix together with the 'Target' column.

    Starts from the Open/High/Low/Close/Count/Volume/VWAP/Target columns of
    `df`, appends the candlestick features and the four technical indicators,
    and drops every row that still contains a NaN. `df` itself is not
    modified; a new frame is returned.
    """
    base_columns = ['Open', 'High', 'Low', 'Close', 'Count', 'Volume', 'VWAP', 'Target']
    frame = df[base_columns].copy()

    # Candlestick-shape features.
    frame['upper_shadow'] = upper_shadow(frame)
    frame['lower_shadow'] = lower_shadow(frame)
    frame['daily_range'] = daily_range(frame)

    # Technical indicators, each parameterised by its window hyperparameter(s).
    frame = (
        frame
        .pipe(add_stochastic_to_df, STOCH_window, STOCH_smooth_window)
        .pipe(add_rsi_to_df, RSI_window)
        .pipe(add_atr_to_df, ATR_window)
        .pipe(add_macd_to_df, MACD_window_slow, MACD_window_fast, MACD_window_sign)
    )
    return frame.dropna().copy()

In [None]:
def separate_xy(df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
                MACD_window_slow, MACD_window_fast, MACD_window_sign):
    """Return ``(X, y)``: engineered features and the matching 'Target' values.

    Infinite values in `df` are treated as missing and the affected rows are
    removed before the features are computed.
    """
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna()
    feats = get_features_and_target(
        cleaned, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
        MACD_window_slow, MACD_window_fast, MACD_window_sign,
    ).copy()
    X = feats.drop(columns='Target').copy()
    y = feats['Target'].copy()
    return X, y

In [None]:
def binary_target(y):
    """Map each target to its direction: 1 where the value is > 0, otherwise -1.

    Zero and NaN both map to -1, because neither compares greater than zero.
    The mapping is vectorised with numpy rather than looping over the values
    in Python, which matters for per-asset series with millions of rows.

    Args:
        y: array-like of numeric targets (list, numpy array or pandas Series).

    Returns:
        Integer numpy array with the same length as `y`.
    """
    return np.where(np.asarray(y) > 0, 1, -1)

In [None]:
#def fit_linreg(X,y):
#    model = LinearRegression()
#    model.fit(X,y)
#    return model

In [None]:
def fit_lgbm(X,y):
    """Fit an LGBMRegressor with default settings on (X, y) and return the fitted model."""
    return LGBMRegressor().fit(X, y)

## Set Hyperparameters for Indicators

In [None]:
## hyperparameters for technical indicators

# Stochastic oscillator: look-back window and smoothing window of its signal.
STOCH_window, STOCH_smooth_window = 14, 3

# RSI and ATR look-back windows.
RSI_window = 14
ATR_window = 14

# MACD: slow EMA span, fast EMA span, signal-line EMA span.
MACD_window_slow, MACD_window_fast, MACD_window_sign = 21, 12, 9

## Fit Linear Regression Models

We want to cycle through the assets in asset_dict and fit a linear regression model to each one.

In [None]:
#Xs = {}
#ys = {}
#models = {}

#for asset_id in asset_dict.keys():
#    print(asset_id, "Fitting Model for ", df_asset_details.loc[df_asset_details['Asset_ID']==asset_id, "Asset_Name"].values)
#    asset_df = asset_dict[asset_id]
#    X, y = separate_xy(asset_df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window, 
#                       MACD_window_slow, MACD_window_fast, MACD_window_sign)
#    model = fit_linreg(X,y)
#    
#    Xs[asset_id] = X
#    ys[asset_id] = y
#    models[asset_id] = model

## Fit GBM Models

Same as the LinearRegression() models, but using LGBMRegressor().

In [None]:
# Per-asset containers: feature matrices, raw targets, mean absolute target
# (used later to scale predictions) and the fitted models.
Xs, ys, avg_changes, models = {}, {}, {}, {}

for asset_id, asset_df in asset_dict.items():
    asset_name = df_asset_details.loc[df_asset_details['Asset_ID']==asset_id, "Asset_Name"].values
    print(asset_id, "Fitting Model for ", asset_name)

    X, y = separate_xy(asset_df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
                       MACD_window_slow, MACD_window_fast, MACD_window_sign)
    Xs[asset_id] = X
    ys[asset_id] = y

    # Typical magnitude of the raw target for this asset.
    avg_changes[asset_id] = np.mean(abs(y))

    # The model is trained on the direction (+1 / -1) of the target only.
    y = binary_target(y)
    model = fit_lgbm(X,y)
    models[asset_id] = model

In [None]:
# Display the mean absolute Target stored per asset by the training loop.
avg_changes

## Get Predictions and Test Submission

In [None]:
# Iterate over the competition's test batches, predict a Target for every
# row and hand each completed prediction frame back to the environment.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():
        asset_id = row['Asset_ID']
        model = models[asset_id]
        avg_change = avg_changes[asset_id]

        # Training history of this asset strictly before the test row.
        df = asset_dict[asset_id]
        df = df.loc[df['timestamp'] < row['timestamp'], :]

        # Append the test row (infinite values treated as missing) so the
        # rolling indicators can be evaluated at its timestamp.
        # DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame is the supported equivalent. infer_objects() restores numeric
        # dtypes in case the row Series came through as object dtype.
        new_row = row.replace([np.inf, -np.inf], np.nan).dropna()
        df = pd.concat([df, new_row.to_frame().T.infer_objects()])

        X_test = get_features(df, STOCH_window, STOCH_smooth_window, RSI_window, ATR_window,
                            MACD_window_slow, MACD_window_fast, MACD_window_sign)

        # NOTE: get_features drops rows containing NaN, so if the appended
        # row has an incomplete feature vector, tail(1) is the most recent
        # complete row of the history instead.
        y_pred = model.predict(X_test.tail(1))

        # The model outputs a direction score; scale it by the asset's mean
        # absolute target to obtain a value on the target's scale.
        y_pred_scaled = y_pred * avg_change

        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred_scaled

        # Show the features once, for the first row of the first batch.
        if i==0 and j==0:
            display(X_test)

    # Show the first prediction frame as a sanity check.
    if i==0:
        display(df_pred)

    env.predict(df_pred)