In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory prefix (with trailing slash) prepended to every CSV file name read below.
data_folder = "/kaggle/input/g-research-crypto-forecasting/"

In [None]:
# Load the full competition training file into a DataFrame.
crypto_df = pd.read_csv(data_folder + 'train.csv')

In [None]:
# Preview the first 10 rows of the raw training frame.
crypto_df.head(10)

# Exploratory Data Analysis

The test data is contained in the original train data, so the LB score of 0.313 is inflated by overfitting.

For more information:
* __[Watch out!: test LB period is contained in the train csv](https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/285505) (topic)__
* __[G-Research- Using the overlap fully [LB=0.99]](https://www.kaggle.com/julian3833/g-research-using-the-overlap-fully-lb-0-99) (notebook)__
* __[Meaningful submission scores / sharing the lower boundary of public test data](https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/285289) (topic)__


So we need to check and exclude the test data from the train data

In [None]:
# Load the example test file; its timestamps are compared with the train data below.
test_df = pd.read_csv(data_folder + 'example_test.csv')
test_df.head(10)

In [None]:
# Before excluding test data: report the time span covered by the raw train
# frame and by the example test frame so the overlap is visible.
# Timestamps are Unix seconds, converted once per frame for display.
train_times = pd.to_datetime(crypto_df['timestamp'], unit='s')
test_times = pd.to_datetime(test_df['timestamp'], unit='s')

print ("ACTUAL TRAIN DATA")

# The earliest timestamp is where the train data *starts* (the original label said "ends to").
print ("Time stamp the train data set starts from: "+ str(train_times.min()))
print ("Time stamp the train data set ends to: "+ str(train_times.max()), end='\n\n')

print ("TEST DATA")

print ("Time stamp the test data set starts from: "+ str(test_times.min()))
print ("Time stamp the test data set ends to: "+ str(test_times.max()), end='\n\n')

This shows that the test data is part of the train data set.
So we need to exclude from the train dataset every row on or after the first test timestamp (2021-06-13 00:00:00).

In [None]:
# Keep only the rows strictly earlier than the first test timestamp, so that no
# row from the test period remains in the training frame.
first_test_timestamp = test_df['timestamp'].min()

# .copy() makes the result an independent frame: later cells modify
# crypto_df_train, and without the copy pandas emits SettingWithCopyWarning.
crypto_df_train = crypto_df[crypto_df['timestamp'] < first_test_timestamp].copy()

In [None]:
# Print the first and last timestamp of the raw train frame, the filtered train
# frame and the test frame, one titled section per frame.
for section_title, noun, frame in (
    ("ACTUAL TRAIN DATA", "actual train", crypto_df),
    ("DERIVED TRAIN DATA", "derived train", crypto_df_train),
    ("TEST DATA", "test", test_df),
):
    stamps = pd.to_datetime(frame['timestamp'], unit='s')
    print (section_title)
    print ("Time stamp the " + noun + " data set starts from: " + str(stamps.min()))
    print ("Time stamp the " + noun + " data set ends to: " + str(stamps.max()), end = '\n\n')


In [None]:
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.dates import date2num

In [None]:

# Draw a one-row timeline with three shaded spans: raw train, filtered train, test.
fig, ax = plt.subplots(figsize=(30,1))

# Plots a single point at 2021-06-12 — presumably to anchor the x-axis on dates; TODO confirm.
ax.plot(datetime(2021,6,12),1)
# NOTE(review): this span is hard-coded to 2021-06-12 .. 2021-06-14 rather than derived
# from crypto_df, so it shows a fixed window, not the real extent of the raw train data.
ax.axvspan(date2num(datetime(2021,6,12)), date2num(datetime(2021,6,14)), 
           label="ACTUAL TRAIN DATA",color="yellow", alpha=0.5)

# Filtered train data: from the same hard-coded start up to its last timestamp.
ax.axvspan(date2num(datetime(2021,6,12)), date2num(pd.to_datetime(crypto_df_train['timestamp'], unit='s').max()), 
           label="DERIVED TRAIN DATA",color="green", alpha=0.3)

# Test data: from its first to its last timestamp, drawn fully opaque.
ax.axvspan(date2num(pd.to_datetime(test_df['timestamp'], unit='s').min()), date2num(pd.to_datetime(test_df['timestamp'], unit='s').max()), 
           label="TEST DATA",color="red", alpha=1.0)

ax.legend()

ax.set_title('Data Distribution Over Time', size=18)


The **RED** band in the above graph is the test data; if we had trained on the whole train dataset, the model would have overfit to it.

# Data features
We can see the different features included in the dataset. Specifically, the features included per asset are the following:
*   **timestamp**: All timestamps are returned as second Unix timestamps (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiple of 60, indicating minute-by-minute data.
*   **Asset_ID**: The asset ID corresponding to one of the cryptocurrencies (e.g. `Asset_ID = 1` for Bitcoin). The mapping from `Asset_ID` to crypto asset is contained in `asset_details.csv`.
*   **Count**: Total number of trades in the time interval (last minute).
*   **Open**:	Opening price of the time interval (in USD).
*   **High**:	Highest price reached during time interval (in USD).
*   **Low**: Lowest price reached during time interval (in USD).
*   **Close**:	Closing price of the time interval (in USD).
*   **Volume**:	Quantity of asset bought or sold, displayed in base currency USD.
*   **VWAP**: The average price of the asset over the time interval, weighted by volume. VWAP is an aggregated form of trade data.
*   **Target**: Residual log-returns for the asset over a 15 minute horizon. 

The first two columns define the time and asset indexes for this data row. The 7 middle columns (Count, Open, High, Low, Close, Volume and VWAP) are feature columns with the trading data for this asset and minute in time. The last column is the prediction target, which we will get to later in more detail.

We also view the asset information, including the list of all assets, the `Asset_ID` to asset mapping, and the weight of each asset used to weigh their relative importance in the evaluation metric.

In [None]:
# Load the asset metadata file.
asset_details = pd.read_csv(data_folder + 'asset_details.csv')
# Display it ordered by Asset_ID. The sorted frame is only shown, not assigned,
# so asset_details itself keeps the row order of the CSV file.
asset_details.sort_values(by=['Asset_ID'])

# Dealing with missing data

Let us inspect the data for another important asset, Ethereum

In [None]:
#Example with Ethereum data
# Select the Ethereum rows of the filtered train frame and index them by timestamp.
eth = crypto_df_train[crypto_df_train["Asset_ID"]==6].set_index("timestamp") # Asset_ID = 6 for Ethereum
# Print column dtypes together with the non-null count of each column.
eth.info(show_counts =True)

We can see the number of rows in the training set, and that there are missing values in the target column, which we will address later. Let's confirm that:

In [None]:
# Count the missing values in each column.
eth.isna().sum()

In [None]:
# The index holds Unix timestamps in seconds; reinterpret the first and last
# values as datetime64[s] so they print as calendar dates.
beg_eth = eth.index[0].astype('datetime64[s]')
end_eth = eth.index[-1].astype('datetime64[s]')

print('Ethereum data goes from ', beg_eth, 'to ', end_eth)

Missing asset data, for a given minute, is not represented by NaN's, but instead by the absence of those rows. We can check the timestamp difference between consecutive rows to see if there is missing data.

In [None]:
# Difference between consecutive timestamps, tallied by value; any difference
# other than 60 indicates that rows (minutes) are missing.
(eth.index[1:]-eth.index[:-1]).value_counts().head()

Notice that there are many gaps in the data. To work with most time series models, we should preprocess our data into a format without time gaps. To fill the gaps, we can use the .reindex() method for forward filling, filling gaps with the previous valid value.

In [None]:
# Rebuild the index as every 60th value from the first to the last timestamp
# (inclusive); method='pad' fills each newly created row with the values of the
# closest preceding existing row.
eth = eth.reindex(range(eth.index[0],eth.index[-1]+60,60),method='pad')

In [None]:
# Same consecutive-difference tally as before, now on the reindexed frame.
(eth.index[1:]-eth.index[:-1]).value_counts().head()

In [None]:
# Dtypes and non-null counts after reindexing.
eth.info(show_counts =True)

In [None]:
# Missing values per column after reindexing.
eth.isna().sum()

# Candlestick charts

The trading data format is an aggregated form of market data including Open, High, Low and Close. We can visualize this data through the commonly used candlestick bar chart, which allows traders to perform technical analysis on intraday values. The bar's body length represents the price range between the open and close of that interval's trading. When the bar is red, it means the close was lower than the open, and green otherwise. These are also referred to as bearish (red) and bullish (green) candlesticks. The wicks above and below the bars show the high and low prices of that interval's trading.

We can visualize a slice of the Ethereum prices using the `plotly` library. The bottom part of the plot shows a rangeslider, which you can use to zoom in the plot.

In [None]:
import plotly.graph_objects as go
eth_mini = eth.iloc[-200:] # Select recent data rows
# One candlestick per row; the x values are the frame's index, i.e. raw Unix
# timestamps in seconds rather than calendar dates.
fig = go.Figure(data=[go.Candlestick(x=eth_mini.index, open=eth_mini['Open'], high=eth_mini['High'], low=eth_mini['Low'], close=eth_mini['Close'])])
fig.show()

# Log returns

In order to analyze price changes for an asset we can deal with the price difference. However, different assets exhibit different price scales, so that their returns are not readily comparable. We can solve this problem by computing the percentage change in price instead, also known as the return. This return coincides with the percentage change in our invested capital.

Returns are widely used in finance, however log returns are preferred for mathematical modelling of time series, as they are additive across time. Also, while regular returns cannot go below -100%, log returns are not bounded.

To compute the log return, we can simply take the logarithm of the ratio between two consecutive prices. The first row will have an empty return as the previous value is unknown, therefore the empty return data point will be dropped.

In [None]:
# define function to compute log returns
def log_return(series, periods=1):
    """Return the log return of `series` over `periods` steps.

    The natural logarithm of every value is taken first, then each entry is
    differenced against the entry `periods` positions earlier, which equals
    log(series[t] / series[t - periods]). The first `periods` entries of the
    result are NaN because they have no earlier value to compare with.
    """
    log_prices = np.log(series)
    return log_prices.diff(periods=periods)

In [None]:
# Example with Ethereum data
import scipy.stats as stats

# 1-step log returns of the ETH close price; the first entry is skipped because
# it has no previous price and is therefore NaN. The series is named on
# creation instead of being renamed in place afterwards.
lret_eth = log_return(eth_mini.Close)[1:].rename('lret_eth')

# Line plot of the returns over the selected rows.
plt.figure(figsize=(8,4))
plt.plot(lret_eth);
plt.show()

# Correlation between assets

In [None]:
# create dataframe with returns for all assets
# Each asset's return series is collected in a dict and concatenated once at the
# end, instead of growing the frame with a join on every iteration. Columns are
# named after the asset and appear in asset_details row order.
asset_returns = {}
for asset_id, asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
    asset = crypto_df_train[crypto_df_train["Asset_ID"]==asset_id].set_index("timestamp")
    # make the series gap-free: one row per 60-second step, forward-filled
    asset = asset.reindex(range(asset.index[0],asset.index[-1]+60,60),method='pad')
    # A missing close is left as NaN rather than replaced by 0: log(0) is -inf,
    # which would turn the asset's whole correlation row into NaN, whereas
    # NaN pairs are simply skipped by DataFrame.corr().
    asset_returns[asset_name] = log_return(asset.Close)[1:]

# Outer alignment on timestamp: assets that start later get NaN for earlier minutes.
all_assets = pd.concat(asset_returns, axis=1, join="outer").sort_index()

In [None]:
# Heat-map of the pairwise correlation between asset returns.
corr_matrix = all_assets.corr()

# Row/column i of the matrix is the i-th asset added to all_assets, i.e. the
# i-th row of asset_details. Ticks are therefore placed at positions 0..n-1;
# using the Asset_ID values as positions only labels the axes correctly when
# asset_details happens to be sorted by Asset_ID.
tick_positions = np.arange(len(corr_matrix))
asset_labels = asset_details.Asset_Name.values

plt.imshow(corr_matrix);
plt.yticks(tick_positions, asset_labels);
plt.xticks(tick_positions, asset_labels, rotation='vertical');
plt.colorbar();

This forecasting competition aims to predict returns in the near future for prices $P^a$, for each asset $a$. For each row in the dataset, we include the target for prediction, `Target`. `Target` is derived from log returns ($R^a$) over 15 minutes.

$$R^a(t) = \log \left( P^a(t+16)\ /\ P^a(t+1) \right)$$

Crypto asset returns are highly correlated, following to a large extent the overall crypto market. As we want to test your ability to predict returns for individual assets, we perform a linear residualization, removing the market signal from individual asset returns when creating the target. In more detail, if $M(t)$ is the weighted average market returns, the target is:

$$\begin{aligned}
M(t) &= \frac{\sum_a w^a R^a(t)}{\sum_a w^a} \\
\beta^a &= \frac{\langle M \cdot R^a \rangle}{\langle M^2 \rangle} \\
\text{Target}^a(t) &= R^a(t) - \beta^a M(t)
\end{aligned}$$

where the bracket $\langle .\rangle$ represent the rolling average over time (3750 minute windows), and same asset weights $w^a$ used for the evaluation metric.

Some rows have null values for targets due to missing values in future prices. Rows with nulls in the test set ground truth are ignored for scoring purposes.

In the competition, your predictions will be evaluated on a weighted version of the Pearson correlation coefficient, with weights given by the `Weight` column in the Asset Details file.

In this tutorial, we will simplify things and use correlation (without weights) for evaluation, and consider only two assets, BTC and ETH.

In [None]:
# Select the Bitcoin rows (Asset_ID = 1) of the filtered train frame, indexed by timestamp.
btc = crypto_df_train[crypto_df_train["Asset_ID"]==1].set_index("timestamp")
# Make the series gap-free (one row per 60-second step, forward-filled).
# The reindex is applied to btc itself: the original reindexed `asset`, the
# variable left over from the all-assets loop, so btc held another asset's data.
btc = btc.reindex(range(btc.index[0],btc.index[-1]+60,60),method='pad')

In [None]:
# Preview the first 10 rows of the BTC frame.
btc.head(10)

In [None]:
# Select some input features from the trading data:
# 5 min log return, abs(1 min log return), upper shadow, and lower shadow.
def upper_shadow(asset):
    """Distance from the bar's High down to the higher of its Close and Open."""
    body_top = np.maximum(asset.Close, asset.Open)
    return asset.High - body_top


def lower_shadow(asset):
    """Distance from the lower of the bar's Close and Open down to its Low."""
    body_bottom = np.minimum(asset.Close, asset.Open)
    return body_bottom - asset.Low

# Feature matrix for BTC, one column per feature, aligned on the btc index:
# 5-step VWAP log return, absolute 1-step VWAP log return, upper and lower shadow.
btc_feature_columns = [
    log_return(btc.VWAP, periods=5),
    log_return(btc.VWAP, periods=1).abs(),
    upper_shadow(btc),
    lower_shadow(btc),
]
X_btc = pd.concat(btc_feature_columns, axis=1)

# Prediction target supplied with the data.
y_btc = btc.Target

In [None]:
import time

# auxiliary function, from a "dd/mm/YYYY" date string to a Unix timestamp
def totimestamp(s):
    """Return the Unix timestamp (seconds) of midnight UTC on the date `s`.

    Parameters
    ----------
    s : str
        Date formatted as "%d/%m/%Y", e.g. "13/05/2021".

    Returns
    -------
    numpy.int32
        Seconds since 1970-01-01 00:00:00 UTC.

    Raises
    ------
    ValueError
        If `s` does not match the "%d/%m/%Y" format.

    The date is interpreted as UTC because the dataset's timestamps are UTC
    Unix seconds. The previous time.mktime-based version interpreted it in the
    machine's local timezone, which shifts the window bounds on any host whose
    timezone is not UTC.
    """
    import calendar  # local import: only needed by this helper

    return np.int32(calendar.timegm(time.strptime(s, "%d/%m/%Y")))

# select training and test periods (both bounds of each window are inclusive)
train_window = [totimestamp("01/01/2018"), totimestamp("12/05/2021")]
test_window = [totimestamp("13/05/2021"), totimestamp("12/06/2021")]

# label-based row selectors over the timestamp index
train_rows = slice(train_window[0], train_window[1])
test_rows = slice(test_window[0], test_window[1])

# divide data into train and test, compute X and y
# we aim to build simple regression models using a window_size of 1
# NaN's in both features and targets are replaced by zeros
X_btc_train = X_btc.loc[train_rows].fillna(0).to_numpy()
y_btc_train = y_btc.loc[train_rows].fillna(0).to_numpy()

X_btc_test = X_btc.loc[test_rows].fillna(0).to_numpy()
y_btc_test = y_btc.loc[test_rows].fillna(0).to_numpy()

We now standardize the input data. Standardization is the process of putting different variables on the same scale. In regression analysis, it is often crucial to standardize your independent variables or you may risk obtaining misleading results.

In [None]:
from sklearn.preprocessing import StandardScaler
# simple preprocessing of the data
# The scaler is fitted on the training features only; the same fitted scaler
# is then applied to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_btc_train)

X_btc_train_scaled = scaler.transform(X_btc_train)
X_btc_test_scaled = scaler.transform(X_btc_test)

# Baseline model: Linear Regression

We will try a simple Linear Regression model on the features we designed. Note that Linear Regression is not commonly used in time series analysis, especially with only one time step!

We compare two Linear Regression baselines: one that considers each asset independently, and one multiple-input model that models all assets together.

In [None]:
from sklearn.linear_model import LinearRegression

# implement basic ML baseline (one per asset)
# fit() returns the fitted estimator, so construction and fitting are chained
lr = LinearRegression().fit(X_btc_train_scaled, y_btc_train)

# predictions for the held-out test window
y_pred_lr_btc = lr.predict(X_btc_test_scaled)

In [None]:
# Pearson correlation between the predictions and the test-window targets
btc_test_corr = np.corrcoef(y_pred_lr_btc, y_btc_test)[0,1]
print('Test score for LR baseline: BTC', f"{btc_test_corr:.2f}")

We shall forecast one month (30 days) worth of data, based on the 180 days prior to the start of the forecasting period

Defining Class errors having all the evaluation metrics

In [None]:
# Preview the first 5 rows of the example test frame.
test_df.head(5)

# LGBM pipeline

Ref:

**Credits:**
The following notebook is heavily based on the following notebooks. If you find it useful, spare some upvotes to the originals. They earned it!

* __[G-Research: LGBM pipeline Notebook](https://www.kaggle.com/julian3833/g-research-starter-lgbm-pipeline) (notebook)__

## Training with LGBM

In [None]:
from lightgbm import LGBMRegressor
import gresearch_crypto

### Hyperparameter Tuning

After hyperparameter tuning in Google Colab (with more resources), I obtained the following best parameters for training:

- {'**learning_rate**': 0.04945819653484207, 
    '**boosting_type**': 'dart', 
    'objective': 'regression', 
    'metric': 'mae', 
    'sub_feature': 0.12567008013847558, 
    '**num_leaves**': 160, 
    'min_data': 93, 
    '**max_depth**': 136} 

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return High minus the larger of Close and Open.

    Works element-wise on a DataFrame (giving a Series) or on a single row.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open, minus Low.

    Works element-wise on a DataFrame (giving a Series) or on a single row.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# A utility function to build features from the original df.
# It works for single rows too, so it can be reused at prediction time.
def get_features(df):
    """Return the model's input features for `df`.

    Takes a copy of the seven raw trading columns, appends the
    'Upper_Shadow' and 'Lower_Shadow' features computed from them, and
    replaces every missing value with 0. `df` itself is not modified.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    features = df[raw_columns].copy()
    features['Upper_Shadow'] = upper_shadow(features)
    features['Lower_Shadow'] = lower_shadow(features)
    return features.fillna(0)

def get_Xy_and_model_for_asset(df_train, asset_id):
    """Build the training set for one asset and fit an LGBM regressor on it.

    Parameters
    ----------
    df_train : DataFrame with an 'Asset_ID' column, a 'Target' column and the
        raw trading columns read by get_features.
    asset_id : value matched against the 'Asset_ID' column.

    Returns
    -------
    (X, y, model) : the feature frame, the target series and the fitted model.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    training_frame = get_features(asset_rows)
    training_frame['y'] = asset_rows['Target']
    # Features were already zero-filled by get_features, so this only removes
    # rows whose target is missing.
    training_frame = training_frame.dropna(how="any")

    X = training_frame.drop(columns="y")
    # No missing targets remain after the dropna above, so no fill is needed.
    y = training_frame["y"]

    # TODO: Try different models here!
    lgbm_params = {
        'boosting_type': 'dart',
        'num_leaves': 160,
        'max_depth': 136,
        'learning_rate': 0.04945819653484207,
        'n_estimators': 100,
    }
    model = LGBMRegressor(**lgbm_params)
    model.fit(X, y)
    return X, y, model

In [None]:
# Restrict the training frame to rows from year 2021 with month > 5, and index
# the result by the raw Unix timestamp.

# If this cell has already run, 'timestamp' is the index; restore it as a column
# first so the cell can be re-executed without a KeyError.
_train_rows = (crypto_df_train.reset_index()
               if crypto_df_train.index.name == 'timestamp'
               else crypto_df_train)

# The calendar view of the timestamps is kept in a standalone Series rather than
# written into the frame as a temporary column, which avoids mutating a filtered
# frame (SettingWithCopyWarning) and leaves no helper column behind.
_row_times = pd.to_datetime(_train_rows['timestamp'], unit='s')
_in_period = (_row_times.dt.year == 2021) & (_row_times.dt.month > 5)

crypto_df_train = _train_rows[_in_period].set_index('timestamp')

In [None]:
# Display the filtered, timestamp-indexed training frame.
crypto_df_train

In [None]:
# Feature frames, target series and fitted models, each keyed by Asset_ID.
Xs, ys, models = {}, {}, {}

# Fit one model per asset listed in asset_details.
for asset_id, asset_name in zip(asset_details['Asset_ID'], asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    Xs[asset_id], ys[asset_id], models[asset_id] = get_Xy_and_model_for_asset(crypto_df_train, asset_id)

In [None]:
# Show the second row of the training frame as a Series.
crypto_df_train.iloc[1]

In [None]:
# Check the model interface
# Build the feature vector for a single row and wrap it in a list so the model
# receives a batch of one sample.
# NOTE(review): the model used is the one for Asset_ID 0, while the row at
# position 1 is not checked to belong to that asset — fine as a smoke test only.
x = get_features(crypto_df_train.iloc[1])
y_pred = models[0].predict([x])
y_pred[0]

In [None]:
# Every test batch received from the competition API is collected here.
all_df_test = []

# Create the competition environment and its test-batch iterator.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows and the prediction frame to fill in.
for i, (df_test, df_pred) in enumerate(iter_test):
    # Predict row by row, using the model trained for the row's asset.
    for j , row in df_test.iterrows():
        
        model = models[row['Asset_ID']]
        x_test = get_features(row)
        y_pred = model.predict([x_test])[0]
        
        # Write the prediction into the row of df_pred that has the same row_id.
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
        
        
        # Print just one sample row to get a feeling of what it looks like
        # (j is the row's index label in df_test, so this fires only if that label is 0)
        if i == 0 and j == 0:
            display(x_test)

    # Display the first prediction dataframe
    if i == 0:
        display(df_pred)
    all_df_test.append(df_test)

    # Send submissions
    # Called once per batch, after all of the batch's rows have been filled in.
    env.predict(df_pred)