In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Kaggle input directory with the competition CSV files. The trailing slash matters:
# file names are appended to it by plain string concatenation.
data_folder = "../input/g-research-crypto-forecasting/"

# Load the full training set and preview its first ten rows.
train_csv_path = f"{data_folder}train.csv"
crypto_df = pd.read_csv(train_csv_path)
crypto_df.head(10)

In [None]:
# Asset information: the list of all assets plus the weight of each asset, which
# sets its relative importance in the evaluation metric.
asset_details_path = f"{data_folder}asset_details.csv"
asset_details = pd.read_csv(asset_details_path)
asset_details

In [None]:
import plotly.graph_objects as go

# Candlestick chart for the most recent slice of the Bitcoin data.
is_btc = crypto_df["Asset_ID"] == 1  # Asset_ID = 1 for Bitcoin
btc = crypto_df.loc[is_btc].set_index("timestamp")
btc_mini = btc.tail(200)  # the 200 most recent rows

candles = go.Candlestick(
    x=btc_mini.index,
    open=btc_mini['Open'],
    high=btc_mini['High'],
    low=btc_mini['Low'],
    close=btc_mini['Close'],
)
fig = go.Figure(data=[candles])
fig.show()

In [None]:
# Inspect the Ethereum rows: column dtypes and non-null counts.
is_eth = crypto_df["Asset_ID"] == 6  # Asset_ID = 6 for Ethereum
eth = crypto_df.loc[is_eth].set_index("timestamp")
eth.info(show_counts=True)

# The non-null counts suggest that some Target values are missing.

In [None]:
# Time range covered by the Bitcoin and Ethereum data. The integer timestamps in the
# index are interpreted as seconds and converted to datetimes for display.
beg_btc, end_btc = (btc.index[pos].astype('datetime64[s]') for pos in (0, -1))
beg_eth, end_eth = (eth.index[pos].astype('datetime64[s]') for pos in (0, -1))

print('BTC data goes from ', beg_btc, 'to ', end_btc)
print('Ethereum data goes from ', beg_eth, 'to ', end_eth)

# When an asset has no data for a given minute, the row is simply absent rather than
# filled with NaN's. Comparing the timestamps of consecutive rows therefore reveals
# whether data is missing.

In [None]:
# Spacing (in seconds) between consecutive Ethereum rows, most frequent values first.
eth_spacing = eth.index[1:] - eth.index[:-1]
eth_spacing.value_counts().head()

# There are many gaps in the data, so it should be preprocessed into a format
# without time gaps.

In [None]:
# Fix the time gaps: reindex onto a complete 60-second grid from the first to the
# last timestamp, carrying the previous row forward into every missing minute.
eth_minute_grid = range(eth.index[0], eth.index[-1] + 60, 60)
eth = eth.reindex(eth_minute_grid, method='pad')

# Confirm that consecutive rows are now evenly spaced.
(eth.index[1:] - eth.index[:-1]).value_counts().head()

In [None]:
import matplotlib.pyplot as plt

# Close-price history of the two selected assets, side by side.

# Fill the missing minutes for BTC on a complete 60-second grid (forward fill).
btc = btc.reindex(range(btc.index[0], btc.index[-1] + 60, 60), method='pad')

f = plt.figure(figsize=(15,4))

# Left panel: Bitcoin.
ax = f.add_subplot(121)
ax.plot(btc['Close'], label='BTC')
ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Bitcoin')

# Right panel: Ethereum.
ax2 = f.add_subplot(122)
ax2.plot(eth['Close'], color='red', label='ETH')
ax2.legend()
ax2.set_xlabel('Time')
ax2.set_ylabel('Ethereum')

plt.tight_layout()
plt.show()

# The two assets have quite different histories; the next step is to check whether
# they correlate in recent times.

In [None]:
import time

def totimestamp(s):
    """Convert a 'DD/MM/YYYY' date string to Unix epoch seconds at 00:00 UTC.

    The date is parsed as UTC explicitly, so the result does not depend on the
    timezone of the machine running the notebook (``time.mktime`` would shift
    every window by the local UTC offset).

    Parameters
    ----------
    s : str
        Calendar date formatted as day/month/year, e.g. '01/06/2021'.

    Returns
    -------
    numpy.int64
        Seconds since the Unix epoch; 64-bit so dates past January 2038 do not overflow.

    Raises
    ------
    ValueError
        If ``s`` does not match the 'DD/MM/YYYY' format.
    """
    # Appending an explicit "+0000" offset makes strptime return a timezone-aware
    # UTC datetime using only the already-imported `datetime` class.
    midnight_utc = datetime.strptime(s + " +0000", "%d/%m/%Y %z")
    return np.int64(midnight_utc.timestamp())

# Restrict both assets to the same interval: 01/06/2021 through 01/07/2021
# (label-based slicing, so both end points are included).
mini_2021_window = slice(totimestamp('01/06/2021'), totimestamp('01/07/2021'))
btc_mini_2021 = btc.loc[mini_2021_window]
eth_mini_2021 = eth.loc[mini_2021_window]

# Close prices of the two assets, stacked vertically.
f = plt.figure(figsize=(7,8))

ax = f.add_subplot(211)
ax.plot(btc_mini_2021['Close'], label='btc')
ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Bitcoin Close')

ax2 = f.add_subplot(212)
ax2.plot(eth_mini_2021['Close'], color='red', label='eth')
ax2.legend()
ax2.set_xlabel('Time')
ax2.set_ylabel('Ethereum Close')

plt.tight_layout()
plt.show()

# On shorter intervals some potential correlation between the two assets is visible,
# with some simultaneous ups and downs.
# Asset returns are a better format for analysing such movements.

In [None]:
def log_return(series, periods=1):
    """Return the log return of `series` over `periods` rows.

    Computed as log(series) minus log(series) shifted by `periods` rows, so the
    first `periods` entries of the result are NaN.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

import scipy.stats as stats

# One-row log returns of the Close price for both assets over the short interval.
# The first element is dropped because a difference has no predecessor there (NaN).
lret_btc = log_return(btc_mini_2021.Close).iloc[1:].rename('lret_btc')
lret_eth = log_return(eth_mini_2021.Close).iloc[1:].rename('lret_eth')

# Overlay both return series on a single axis.
plt.figure(figsize=(8,4))
plt.plot(lret_btc);
plt.plot(lret_eth);
plt.show()

In [None]:
# Correlation between Bitcoin and Ethereum over the whole history.

# Log returns of both assets, placed side by side in a single DataFrame.
lret_btc_long = log_return(btc.Close).iloc[1:].rename('lret_btc')
lret_eth_long = log_return(eth.Close).iloc[1:].rename('lret_eth')
two_assets = pd.concat([lret_btc_long, lret_eth_long], axis=1)

# Bucket rows by timestamp // (10000*60) seconds, i.e. up to 10000 one-minute rows
# per bucket, compute the correlation matrix inside each bucket, then pick the
# BTC-vs-ETH entry of every matrix.
bucket_id = two_assets.index // (10000*60)
bucket_corr = two_assets.groupby(bucket_id).corr()
corr_time = bucket_corr.loc[:, "lret_btc"].loc[:, "lret_eth"]

corr_time.plot();
plt.xticks([])
plt.ylabel("Correlation")
plt.title("Correlation between BTC and ETH over time");

# The correlation between the assets is high but variable: the dynamics change over
# time, so forecasts have to be made in a highly non-stationary environment.

In [None]:
# Correlation matrix of the one-minute log returns of all assets in 2021.
# Some assets have a much higher pairwise correlation than others.

window_start = totimestamp('01/01/2021')
window_end = totimestamp('21/09/2021')

# Collect one return series per asset, keyed by asset name, and build the frame once
# at the end (instead of joining inside the loop). Columns are therefore named after
# the assets and appear in the row order of `asset_details`.
returns_by_asset = {}
for asset_id, asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
    asset = crypto_df[crypto_df["Asset_ID"]==asset_id].set_index("timestamp")
    asset = asset.loc[window_start:window_end]
    if asset.empty:
        # No rows for this asset in the window; asset.index[0] below would raise.
        continue
    # Complete 60-second grid, forward-filling the missing minutes.
    asset = asset.reindex(range(asset.index[0],asset.index[-1]+60,60),method='pad')
    # Missing Close values are deliberately NOT replaced by 0 here: log(0) is -inf and
    # would corrupt the returns. NaN's propagate instead and are ignored by .corr().
    returns_by_asset[asset_name] = log_return(asset.Close).iloc[1:]

# Series with different time spans are aligned on the union of their timestamps.
all_assets_2021_returns = pd.DataFrame(returns_by_asset)

corr_matrix = all_assets_2021_returns.corr()

# Row/column i of the matrix is the i-th column of the frame, so ticks go at positions
# 0..n-1 and are labelled with the column names. Using the Asset_ID values as tick
# positions would mislabel the axes whenever asset_details is not sorted by Asset_ID.
tick_positions = np.arange(len(corr_matrix.columns))
plt.imshow(corr_matrix);
plt.yticks(tick_positions, corr_matrix.columns);
plt.xticks(tick_positions, corr_matrix.columns, rotation='vertical');
plt.colorbar();

In [None]:
# Index the data by (timestamp, Asset_ID), keep the 2021 window, and drop every row
# that has a missing value in any column (which includes rows with a missing Target).
# NOTE: `crypto_df` is overwritten with its re-indexed version, so this cell cannot be
# run twice, and earlier cells that read crypto_df["Asset_ID"] need a fresh reload.
crypto_df = crypto_df.set_index(['timestamp','Asset_ID']).sort_index()

window_2021 = slice(totimestamp('01/01/2021'), totimestamp('21/09/2021'))
crypto_df_2021 = crypto_df.loc[window_2021]
crypto_df_2021_clean = crypto_df_2021.dropna(axis=0)

In [None]:
# Select some input features from the trading data:
# 5 min log return, abs(1 min log return), upper shadow, and lower shadow.
def upper_shadow(asset):
    """Upper candle shadow: High minus the larger of Close and Open."""
    body_top = np.maximum(asset.Close, asset.Open)
    return asset.High - body_top


def lower_shadow(asset):
    """Lower candle shadow: the smaller of Close and Open minus Low."""
    body_bottom = np.minimum(asset.Close, asset.Open)
    return body_bottom - asset.Low

# Feature matrix X (one row per (timestamp, Asset_ID)) and target vector y.
#
# The frame is indexed by (timestamp, Asset_ID) and sorted, so consecutive rows belong
# to DIFFERENT assets. A plain .diff() would subtract one asset's log-VWAP from
# another's; the differences are therefore taken within each Asset_ID group, which is
# the per-asset equivalent of log_return(series, periods).
# NOTE(review): "5 rows back" equals 5 minutes only where the asset has no missing
# rows in between (rows with NaN's were dropped earlier) - confirm this is acceptable.
log_vwap_by_asset = np.log(crypto_df_2021_clean.VWAP).groupby(level="Asset_ID")

X = pd.concat([log_vwap_by_asset.diff(periods=5),        # 5-row log return
               log_vwap_by_asset.diff(periods=1).abs(),  # absolute 1-row log return
               upper_shadow(crypto_df_2021_clean), lower_shadow(crypto_df_2021_clean)], axis=1)
y = crypto_df_2021_clean.Target

In [None]:
# Training and test periods as [start, end] timestamps (midnight of each date).
train_window = [totimestamp("01/01/2021"), totimestamp("31/08/2021")]
test_window = [totimestamp("01/09/2021"), totimestamp("21/09/2021")]

# Label-based slices over the timestamp level of the index; both ends are included.
train_rows = slice(*train_window)
test_rows = slice(*test_window)

# Split features and target into train and test parts as plain NumPy arrays.
# The aim is simple regression models with a window_size of 1.
# Any remaining NaN's are replaced by zeros.
X_train = X.loc[train_rows].fillna(0).to_numpy()
y_train = y.loc[train_rows].fillna(0).to_numpy()
X_test = X.loc[test_rows].fillna(0).to_numpy()
y_test = y.loc[test_rows].fillna(0).to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the input features. The scaling statistics are learned from the
# training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from xgboost import XGBRegressor

# Fit a gradient-boosted regressor with default settings on the training data and
# predict the test period.
model = XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The competition performance metric is a weighted correlation. For now the plain
# correlation between predictions and targets is used to evaluate the baseline model.
prediction_target_corr = np.corrcoef(y_pred, y_test)
print(prediction_target_corr[0, 1])

In [None]:
import gresearch_crypto
# Create the competition environment object; its iter_test() and predict() methods
# drive the submission loop that follows.
# NOTE(review): presumably make_env() may only be called once per kernel session -
# confirm against the competition API documentation before re-running this cell.
env = gresearch_crypto.make_env()   # initialize the environment

In [None]:
# Submission loop: for every batch served by the competition API, build the same four
# features as in training, scale them with the already-fitted `scaler`, predict with
# the trained `model`, and hand the filled-in prediction frame back to the API.
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # The rows of test_df are kept in the order they are delivered: predictions are
    # written into sample_prediction_df positionally, so re-sorting test_df would
    # misalign them. (The former set_index(...).sort_index() call discarded its
    # result and had no effect; it has been removed.)
    # NOTE(review): these diffs only see the rows of the current batch. If a batch
    # does not carry per-asset history, the return features are not the quantity the
    # model was trained on - confirm, and consider buffering recent VWAP per asset.
    X_test_submission = pd.concat([log_return(test_df.VWAP,periods=5), log_return(test_df.VWAP,periods=1).abs(),
               upper_shadow(test_df), lower_shadow(test_df)], axis=1)
    # Map NaN and +/-inf to 0: fillna(0) alone leaves infinities in place, and
    # scaler.transform rejects them, which would abort the whole submission.
    X_test_submission = np.nan_to_num(X_test_submission.to_numpy(dtype=float), nan=0.0, posinf=0.0, neginf=0.0)
    X_test_submission = scaler.transform(X_test_submission)
    sample_prediction_df['Target'] = model.predict(X_test_submission)
    env.predict(sample_prediction_df)   # register your predictions