In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime # Convert date and time into a timestep

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Data

In [None]:
# Directory prefix for the competition files; later cells append file names to it.
data_file = "../input/g-research-crypto-forecasting/"
# IPython shell escape: list the contents of that directory.
!ls $data_file

## Stored In Variable

In [None]:
# Load the training table from the competition directory.
train_csv_path = data_file + 'train.csv'
cf_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first eight rows of the training table.
cf_df.head(8)

## Data features
-  **timestamp**: All timestamps are given in seconds as Unix timestamps (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiples of 60, indicating minute-by-minute data.

In [None]:
# Load the per-asset metadata table and show it as the cell output.
asset_details_path = data_file + 'asset_details.csv'
asset_details = pd.read_csv(asset_details_path)
asset_details

In [None]:
# Asset_ID = 1 for Bitcoin: keep only its rows and index them by timestamp.
is_bitcoin = cf_df["Asset_ID"] == 1
btcoin = cf_df.loc[is_bitcoin].set_index("timestamp")
# The 200 most recent rows, used for the candlestick chart.
btcoin_min = btcoin.tail(200)

In [None]:
import plotly.graph_objects as go

# Candlestick trace built from the Open/High/Low/Close columns of the recent Bitcoin rows.
btcoin_candles = go.Candlestick(
    x=btcoin_min.index,
    open=btcoin_min['Open'],
    high=btcoin_min['High'],
    low=btcoin_min['Low'],
    close=btcoin_min['Close'],
)
fig = go.Figure(data=[btcoin_candles])
fig.show()

# Preprocessing

## Dealing with missing data 

In [None]:
# Asset_ID = 6 for Ethereum: keep only its rows and index them by timestamp.
is_ethereum = cf_df["Asset_ID"] == 6
miss = cf_df.loc[is_ethereum].set_index("timestamp")
# Column summary with explicit non-null counts.
miss.info(show_counts=True)

In [None]:
# Number of missing values in each column of the Ethereum frame.
miss.isna().sum()

In [None]:
# Preview the first rows of the Bitcoin frame.
btcoin.head()

In [None]:
# First and last timestamp of each asset, converted from seconds to datetime64.
leg_btcoin, end_btcoin = (btcoin.index[pos].astype('datetime64[s]') for pos in (0, -1))
leg_miss, end_miss = (miss.index[pos].astype('datetime64[s]') for pos in (0, -1))

print('BTCOIN data goes from ', leg_btcoin, 'to ', end_btcoin)
print('Ethereum data goes from ', leg_miss, 'to ', end_miss)

- Missing asset data, for a given minute, is not represented by NaN's, but instead by the absence of those rows. We can check the timestamp difference between consecutive rows to see if there is missing data.

In [None]:
# Difference between each timestamp and the previous one, tallied by size;
# any value other than 60 corresponds to missing minutes.
(miss.index[1:]-miss.index[:-1]).value_counts().head()

- Notice that there are many gaps in the data. To work with most time series models, we should preprocess our data into a format without time gaps. To fill the gaps, we can use the `.reindex()` method for forward filling, filling gaps with the previous valid value. 


In [None]:
# One label per minute from the first to the last Ethereum timestamp (inclusive);
# minutes absent from the data take the values of the previous available row.
eth_minutes = range(miss.index[0], miss.index[-1] + 60, 60)
miss = miss.reindex(eth_minutes, method='ffill')

In [None]:
# Same gap tally as before the reindex, to confirm the index is now evenly spaced.
(miss.index[1:]-miss.index[:-1]).value_counts().head()

## Data visualisation
- We will start by visualising the Close prices for the two assets we have selected.

In [None]:
import matplotlib.pyplot as plt

# fill missing minutes for BTCOIN with the previous available row
btcoin_minutes = range(btcoin.index[0], btcoin.index[-1] + 60, 60)
btcoin = btcoin.reindex(btcoin_minutes, method='ffill')

# plot the Close time series for both chosen assets, side by side
f, (ax, ax2) = plt.subplots(1, 2, figsize=(15, 4))

ax.plot(btcoin['Close'], label='BTCOIN')
ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Bitcoin')

# Ethereum is a blockchain computer program similar to Bitcoin. It can be used to
# create automated contracts or circulate a digital currency called Ether.
ax2.plot(miss['Close'], color='red', label='ETH')
ax2.legend()
ax2.set_xlabel('Time')
ax2.set_ylabel('Ethereum')

f.tight_layout()
plt.show()

In [None]:
import time

# auxiliary function, from "dd/mm/YYYY" date string to Unix timestamp
def totimestamp(s):
    """Convert a "dd/mm/YYYY" date string to a Unix timestamp in seconds.

    The date is interpreted as midnight UTC, matching the dataset's timestamps,
    which count seconds since 1970-01-01 00:00:00 UTC. The result therefore does
    not depend on the timezone of the machine running the notebook.

    Parameters
    ----------
    s : str
        Date formatted as day/month/year, e.g. "01/06/2021".

    Returns
    -------
    int
        Seconds elapsed since the Unix epoch. A plain Python int is returned so
        that dates past January 2038 do not overflow a 32-bit integer.

    Raises
    ------
    ValueError
        If `s` does not match the "%d/%m/%Y" format.
    """
    midnight = datetime.strptime(s, "%d/%m/%Y")
    # Both datetimes are naive, so the difference is a pure calendar offset
    # with no local-timezone adjustment.
    return int((midnight - datetime(1970, 1, 1)).total_seconds())

# create intervals: label-based slice between 01/06/2021 and 01/07/2021 (both ends inclusive)
interval_start = totimestamp('01/06/2021')
interval_end = totimestamp('01/07/2021')
btcoin_min_2021 = btcoin.loc[interval_start:interval_end]
miss_min_2021 = miss.loc[interval_start:interval_end]

In [None]:
# plot the Close series of both chosen assets over the selected interval, stacked vertically
f, (ax, ax2) = plt.subplots(2, 1, figsize=(7, 8))

ax.plot(btcoin_min_2021['Close'], label='btc')
ax.legend()
ax.set_xlabel('Time')
ax.set_ylabel('Bitcoin Close')

ax2.plot(miss_min_2021['Close'], color='red', label='eth')
ax2.legend()
ax2.set_xlabel('Time')
ax2.set_ylabel('Ethereum Close')

f.tight_layout()
plt.show()

- On shorter intervals we can visually see some potential correlation between both assets, with some simultaneous ups and downs. A better format for analyzing such movements is by calculating asset returns. 


## Log returns

## In order to analyze price changes for an asset we can deal with the price difference. However, different assets exhibit different price scales, so that their returns are not readily comparable. We can solve this problem by computing the percentage change in price instead, also known as the return. This return coincides with the percentage change in our invested capital.

## Returns are widely used in finance, however log returns are preferred for mathematical modelling of time series, as they are additive across time. Also, while regular returns cannot go below -100%, log returns are not bounded.

## To compute the log return, we can simply take the logarithm of the ratio between two consecutive prices. The first row has no previous value, so its return is empty (NaN); we therefore drop that data point.

In [None]:
# define function to compute log returns
def log_return(series, periods=1):
    """Return the change in log-price between rows that are `periods` apart.

    The first `periods` entries have no earlier row to compare against and
    come back as NaN.
    """
    log_prices = np.log(series)
    return log_prices.diff(periods=periods)

## We can visualize the log return for our two assets. See how the signal now looks more like white noise, with less drift than the time series for prices.


In [None]:
import scipy.stats as stats

# 1-row log returns of Close over the selected interval; the leading NaN row is
# dropped and each series is given a distinct name.
lret_btc = log_return(btcoin_min_2021.Close).iloc[1:].rename('lret_btc')
lret_eth = log_return(miss_min_2021.Close).iloc[1:].rename('lret_eth')

# Both return series drawn on the same axes.
fig_lret, ax_lret = plt.subplots(figsize=(8, 4))
ax_lret.plot(lret_btc)
ax_lret.plot(lret_eth)
plt.show()

## Correlation between assets

- We hypothesized before that crypto asset returns may exhibit some correlation. Let's check this in more detail now.
- We can check how the correlation between Bitcoin and Ethereum changes over time for the 2021 period we selected. 





In [None]:
# join the full-history log returns of the two assets in a single DataFrame
lret_btc_long = log_return(btcoin.Close).iloc[1:].rename('lret_btc')
lret_eth_long = log_return(miss.Close).iloc[1:].rename('lret_eth')
two_assets = pd.concat([lret_btc_long, lret_eth_long], axis=1)

# Bucket rows by integer-dividing the timestamp index by 10000*60, compute the
# correlation matrix inside each bucket, then keep its BTC/ETH entry.
bucket_id = two_assets.index // (10000 * 60)
bucket_corr = two_assets.groupby(bucket_id).corr()
corr_time = bucket_corr.loc[:, "lret_btc"].loc[:, "lret_eth"]

ax_corr = corr_time.plot()
ax_corr.set_xticks([])
ax_corr.set_ylabel("Correlation")
ax_corr.set_title("Correlation between BTC and ETH over time");

## Note the high but variable correlation between the assets. Here we can see that there are some changing dynamics over time, and this would be critical for this time series challenge, that is, how to perform forecasts in a highly non-stationary environment.


## A stationary behaviour of a system or a process is characterized by non-changing statistical properties over time such as the mean, variance and autocorrelation. On the other hand, a non-stationary behaviour is characterized by a continuous change of statistical properties over time. Stationarity is important because many useful analytical tools and statistical tests and models rely on it.


## We can also check the correlation between all assets visualizing the correlation matrix. Note how some assets have much higher pairwise correlation than others.

In [None]:
# create dataframe with returns for all assets
# The window bounds are the same for every asset, so compute them once.
window_start = totimestamp('01/01/2021')
window_end = totimestamp('01/05/2021')

# One log-return series per asset, keyed by asset name. Insertion order follows
# the row order of asset_details, which becomes the column order below.
asset_returns = {}
for asset_id, asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
    asset = cf_df[cf_df["Asset_ID"] == asset_id].set_index("timestamp")
    asset = asset.loc[window_start:window_end]
    # fill missing minutes with the previous available row
    asset = asset.reindex(range(asset.index[0], asset.index[-1] + 60, 60), method='pad')
    # Missing Close values are left as NaN: replacing them with 0 would make
    # np.log return -inf and yield infinite returns, whereas NaN returns are
    # simply excluded when correlations are computed.
    asset_returns[asset_name] = log_return(asset.Close)[1:]

# Build the frame in one step; series are aligned on the union of their timestamps.
all_assets_2021 = pd.DataFrame(asset_returns)

In [None]:
# Heatmap of pairwise correlations between the asset return columns.
# Columns of all_assets_2021 were added while iterating over asset_details row
# by row, so matrix position i belongs to the asset in row i of asset_details.
# Ticks therefore go at row positions 0..n-1; using Asset_ID values as positions
# attaches names to the wrong rows whenever the table is not sorted by ID.
tick_positions = np.arange(len(asset_details))
asset_names = asset_details.Asset_Name.values

plt.imshow(all_assets_2021.corr());
plt.yticks(tick_positions, asset_names);
plt.xticks(tick_positions, asset_names, rotation='vertical');
plt.colorbar();

- We encourage participants to perform additional statistical analyses to have a stronger grasp on the dataset, including autocorrelation, time-series decomposition and stationarity tests.


# Building your prediction model

In [None]:
# Input features taken from the trading data, in column order:
# 5-row VWAP log return, absolute 1-row VWAP log return, upper shadow, lower shadow.
def upper_shadow(asset):
    """High minus the larger of Close and Open."""
    return asset.High - np.maximum(asset.Close, asset.Open)


def lower_shadow(asset):
    """The smaller of Close and Open, minus Low."""
    return np.minimum(asset.Close, asset.Open) - asset.Low


def _feature_frame(asset):
    """Assemble the four feature columns for one asset frame, side by side."""
    feature_columns = [
        log_return(asset.VWAP, periods=5),
        log_return(asset.VWAP, periods=1).abs(),
        upper_shadow(asset),
        lower_shadow(asset),
    ]
    return pd.concat(feature_columns, axis=1)


X_btc = _feature_frame(btcoin)
y_btc = btcoin.Target

X_eth = _feature_frame(miss)
y_eth = miss.Target

In [None]:
# select training and test periods as [first, last] timestamp pairs
train_window = [totimestamp("01/05/2021"), totimestamp("30/05/2021")]
test_window = [totimestamp("01/06/2021"), totimestamp("30/06/2021")]


def _window_array(data, window):
    """Label-slice `data` between the two window bounds (inclusive), fill NaN's with zeros, return a NumPy array."""
    first, last = window
    return data.loc[first:last].fillna(0).to_numpy()


# divide data into train and test, compute X and y
# we aim to build simple regression models using a window_size of 1
X_btc_train = _window_array(X_btc, train_window)
y_btc_train = _window_array(y_btc, train_window)

X_btc_test = _window_array(X_btc, test_window)
y_btc_test = _window_array(y_btc, test_window)

X_eth_train = _window_array(X_eth, train_window)
y_eth_train = _window_array(y_eth, train_window)

X_eth_test = _window_array(X_eth, test_window)
y_eth_test = _window_array(y_eth, test_window)

## We now standardize the input data. Standardization is the process of putting different variables on the same scale. In regression analysis, it is often crucial to standardize your independent variables or you may risk obtaining misleading results.


In [None]:
from sklearn.preprocessing import StandardScaler
# simple preprocessing of the data: standardize features using training-set statistics
scaler = StandardScaler()

# BTC: fit on the training window only, then apply the same transform to both splits
X_btc_train_scaled = scaler.fit(X_btc_train).transform(X_btc_train)
X_btc_test_scaled = scaler.transform(X_btc_test)

# ETH: the same scaler object is refit here, so afterwards `scaler` holds the ETH statistics
X_eth_train_scaled = scaler.fit(X_eth_train).transform(X_eth_train)
X_eth_test_scaled = scaler.transform(X_eth_test)

## Baseline model: Linear Regression
- We will try a simple Linear Regression model on the features we designed. Note that Linear Regression is not commonly used in time series analysis, especially with only one time step! 

- We compare two Linear Regression baselines: one that models each asset independently, and a multiple-input, multiple-output one that models both assets together.


In [None]:
from sklearn.linear_model import LinearRegression

# implement basic ML baseline (one per asset), reusing a single estimator object
lr = LinearRegression()

# fit() returns the estimator itself, so fitting and predicting can be chained
y_pred_lr_btc = lr.fit(X_btc_train_scaled, y_btc_train).predict(X_btc_test_scaled)

# refitting replaces the BTC coefficients; after this line `lr` holds the ETH model
y_pred_lr_eth = lr.fit(X_eth_train_scaled, y_eth_train).predict(X_eth_test_scaled)

In [None]:
# implement more complex baseline (multiple output regression model)
from sklearn.multioutput import MultiOutputRegressor

# we concatenate X and y for both assets
# Features are placed side by side (BTC columns first, then ETH) and the two
# targets become the two output columns, in the same BTC, ETH order.
# NOTE(review): this needs the BTC and ETH arrays to have the same number of
# rows in each window — confirm both assets cover the full train/test periods.
X_both_train = np.concatenate((X_btc_train_scaled, X_eth_train_scaled), axis=1)
X_both_test = np.concatenate((X_btc_test_scaled, X_eth_test_scaled), axis=1)
y_both_train = np.column_stack((y_btc_train, y_eth_train))
y_both_test = np.column_stack((y_btc_test, y_eth_test))

# define the direct multioutput model and fit it
# The wrapper itself is fit and used for prediction. Previously it was created
# but the per-asset `lr` estimator was fit instead, leaving `mlr` unused and
# overwriting the single-asset model.
mlr = MultiOutputRegressor(LinearRegression())
mlr.fit(X_both_train, y_both_train)
y_pred_lr_both = mlr.predict(X_both_test)

## Evaluate baselines
- The competition performance metric is weighted correlation. However, for now we will use simple correlation to evaluate the two baseline models built.

In [None]:
def _corr_score(predicted, actual):
    """Correlation coefficient between predictions and targets, formatted to two decimals."""
    return f"{np.corrcoef(predicted, actual)[0,1]:.2f}"


# Single-asset baselines.
single_btc_score = _corr_score(y_pred_lr_btc, y_btc_test)
single_eth_score = _corr_score(y_pred_lr_eth, y_eth_test)
print('Test score for LR baseline: BITCOIN', single_btc_score, ', ETHEREUM', single_eth_score)

# Two-output baseline: column 0 is the BTC prediction, column 1 the ETH prediction.
both_btc_score = _corr_score(y_pred_lr_both[:,0], y_btc_test)
both_eth_score = _corr_score(y_pred_lr_both[:,1], y_eth_test)
print('Test score for multiple output LR baseline: BITCOIN', both_btc_score, ', ETHEREUM', both_eth_score)

## We can see that, for the training and test periods selected, the multiple asset LR model performs better than simply modelling each asset separately. Note that because the data is highly non-stationary, these results might vary a lot for different periods.


## Submission 

Note that this is a Code Competition, in which you must submit your notebook to be run against the hidden private data. Your notebook should use the provided python time-series API, which ensures that models do not peek forward in time. To use the API, follow the instructions and template in [Code Competition Detailed API instructions](https://www.kaggle.com/eranuragsingh/anurag-detailed-api-introduction/edit) and [Basic Submission Template](https://www.kaggle.com/eranuragsingh/anurag-basic-submission-template/edit).