### Evaluation Metric
While mean squared error, R^2, explained variance, and correlation are all very closely related, correlation has the useful property that it tends to normalize leading-order volatility out of the covariance between target and prediction. In financial markets (especially crypto ones!), predicting volatility is a difficult (but interesting!) question in its own right. By using correlation as a metric we hope to remove some noise from the prediction problem and provide a more stable metric to evaluate against.

### Target
The target calculation is based on the close price of the asset and can be derived from the provided data using the methodology in https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition. 

### Weights
In this competition, the weights are determined by the logarithm of each product's market cap (in USD), of the cryptocurrencies at a fixed point in time. Weights were assigned to give more relevance to cryptocurrencies with higher market volumes to ensure smaller cryptocurrencies do not disproportionately impact your models.

In [None]:


# Build the TA-Lib C library from the source archive shipped in the
# `talibinstall` input dataset (the archive is stored with a `.gzh` suffix and
# copied to a regular `.tar.gz` name before extraction).
# NOTE: `> null` redirects stdout into a file literally named `null` in the
# working directory; it is not `/dev/null`.
!cp ../input/talibinstall/ta-lib-0.4.0-src.tar.gzh  ./ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz > null
!cd ta-lib && ./configure --prefix=/usr > null && make  > null && make install > null



# Install the Python wrapper from the local archive, then a pinned numpy wheel
# from the same dataset, and finally import the wrapper as `ta`.
!cp ../input/talibinstall/TA-Lib-0.4.21.tar.gzh TA-Lib-0.4.21.tar.gz
!pip install TA-Lib-0.4.21.tar.gz
!pip install ../input/talibinstall/numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
import talib as ta



In [None]:
# !pip install ta-lib

In [None]:
# Import Libraries
import warnings # Supress warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from datetime import datetime

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

from PIL import Image
import cv2
import talib as ta


# Load the per-asset metadata table and the training data.
asset_details = pd.read_csv(
    '../input/g-research-crypto-forecasting/asset_details.csv',
    low_memory=False,
)
train = pd.read_csv(
    '../input/g-research-crypto-forecasting/train.csv',
    low_memory=False,
)
#supplemental_train = pd.read_csv('supplemental_train.csv', low_memory=False)
#example_test = pd.read_csv('example_test.csv', low_memory=False)
#example_sample_submission = pd.read_csv('example_sample_submission.csv', low_memory=False)

# Map every Asset_ID to the Asset_Name of its first matching metadata row.
rename_dict = {
    asset_id: asset_details[asset_details.Asset_ID == asset_id].Asset_Name.values[0]
    for asset_id in asset_details['Asset_ID']
}

display(asset_details)

### Preprocessing
For the following EDA, the minute-level crypto data is resampled to daily samples, which reduces the number of samples from 24,236,806 to 1,360.

In [None]:
# Total of the per-asset weights listed in the metadata table.
asset_details.Weight.sum()

The weight of each asset is used to weigh its relative importance in the evaluation metric.

### Check BTC

In [None]:
# Keep only the rows of Asset_ID 1 and renumber them from zero.
is_btc = train.Asset_ID == 1
btc = train.loc[is_btc].reset_index(drop=True)
btc

In [None]:
# Render the epoch-second timestamps as strings, then take everything from
# character 11 onwards (the time-of-day part) into its own column.
btc['date'] = btc.timestamp.astype('datetime64[s]').astype(str)
btc['time'] = btc['date'].str[11:]
btc

In [None]:
# Keep a single row per day: the one stamped 00:01:00.
at_first_minute = btc['time'] == '00:01:00'
btc = btc.loc[at_first_minute].reset_index(drop=True)
btc

### HEATMAP

In [None]:
import seaborn as sns

# Pairwise correlation of the raw BTC columns, drawn as an annotated heatmap.
btc_columns = ['Count','Open','High','Low','Close','Volume','VWAP','Target']
btc_corr = btc[btc_columns].corr()

plt.figure(figsize=(10,8))
sns.heatmap(
    btc_corr,
    vmin=-1.0, vmax=1.0,
    annot=True, cmap='coolwarm', linewidths=0.1,
)
plt.show()

In [None]:
# Plot the Target series of every asset in its own panel of a 4x4 grid.
# Each title is "<panel number - 1> <asset name>".
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # Take the scalar name with .values[0]; concatenating with the whole array
    # produced a one-element array that the title rendered with brackets.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(f"{x-1} {asset_name}", fontsize=18)
    plt.plot(money.index, money.Target)
del money

・ Waveforms of 0 and 5 are very similar
<br>・ Waveforms of 1 and 7 are similar
<br>・ Waveforms of 3,8,12,13 are a little similar
<br>・ Waveforms of 2, 9 and 10 are a little similar
<br>・ There are not many similarities between 6 and 11

In [None]:
# Overlay the Target series of every asset on one figure (all training rows).
fig = plt.figure(figsize=(30,20))
data = train #[-10000:]
for asset_id in data.Asset_ID.unique():
    asset_rows = data.loc[data.Asset_ID == asset_id]
    money = asset_rows.reset_index(drop=True)
    plt.plot(money.index, money.Target)
del data

In [None]:
# Overlay the Target series of every asset, restricted to the last 10000 rows.
fig = plt.figure(figsize=(30,20))
data = train[-10000:]
for asset_id in data.Asset_ID.unique():
    asset_rows = data.loc[data.Asset_ID == asset_id]
    money = asset_rows.reset_index(drop=True)
    plt.plot(money.index, money.Target)
del data

In [None]:
# Overlay the Target series of every asset, restricted to the last 1000 rows.
# `data` is intentionally left defined: the next cell reads it.
fig = plt.figure(figsize=(30,20))
data = train[-1000:]
for asset_id in data.Asset_ID.unique():
    asset_rows = data.loc[data.Asset_ID == asset_id]
    money = asset_rows.reset_index(drop=True)
    plt.plot(money.index, money.Target)

### HeatMap Coins

Some coins have a strong correlation.

In [None]:
# Build a table with one Target column per asset, aligned on timestamp.
# The previous version aligned the series by row position after reset_index,
# so a missing minute for one asset shifted all of its later values against the
# other assets (and every column was cut to the first asset's length), which
# distorts the cross-asset correlations computed from this table.
check = (
    data.pivot_table(index='timestamp', columns='Asset_ID', values='Target', aggfunc='first')
        # Keep the assets in order of first appearance and restore any column
        # that pivot_table dropped for being entirely NaN.
        .reindex(columns=data.Asset_ID.unique())
        .rename_axis(index=None, columns=None)
)

In [None]:
import seaborn as sns

# Correlate the per-asset Target columns over the rows where all are present.
coin_corr = check.dropna().corr()

plt.figure(figsize=(10,8))
sns.heatmap(
    coin_corr,
    vmin=-1.0, vmax=1.0,
    annot=True, cmap='coolwarm', linewidths=0.1,
)
plt.show()
del check

In [None]:
# Report the first timestamp present for each asset.
# The label used to read 'Asset_ID=' while printing only the timestamp; it now
# shows the id as well. `infer_datetime_format` is dropped because it has no
# effect when `unit` is given.
for i in train.Asset_ID.unique():
    check = train[train.Asset_ID==i].reset_index(drop=True)
    first_seen = pd.to_datetime(check.loc[0,'timestamp'], unit="s")
    print(f'Asset_ID={i}: first timestamp {first_seen}')

In [None]:
# Display `check`, which the loop in the preceding cell left bound to the rows
# of the last asset it iterated over.
check

### Open/Close

In [None]:
# Plot Open (red) and Close (blue) for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    plt.plot(money.index, money.Open, color="red")
    plt.plot(money.index, money.Close, color="blue")

### High/Low

In [None]:
# Plot High (red) and Low (blue) for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    plt.plot(money.index, money.High, color="red")
    plt.plot(money.index, money.Low, color="blue")
del money

### Volume

In [None]:
# Plot Volume for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    plt.plot(money.index, money.Volume)
del money

### VWAP

In [None]:
# Plot VWAP for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    plt.plot(money.index, money.VWAP)

del money

### Count

In [None]:
# Plot Count for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    plt.plot(money.index, money.Count)
del money

In [None]:
# Spread columns: High minus Low, and Open minus Close.
btc['high_low'] = btc.High.sub(btc.Low)
btc['open_close'] = btc.Open.sub(btc.Close)

In [None]:
# Correlation heatmap of the BTC columns, now including the two spread columns.
btc_columns = [
    'Count','Open','High','Low','Close','Volume',
    'high_low','open_close','VWAP','Target',
]
btc_corr = btc[btc_columns].corr()

plt.figure(figsize=(10,8))
sns.heatmap(
    btc_corr,
    vmin=-1.0, vmax=1.0,
    annot=True, cmap='coolwarm', linewidths=0.1,
)
plt.show()

### MACD

In [None]:
# Plot the MACD line (12/26/9) of Close for every asset, one panel each in a
# 4x4 grid. Only the first of the three arrays returned by ta.MACD is kept.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    money['MACD'], _, _ = ta.MACD(money['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    plt.plot(money.index, money.MACD)

### RSI

In [None]:
# Plot the 14-period RSI of Close for every asset, one panel each in a 4x4 grid.
fig = plt.figure(figsize=(30,20))
for x, i in enumerate(train.Asset_ID.unique(), start=1):
    money = train[train.Asset_ID==i].reset_index(drop=True)
    fig.add_subplot(4, 4, x)
    # .values[0] gives the plain name; passing the array itself made the title
    # render as a bracketed list.
    asset_name = asset_details[asset_details.Asset_ID==i].Asset_Name.values[0]
    plt.title(asset_name, fontsize=18)
    money["RSI"] = ta.RSI(money['Close'], timeperiod=14)
    plt.plot(money.index, money.RSI)

In [None]:
# Print pandas' summary of the training frame (DataFrame.info).
train.info()

In [None]:
# Render the training frame in the notebook output.
display(train)

In [None]:
# Convert timestamp
train['timestamp'] = train['timestamp'].astype('datetime64[s]')

# Resample every asset to daily bars.
# How each column is reduced over one calendar day; the key order is also the
# column order of the per-asset result.
daily_aggregations = {
    'Count': 'sum',
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
    'Target': 'mean',  # aggregated here, but not selected by the pivot below
}

# Collect the per-asset frames and concatenate once. DataFrame.append inside
# the loop re-copied the accumulated frame on every iteration and no longer
# exists in pandas >= 2.0.
daily_frames = []
for asset_id in asset_details.Asset_ID:
    train_single = train[train.Asset_ID == asset_id]

    train_single_new = train_single.resample('D', on='timestamp').agg(daily_aggregations)
    train_single_new['Asset_ID'] = asset_id

    daily_frames.append(train_single_new.reset_index(drop=False))

train_daily = pd.concat(daily_frames)
train_daily = train_daily.sort_values(by = ['timestamp', 'Asset_ID']).reset_index(drop=True)

# Wide layout: one row per day, columns keyed by (field, Asset_ID).
train_daily = train_daily.pivot(index='timestamp', columns='Asset_ID')[['Count', 'Open', 'High', 'Low', 'Close', 'Volume']]
train_daily = train_daily.reset_index(drop=False)

display(train_daily.head(10))

## Data Overview
Furthermore, we have samples from 2018-01-01 to 2021-09-21 for the majority of coins. For TRON, Stellar, Cardano, IOTA, Maker, and Dogecoin we have less data, starting later in 2018 or, in Dogecoin's case, in 2019.

In [None]:
# Print pandas' summary of the daily frame (DataFrame.info).
train_daily.info()

In [None]:
# Number of rows in the asset metadata table.
len(asset_details.Asset_ID)

In [None]:
# Asset names, in the row order of the metadata table.
asset_details.Asset_Name

In [None]:
# Visualize: one daily candlestick chart per asset, stacked vertically.
n_assets = len(asset_details.Asset_ID)
fig = make_subplots(
    rows=n_assets, cols=1,
    subplot_titles=list(asset_details.Asset_Name)
)

# Every panel shares the same date span, so compute it once.
x_range = [train_daily.timestamp.iloc[0], train_daily.timestamp.iloc[-1]]

for row, asset_id in enumerate(asset_details.Asset_ID, start=1):
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(
        go.Candlestick(
            x=train_daily.timestamp,
            open=train_daily[('Open', asset_id)],
            high=train_daily[('High', asset_id)],
            low=train_daily[('Low', asset_id)],
            close=train_daily[('Close', asset_id)],
        ),
        row=row, col=1,
    )

# Without row/col, update_xaxes applies to every x-axis in the figure. This
# replaces the 14 hard-coded `xaxisN_rangeslider_visible` keys and keeps
# working if the number of assets changes.
fig.update_xaxes(range=x_range, rangeslider_visible=False)

fig.update_layout(
    height=3000, width=800,
    #title_text="Subplots with Annotations"
    margin=dict(l=0, r=0, b=0, t=30, pad=0),
)

fig.show()

### Feature Engineering 1: Time Features

In [None]:
# Calendar features derived from the daily timestamp.
# Build the DatetimeIndex once instead of once per feature.
dates = pd.DatetimeIndex(train_daily['timestamp'])
train_daily['year'] = dates.year
train_daily['quarter'] = dates.quarter
train_daily['month'] = dates.month
# DatetimeIndex.weekofyear is deprecated and removed in pandas 2.0; the ISO
# calendar week is its replacement. isocalendar() returns a frame indexed by the
# dates themselves, so convert to a plain array to assign by position.
train_daily['weekofyear'] = dates.isocalendar().week.to_numpy(dtype='int64')
train_daily['dayofyear'] = dates.dayofyear
train_daily['weekday'] = dates.weekday

### Stationarity
The 'Close' prices seem to be mostly non-stationary. However, Bitcoin and Ethereum seem to be stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

def check_stationarity(series, asset_id):
    """Run the Augmented Dickey-Fuller test on `series` and print the outcome.

    The series is reported as stationary when the p-value is at most 0.05 and
    the ADF statistic lies below the 5% critical value. The verdict line is
    prefixed with the asset's name (looked up in the global `asset_details`)
    and is followed by the statistic, the p-value and all critical values.

    Args:
        series: object exposing `.values` with the observations to test.
        asset_id: Asset_ID used to look up the display name.
    """
    # Adapted from https://machinelearningmastery.com/time-series-data-stationary-python/
    result = adfuller(series.values)
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    asset_name = asset_details[asset_details.Asset_ID == asset_id].Asset_Name.values[0]

    significant = p_value <= 0.05
    below_critical = critical_values['5%'] > adf_statistic
    if significant and below_critical:
        print(f"{asset_name}: \u001b[32mStationary\u001b[0m")
    else:
        print(f"{asset_name}: \x1b[31mNon-stationary\x1b[0m")

    print('ADF Statistic: %f' % adf_statistic)
    print('p-value: %f' % p_value)
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print('\t%s: %.3f' % (level, threshold))
    print('\n')
# Test the daily Close series of every asset.
# The column must be selected by asset_id: selecting by the enumerate position
# paired each asset's name with another asset's prices whenever the metadata
# rows are not sorted by Asset_ID. Missing days are dropped rather than filled
# with 0, because a zero is not a real price and would be fed to the ADF test.
for asset_id in asset_details.Asset_ID:
    check_stationarity(train_daily[('Close', asset_id)].dropna(), asset_id)

### Feature Engineering 2: Log Return
To make a time series stationary, you can try differencing it. In this case, we will use the log return instead as shown below.

In [None]:
# define function to compute log returns
def log_return(series, periods=1):
    """Return the log return of `series` over `periods` steps.

    Computed as the `periods`-step difference of the natural log of the
    series, so the first `periods` entries of the result are NaN.
    """
    # Adapted from https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition
    log_prices = np.log(series)
    return log_prices.diff(periods=periods)


# Add a one-day log-return column ('lret', asset_id) for every asset.
for asset_id in asset_details.Asset_ID:
    close_prices = train_daily[('Close', asset_id)]
    train_daily[('lret', asset_id)] = log_return(close_prices)

### Determining Trend with Time-Series Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

periods = [7, 28, 365]

asset_id = 1 # Bitcoin
close_col = ('Close', asset_id)
asset_name = asset_details[asset_details.Asset_ID == asset_id].Asset_Name.values[0]
x_limits = [train_daily.timestamp.iloc[0], train_daily.timestamp.iloc[-1]]

# Visualize: one panel per decomposition period, Close in grey and trend in red.
f, ax = plt.subplots(nrows=len(periods), ncols=1, figsize=(12, 12))
for i, p in enumerate(periods):
    trend_col = (f'Trend_{p}', asset_id)

    # Gaps are zero-filled only so the decomposition can run; the trend is
    # masked back to NaN wherever Close itself is missing.
    decomp = seasonal_decompose(train_daily[close_col].fillna(0), period=p, model='additive', extrapolate_trend='freq')
    # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
    train_daily[trend_col] = np.where(train_daily[close_col].isna(), np.nan, decomp.trend)

    sns.lineplot(data=train_daily, x='timestamp', y=close_col, ax=ax[i], color='lightgrey');
    sns.lineplot(data=train_daily, x='timestamp', y=trend_col, ax=ax[i], color='red');
    ax[i].set_title(f"{asset_name} Trend with a Period of {p} Day")
    ax[i].set_xlim(x_limits)
    #ax[i].set_ylim([-0.6,0.6])
    ax[i].set_ylabel('Close Price [$]')

#plt.suptitle(f'Underlying Trend with {PERIOD} day period\n')
plt.tight_layout()
plt.show()

In [None]:
# Centered 365-day moving average of the Close price of Asset_ID 1, drawn in
# red over the raw Close series in light grey.
close_col = ('Close', 1)
yearly_window = train_daily[close_col].rolling(
    window=365,       # 365-day window
    center=True,      # place each average at the middle of its window
    min_periods=183,  # about half the window must be present
)
trend = yearly_window.mean()

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,5))

sns.lineplot(data=train_daily, x='timestamp', y=close_col, ax=ax, color='lightgrey');
sns.lineplot(x=train_daily['timestamp'], y=trend, ax=ax, color='red');

first_day = train_daily.timestamp.iloc[0]
last_day = train_daily.timestamp.iloc[-1]
ax.set_xlim([first_day, last_day])

plt.show()