# Introduction to Crypto Forecasting

Ongoing work.

In [None]:
import gresearch_crypto
import pandas as pd
import numpy as np
import os

import time
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 999
%config InlineBackend.figure_format = 'retina'

# Print the full path of every file found under /kaggle/input.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

### Training data is in the competition dataset as usual

In [None]:
# Explicit narrow dtypes requested for the training columns.
# (The original literal listed 'Count' twice; each key now appears once.)
dtype = {
    'Asset_ID': 'int8',
    'Count': 'int32',
    'row_id': 'int32',
    'Open': 'float32',
    'High': 'float32',
    'Low': 'float32',
    'Close': 'float32',
    'Volume': 'float32',
    'VWAP': 'float32',
}

# Training rows and the per-asset details table from the competition dataset.
train_df = pd.read_csv('../input/g-research-crypto-forecasting/train.csv', low_memory=False, dtype=dtype)
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

In [None]:
# auxiliary function, from datetime to timestamp
def totimestamp(s):
    """Convert a ``dd/mm/YYYY`` date string to an integer timestamp.

    Parameters
    ----------
    s : str
        Date formatted as ``"%d/%m/%Y"`` (e.g. ``"01/05/2021"`` is 1 May 2021).

    Returns
    -------
    numpy.int32
        Seconds since the epoch for midnight of that date.

    Raises
    ------
    ValueError
        If ``s`` does not match the ``"%d/%m/%Y"`` format.
    """
    # NOTE(review): time.mktime interprets the date in the machine's local
    # time zone, so the result depends on the TZ setting — confirm this
    # matches how the dataset timestamps were generated.
    parsed = datetime.strptime(s, "%d/%m/%Y")
    return np.int32(time.mktime(parsed.timetuple()))

# define function to compute log returns
def log_return(series, periods=1):
    """Return the ``periods``-step difference of the natural log of ``series``.

    The first ``periods`` entries of the result are NaN because they have no
    earlier value to be compared with.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

# Asset Details

In [None]:
# Number of distinct Asset_ID values in the training data.
len(train_df['Asset_ID'].unique())

In [None]:
# Display the asset_details table.
asset_details

# time stamps

In [None]:
# Number of distinct timestamp values in the training data.
len(train_df['timestamp'].unique())

In [None]:
# Earliest and latest timestamps, shown as datetimes.
# Series.min()/max() are vectorised; the builtin min()/max() would iterate the
# array of unique timestamps one element at a time (and build it twice).
print("min date", datetime.fromtimestamp(train_df.timestamp.min()))
print("max date", datetime.fromtimestamp(train_df.timestamp.max()))

In [None]:
# Difference between consecutive distinct timestamps.
unique_ts = train_df.timestamp.unique()
plt.plot(np.diff(unique_ts))

In [None]:
# Find gaps larger than 60 between consecutive distinct timestamps and
# summarise them (count and mean size) per calendar day.
# The unique timestamps and their differences are computed once and reused.
unique_ts = train_df.timestamp.unique()
deltas = np.diff(unique_ts)
is_gap = deltas > 60

# esp: the timestamp that follows each gap; val: the size of that gap.
# unique_ts[1:] lines up element-for-element with deltas, which replaces the
# original "prepend False to the mask" trick.
esp = unique_ts[1:][is_gap]
val = deltas[is_gap]

gap_days = [datetime.fromtimestamp(timestamp).strftime("%d/%m/%y") for timestamp in esp]
pd.DataFrame({'day': gap_days, 'delta': val}).groupby('day').agg(['size', 'mean'])

In [None]:
# Find gaps larger than 120 between consecutive distinct timestamps and count
# them per calendar day.
# The unique timestamps and their differences are computed once and reused.
unique_ts = train_df.timestamp.unique()
deltas = np.diff(unique_ts)
is_gap = deltas > 120

# esp: the timestamp that follows each gap; val: the size of that gap.
esp = unique_ts[1:][is_gap]
val = deltas[is_gap]

gap_days = [datetime.fromtimestamp(timestamp).strftime("%d/%m/%y") for timestamp in esp]
pd.DataFrame({'day': gap_days, 'delta': val}).groupby('day').count()

Weird things happened on 16/10/2019 and 23/10/2019

# outliers

Remove rows where the target is missing (NaN)?

In [None]:
#train_df = train_df[~train_df.Target.isna()]

# data distribution

In [None]:
# Histogram of timestamps, one panel per Asset_ID.
train_df.timestamp.hist(by=train_df.Asset_ID);

Some assets appear with a shorter history, especially assets #10 and #11.

In [None]:
# Histogram of Count on a log-scaled axis, one panel per Asset_ID.
train_df.Count.hist(by=train_df.Asset_ID, log=True);

In [None]:
# Histogram of Open on a log-scaled axis, one panel per Asset_ID.
train_df.Open.hist(by=train_df.Asset_ID, log=True);

In [None]:
# Histogram of Volume on a log-scaled axis, one panel per Asset_ID.
train_df.Volume.hist(by=train_df.Asset_ID, log=True);

In [None]:
# Histogram of VWAP on a log-scaled axis, one panel per Asset_ID.
# Rows with an infinite VWAP are dropped first. The original omitted the
# ['VWAP'] selection, so the histogram ran over the whole frame instead of the
# VWAP column like the sibling cells do for Count/Open/Volume.
finite_vwap = train_df[~np.isinf(train_df['VWAP'])]
finite_vwap['VWAP'].hist(by=finite_vwap['Asset_ID'], log=True);

Seems to have outliers (ultra low volume ?)

# Normalized prices

In [None]:
# Plot every asset's Close price divided by its first Close value, against
# timestamp, so all curves start at 1.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['Close', 'timestamp']]
    normalized = asset_rows.set_index('timestamp')
    normalized['Close'] = normalized['Close'] / normalized['Close'].iloc[0]
    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(normalized, label=asset_name)

plt.legend(bbox_to_anchor=(1.05, 1), fontsize='x-small')
plt.show()

# Returns

Returns within a time frame (Open to Close) vs. returns between consecutive time frames (Close to Close)?

In [None]:
# Plot, per asset, the log of Close/Open for each row against timestamp.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['Close', 'Open', 'timestamp']]
    asset_rows = asset_rows.set_index('timestamp')
    within_bar = np.log(asset_rows['Close'] / asset_rows['Open']).to_frame('Close')
    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(within_bar, label=asset_name)

plt.legend(loc='upper left', fontsize='x-small')
plt.show()

In [None]:
# Plot, per asset, the log of each Close divided by the previous row's Close,
# against timestamp.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['Close', 'timestamp']]
    closes = asset_rows.set_index('timestamp')['Close']
    between_bars = np.log(closes / closes.shift()).to_frame('Close')

    #train_df.loc[train_df.Asset_ID==i,'log_ret_diff'] = df_p.Close

    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(between_bars, label=asset_name)

plt.legend(loc='upper left', fontsize='x-small')
plt.show()

# volume

In [None]:
# Plot each asset's Volume against timestamp.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['Volume', 'timestamp']]
    volume = asset_rows.set_index('timestamp')
    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(volume, label=asset_name)

plt.legend(loc='upper left', fontsize='x-small')
plt.show()

# Volatility

Garman-Klass estimator for volatility, smoothed with a rolling mean. Maybe better with inter-period (close-to-close) volatility?

In [None]:
# Per asset: compute 0.5*ln(High/Low)^2 - (2*ln(2) - 1)*ln(Close/Open)^2 for
# each row, take a 2400-row rolling mean, and plot its log against timestamp.
for asset_id in train_df.Asset_ID.unique():
    bars = train_df.loc[train_df.Asset_ID == asset_id, ['Open', 'High', 'Low', 'Close', 'timestamp']]
    bars = bars.set_index('timestamp')
    range_term = 1 / 2 * np.log(bars.High / bars.Low) ** 2
    body_term = (2 * np.log(2) - 1) * np.log(bars.Close / bars.Open) ** 2
    gk_vol = (range_term - body_term).astype('float32').to_frame('GK_vol')
    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(np.log(gk_vol.rolling(2400).mean()), label=asset_name)

plt.legend(loc='upper left', fontsize='x-small')
plt.show()

# Target

In [None]:
import seaborn as sns

# NOTE(review): the value returned by this call is not assigned or passed on,
# so it presumably has no effect on the plot below — confirm whether
# sns.set_palette(...) or a palette= argument was intended.
sns.color_palette("tab10")
# Kernel density plot of Target with one filled curve per Asset_ID (hue).
sns.kdeplot(data=train_df, x='Target', hue='Asset_ID', fill=True, common_norm=False, alpha=0.4)
plt.show()

# Market

In [None]:
%%time

features_to_aggregate = ['log_ret']
train_df['log_ret'] = np.log(train_df.Close/train_df.Open)

dict_weights = {}

for i in range(asset_details.shape[0]):
    dict_weights[asset_details.iloc[i,0]] = asset_details.iloc[i,1]
    
train_df['weights'] = train_df.Asset_ID.map(dict_weights).astype('float32')

t, w, A_id = (train_df[col].values for col in ['timestamp','weights','Asset_ID'])
ids, index = np.unique(t, return_index=True)

Values = train_df[features_to_aggregate].values
splits = np.split(Values, index[1:])
splits_w = np.split(w, index[1:])
splits_A_id = np.split(A_id, index[1:])

ret_small = []
ret = []

for time_id, x, w, A_id in zip(ids.tolist(), splits, splits_w, splits_A_id):
    outputs = np.float32(np.sum((x.T*w),axis=1)/sum(w))
    ret_small.append(outputs)
    ret.append(np.tile(outputs, (len(w), 1)))


In [None]:
# One row per distinct timestamp with its weighted market log return.
market_log_ret = np.concatenate(ret_small, axis=0).flatten()
market_ret = pd.DataFrame({'timestamp': ids, 'log_ret_M': market_log_ret})

In [None]:
#train_df['log_ret_M'] = np.concatenate(ret,axis=0)

In [None]:
# Index market_ret by timestamp (the timestamp column is kept as well).
market_ret.index = market_ret['timestamp']

# Price on log scale / compared to market

In [None]:
# Show the first five rows of the training frame.
train_df.head(n=5)

# Cumulated log returns

In [None]:
# Cumulative sum of log_ret per asset against timestamp, with the market's
# cumulative log return overlaid in black.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['log_ret', 'timestamp']]
    asset_log_ret = asset_rows.set_index('timestamp')
    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(asset_log_ret.cumsum(), label=asset_name)

market_cumulated = market_ret.log_ret_M.cumsum()
plt.plot(market_cumulated, label='Market', c='Black')
plt.legend(loc='upper left', fontsize='x-small')
plt.show()

# Correlation with target - Beta

Residualised log returns appear to be generally negatively correlated with the target. I exploit this idea in this ultra-fast baseline notebook: https://www.kaggle.com/lucasmorin/minimal-fast-baseline

In [None]:
# Attach the per-row market log return, its square, and its product with the
# asset's own log return.
market_per_row = np.concatenate(ret, axis=0).flatten()
train_df['log_ret_M'] = market_per_row
train_df['log_ret_M2'] = train_df['log_ret_M'] ** 2
train_df['log_ret_Mr'] = train_df['log_ret_M'] * train_df['log_ret']

In [None]:
# Per asset: rolling beta = rolling mean of (market * asset log return) divided
# by rolling mean of squared market log return, over full 3750-row windows.
for asset_id in train_df.Asset_ID.unique():
    asset_rows = train_df.loc[train_df.Asset_ID == asset_id, ['log_ret_Mr', 'log_ret_M2']]
    cross_mean = asset_rows['log_ret_Mr'].rolling(window=3750, min_periods=3750).mean()
    market_sq_mean = asset_rows['log_ret_M2'].rolling(window=3750, min_periods=3750).mean()
    beta = cross_mean / market_sq_mean

    asset_name = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name'].values[0]
    plt.plot(beta, label=asset_name)

plt.legend(loc='upper left', fontsize='x-small')
plt.show()

In [None]:
# For every asset: bucket rows into windows of 3750*60 timestamp units, compute
# the correlation between the asset's log return and the market log return in
# each window, print the mean over windows and plot the per-window series.
for i in train_df.Asset_ID.unique():
    
    print('Asset: '+asset_details[asset_details.Asset_ID == i].Asset_Name.values[0])
    
    
    df_p = train_df[['log_ret','log_ret_M','Target','timestamp']][train_df.Asset_ID==i]
    
    df_p.index = df_p.timestamp
    
    # Residual of the asset's log return after subtracting the market's.
    df_p['log_ret_res'] = df_p['log_ret'] - df_p['log_ret_M']
    
    #corr_time = df_p.groupby(df_p.index//(3750*60)).corr().loc[:,"Target"].loc[:,'log_ret']
    #corr_time2 = df_p.groupby(df_p.index//(3750*60)).corr().loc[:,"Target"].loc[:,'log_ret_M']
    # Per-window correlation matrix, reduced to the (log_ret, log_ret_M) entry.
    corr_time3 = df_p.groupby(df_p.index//(3750*60)).corr().loc[:,'log_ret'].loc[:,'log_ret_M']
    #corr_time4 = df_p.groupby(df_p.index//(3750*60)).corr().loc[:,"Target"].loc[:,'log_ret_res']
    
    #print('Asset: Correlation between target and timestamp log returns: '+str(corr_time.mean()))
    #print('Asset: Correlation between target and timestamp market log returns: '+str(corr_time2.mean()))
    # NOTE(review): the value printed under the "Beta" label is the mean of the
    # per-window correlations computed above, not a regression beta.
    print('Asset: Beta: '+str(corr_time3.mean()))
    #print('Asset: Correlation between target and timestamp residualisez log returns: '+str(corr_time4.mean()))
    
    """
    
    corr_time.plot();
    plt.xticks([])
    plt.ylabel("Correlation")
    plt.title("Correlation between target and timestamp log returns");
    plt.show();
    
    corr_time2.plot();
    plt.xticks([])
    plt.ylabel("Correlation")
    plt.title("Correlation between target and timestamp market log returns");
    plt.show();"""
    
    # plt.show() is commented out below, so every asset's curve is drawn on
    # the same axes.
    corr_time3.plot();
    plt.xticks([])
    plt.ylabel("Correlation")
    plt.title("Correlation between timestamp log return and timestamp market log return");
    #plt.show();
    
    """
    corr_time4.plot();
    plt.xticks([])
    plt.ylabel("Correlation")
    plt.title("Correlation between target and timestamp residualisez log returns");
    plt.show();"""

# Correlation

TO DO: over time

In [None]:
#create dataframe with returns for all assets
all_assets_2021 = pd.DataFrame([])
for asset_id, asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
  asset = train_df[train_df["Asset_ID"]==asset_id].set_index("timestamp")
  asset = asset.loc[totimestamp('01/01/2021'):totimestamp('01/05/2021')]
  asset = asset.reindex(range(asset.index[0],asset.index[-1]+60,60),method='pad')
  lret = log_return(asset.Close.fillna(0))[1:]
  all_assets_2021 = all_assets_2021.join(lret, rsuffix=asset_name, how="outer")
plt.imshow(all_assets_2021.corr());
plt.yticks(asset_details.Asset_ID.values, asset_details.Asset_Name.values);
plt.xticks(asset_details.Asset_ID.values, asset_details.Asset_Name.values, rotation='vertical');
plt.colorbar();