In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
import datetime
import scipy.stats as stats
from datetime import datetime

In [None]:
# Root folder of the competition data. File names are appended to it directly
# (e.g. path+"train.csv"), so the trailing slash is required.
path = '../input/g-research-crypto-forecasting/'

In [None]:
# IPython shell escape: list the files under `path` ({path} is interpolated from Python).
!ls {path}

### Dimensions of the datasets

Let's start loading datasets and explore the dimensions !

In [None]:
# Load the main training table from the competition folder.
df_train = pd.read_csv(f"{path}train.csv")

In [None]:
# Print (rows, columns) of the training set, then display its first rows.
print(df_train.shape)
df_train.head()

The meanings of the features included in the set are the following:
* timestamp: All timestamps are returned as second Unix timestamps (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiple of 60, indicating minute-by-minute data.
* Asset_ID: The asset ID corresponding to one of the cryptocurrencies (e.g. Asset_ID = 1 for Bitcoin). The mapping from Asset_ID to crypto asset is contained in asset_details.csv.
* Count: Total number of trades in the time interval (last minute).
* Open: Opening price of the time interval (in USD).
* High: Highest price reached during time interval (in USD).
* Low: Lowest price reached during time interval (in USD).
* Close: Closing price of the time interval (in USD).
* Volume: Quantity of asset bought or sold, displayed in base currency USD.
* VWAP: The average price of the asset over the time interval, weighted by volume. VWAP is an aggregated form of trade data.
* Target: Residual log-returns for the asset over a 15 minute horizon.

In [None]:
# Load the Asset_ID -> asset lookup table.
df_asset_details = pd.read_csv(f"{path}asset_details.csv")

In [None]:
# Show the table size, then every asset ordered by its Asset_ID.
print(df_asset_details.shape)
details_by_id = df_asset_details.sort_values(by="Asset_ID")
print(details_by_id)

In [None]:
# Asset names, in the row order of asset_details.csv; later cells iterate over this list.
assets = list(df_asset_details['Asset_Name'])

In [None]:
# Display the list of asset names collected above.
print(assets)

In [None]:
# Load the supplemental training file, report its size and preview it.
supp_csv = f"{path}supplemental_train.csv"
df_supp_train = pd.read_csv(supp_csv)
print(df_supp_train.shape)
df_supp_train.head()

Does this dataset merely contain additional/supplemental rows for the training set ? Let's check the timestamps.

In [None]:
# Latest minute in the training set vs. earliest minute in the supplemental set.
# Only the extreme epoch value is converted, rather than the whole column.
last_train_epoch = df_train['timestamp'].max()
first_supp_epoch = df_supp_train['timestamp'].min()
print(pd.to_datetime(last_train_epoch, unit='s'))
print(pd.to_datetime(first_supp_epoch, unit='s'))

It does not look like the records in the supplemental set come after the records in the training set in time.
Let's first explore what we have in the training set, then will come back to this !

In [None]:
# Load the example test file, report its size and preview it.
test_csv = f"{path}example_test.csv"
df_test = pd.read_csv(test_csv)
print(df_test.shape)
df_test.head()

Compared with the training set, the example test set has two additional columns, 'group_num' and 'row_id', and is missing the Target column.
Group_num column is a mystery to me for now. Let's take a look at the distinct values of this column.

In [None]:
# Distinct values present in the 'group_num' column of the example test set.
distinct_groups = set(df_test['group_num'].tolist())
print(distinct_groups)

I am removing the datasets other than the training set and the crypto-assets details set to avoid overusage of the memory.

Let us join the two datasets on the AssetID column.

In [None]:
# Attach the asset details to every training row by matching on Asset_ID.
# Asset_ID becomes the index of the combined frame.
df = (
    df_train
    .set_index('Asset_ID')
    .join(df_asset_details.set_index('Asset_ID'), how='left')
)

In [None]:
# 'Weight' (brought in from the asset details) is not used in this analysis, so drop it.
df = df.drop(columns=['Weight'])
print(df.shape)
df.head()

#### Checking of Missing values 

In [None]:
# Number of missing values in each column.
df.isna().sum()

Seems like there are no NULLs except in the Target column.

In [None]:
# Release the frames that are not used in the rest of the notebook.
del df_test, df_supp_train

### Statistical Analysis

Let us look at the distribution of the column for each of the crypto assets individually !

Let's check how daywise Open-Close is distributed for each crypto-asset ?

In [None]:
# Per-minute price move (close minus open).
df['Diff'] = df['Close'].sub(df['Open'])
# Convert the epoch-second timestamps to datetimes, then keep the calendar day separately.
df['timestamp'] = df['timestamp'].astype('datetime64[s]')
df['date'] = df['timestamp'].dt.date

In [None]:
# Preview the frame with the new 'Diff', 'timestamp' and 'date' columns.
df.head()

#### Daywise Close - Open for all assets

In [None]:
# Per asset and calendar day: the lowest Open and the highest Close.
# NOTE(review): this is min(Open) / max(Close), not the day's first Open and
# last Close -- confirm that this is the intended "daywise Close - Open".
daily_extremes = {'Open': ['min'], 'Close': ['max']}
grouped_df = df.groupby(['Asset_Name', 'date'], as_index=False).agg(daily_extremes)


In [None]:
# Inspect the column labels produced by the aggregation; later cells address
# them as tuples such as ('Close', 'max').
grouped_df.columns

In [None]:
# Daily spread between the highest Close and the lowest Open.
daily_max_close = grouped_df[('Close', 'max')]
daily_min_open = grouped_df[('Open', 'min')]
grouped_df['diff_val'] = daily_max_close - daily_min_open
grouped_df.head(10)

In [None]:
# One stacked panel per asset: density estimate of the daily 'diff_val'.
fig, ax = plt.subplots(nrows=14, ncols=1)

for row, asset in enumerate(assets):
    asset_rows = grouped_df[grouped_df['Asset_Name'] == asset]
    asset_rows['diff_val'].plot.density(
        figsize=(10, 30), linewidth=1.5, ax=ax[row], label=asset
    )
    panel = ax[row]
    panel.legend()
    panel.grid()


In [None]:
# Free the daily aggregate and the figure handles to save memory.
# The aggregate is named `grouped_df`; deleting the non-existent
# `df_grouped_df` raised a NameError and left the frame in memory.
del grouped_df
del fig, ax

##### Interpretation of the above charts:
* For Bitcoin, the daywise difference between Close and Open has more weight towards the positive side of the scale and reaches as far as +20,000. On the negative side of the scale, it takes values even lower than -5000. This range is wider than for any other asset.
* Ethereum Classic and Maker also show a high range of difference values and have a considerable amount of skew towards positive differences.

#### Distribution of number of trades

If the count column represents the trades made in a minute then let's look at the distribution of number of trades made on daily basis for all 14 cryptoassets.

In [None]:
# Total number of trades per asset and calendar day.
daily_trade_totals = {'Count': ['sum']}
grouped_df = df.groupby(['Asset_Name', 'date'], as_index=False).agg(daily_trade_totals)

In [None]:
# Preview the daily trade totals.
grouped_df.head()

In [None]:
# One stacked panel per asset: daily trade count over time.
fig, ax = plt.subplots(nrows=14, ncols=1)

for row, asset in enumerate(assets):
    asset_rows = grouped_df[grouped_df['Asset_Name'] == asset]
    asset_rows.plot(
        x='date', y=('Count', 'sum'), figsize=(15, 50),
        linewidth=1.5, ax=ax[row], label=asset
    )
    panel = ax[row]
    panel.legend(loc='upper right')
    panel.grid()

Interpretation:
The number of trades increased sharply at the beginning of 2021 compared to the levels before. In addition, for all 14 assets the count of trades decreased from the middle of 2021 onwards. It should be noted that we are talking about similarity in pattern here and not in count. The count varies from asset to asset. Bitcoin, Ethereum, and Ethereum Classic have more trades throughout the time covered by this set. 

### How Close varies in time for different assets ?

Let's investigate how the Close values varied over time for all 14 cryptocurrencies.

In [None]:
# One stacked panel per asset: Close price against timestamp.
fig, ax = plt.subplots(nrows=14, ncols=1)

for row, asset in enumerate(assets):
    asset_rows = df[df['Asset_Name'] == asset]
    asset_rows.plot(
        x='timestamp', y='Close', figsize=(15, 60),
        linewidth=1.5, ax=ax[row], label=asset
    )
    panel = ax[row]
    panel.legend(loc='upper right')
    panel.grid()

In [None]:
# Drop the figure handles now that the plots have been drawn.
del fig, ax

#### Interpretations:
First, these results are shown regardless of what the opening value was !
Next, notice the similarity in patterns on how the Close values for the all the assets have increased in the beginning of the year 2021 and then abated in the middle of the year. This is in line with the number of trades made for each asset. There is a strong correlation then (*we haven't looked at it yet but we should quantify the correlation !*)

####  Log returns for different assets

In [None]:
# Log-return helper, as shared in the Tutorial notebook.
def log_return(series, periods=1):
    """Return the log return of `series` over `periods` steps.

    Takes the natural log of every value and differences it against the value
    `periods` positions earlier, i.e. log(x[t]) - log(x[t - periods]).
    Positions with no earlier value to difference against come back as NaN.
    """
    log_prices = np.log(series)
    return log_prices.diff(periods=periods)

In [None]:
# Move Asset_ID out of the index and back into a regular column.
# Done in place; note that re-running this cell resets the index again.
df.reset_index(inplace = True)

In [None]:
# Make 'timestamp' the index so that the return series computed below are
# indexed by time. Done in place; the 'timestamp' column no longer exists
# afterwards, so this cell cannot be run twice in a row.
df.set_index('timestamp', inplace=True)

In [None]:
# Preview the frame now that it is indexed by timestamp.
df.head()

In [None]:
# Only minutes strictly after this calendar day are plotted.
# The cutoff is built from pandas because the import cells leave the name
# `datetime` bound to the datetime *class*, on which `datetime.date(2021, 1, 1)`
# raises a TypeError.
cutoff_date = pd.Timestamp(2021, 1, 1).date()

# The date filter does not depend on the asset, so apply it once up front and
# keep only the two columns the loop needs.
recent_close = df.loc[df['date'] > cutoff_date, ['Asset_Name', 'Close']]

# One stacked panel per asset; squeeze=False keeps `ax` indexable even for a
# single asset.
fig, ax = plt.subplots(nrows=len(assets), ncols=1, figsize=(15, 60), squeeze=False)

for asset_ax, asset in zip(ax[:, 0], assets):
    asset_close = recent_close.loc[recent_close['Asset_Name'] == asset, 'Close']
    # The first log return is NaN (nothing to difference against), so skip it.
    log_sample = log_return(asset_close)[1:]
    log_sample.plot(linewidth=1.5, ax=asset_ax, label=asset)

    asset_ax.legend(loc='upper right')
    asset_ax.grid()

# Free the filtered copy once all panels are drawn.
del recent_close

#### Interpretation:
The above charts show the log returns for all 14 assets. All of them are centered on and fluctuate around zero across the entire time range (*makes sense!*). A probability distribution plot of the log returns would probably have been more informative here.

#### Correlation between Cryptocurrencies:
We first compute the correlation between all the assets for the samples of trade or minutes after Jan 1, 2021.

In [None]:
# Build one frame of per-minute log returns after Jan 1, 2021, with one column
# per asset (named after the asset, in the order of `assets`).

# The cutoff is built from pandas because the import cells leave the name
# `datetime` bound to the datetime *class*, on which `datetime.date(2021, 1, 1)`
# raises a TypeError.
cutoff_date = pd.Timestamp(2021, 1, 1).date()

# Filter by date once (it does not depend on the asset) and keep only the
# columns needed below.
recent_close = df.loc[df['date'] > cutoff_date, ['Asset_Name', 'Close']]

returns_by_asset = {}
for asset_name in assets:
    asset_close = recent_close.loc[recent_close['Asset_Name'] == asset_name, 'Close']
    # Missing prices are deliberately left as NaN: filling them with 0 would
    # turn them into log(0) = -inf and distort the returns around every gap.
    # The first return is always NaN, so drop it.
    returns_by_asset[asset_name] = log_return(asset_close)[1:]

# Align all assets on the union of their timestamps in a single step instead
# of growing the frame with one join per asset.
# Assumes timestamps are unique within each asset -- TODO confirm.
assets_samples = pd.concat(returns_by_asset, axis=1)

del recent_close, returns_by_asset
    

In [None]:
# Label each return column with its asset name. This relies on the columns
# having been added in the same order as `assets`.
assets_samples.columns = assets
assets_samples.head()

In [None]:
# Pairwise Pearson correlation of the asset returns, shown as a heat-mapped table.
corr = assets_samples.corr(method='pearson')
corr.style.background_gradient(cmap='coolwarm')

As stated above, these correlation numbers are computed over a sample of time series data after the date Jan 1, 2021. However, in such a dataset, correlation between the columns / assets can vary in time.
Let us check the consistency in correlation in time for the assets that have high correlation between them. We set the threshold of 0.75 and shortlist the pairs that have correlation number higher than this.

* Bitcoin Cash - Litecoin
* Bitcoin - Litecoin
* Bitcoin - Ethereum
* Ethereum - Litecoin

In [None]:
# Asset pairs whose correlation is tracked over time in the next cells.
pairs_assets = [
    ('Bitcoin Cash', 'Litecoin'),
    ('Bitcoin', 'Litecoin'),
    ('Bitcoin', 'Ethereum'),
    ('Ethereum', 'Litecoin'),
]

In [None]:
# Display the shortlisted asset pairs.
print(pairs_assets)

In [None]:
# Correlation between the log returns of each shortlisted pair, computed over
# consecutive, fixed-width windows of time.

# Width of one correlation window, in seconds.
CORR_WINDOW_SECONDS = 1000 * 20

good_pairs = len(pairs_assets)

# One stacked panel per pair; squeeze=False keeps `ax` indexable even when
# there is only a single pair.
fig, ax = plt.subplots(nrows=good_pairs, ncols=1, figsize=(15, 25), squeeze=False)

for pair_ax, pair in zip(ax[:, 0], pairs_assets):
    first_name, second_name = pair

    # Per-minute log returns of each asset; the first value is always NaN.
    first_close = df.loc[df['Asset_Name'] == first_name, 'Close']
    lret_df_1 = log_return(first_close)[1:].rename('lret_df_1')

    second_close = df.loc[df['Asset_Name'] == second_name, 'Close']
    lret_df_2 = log_return(second_close)[1:].rename('lret_df_2')

    # Align the two series on their timestamps.
    two_assets = pd.concat([lret_df_1, lret_df_2], axis=1)

    # Bucket rows by time. The previous version called set_index('timestamp')
    # without keeping the result, so rows were bucketed by row number instead.
    # Epoch seconds are derived by subtraction so the result does not depend
    # on the internal resolution of the datetime index.
    epoch_seconds = (two_assets.index - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
    window_id = epoch_seconds // CORR_WINDOW_SECONDS

    # Per-window correlation matrix -> keep only the lret_df_1 / lret_df_2 entry.
    corr_time = two_assets.groupby(window_id).corr().loc[:, "lret_df_1"].loc[:, "lret_df_2"]

    # Put the start time of each window on the x-axis so that it really is a timestamp.
    corr_time.index = pd.to_datetime(corr_time.index * CORR_WINDOW_SECONDS, unit='s')

    corr_time.plot(linewidth=1.5, ax=pair_ax, label=str(pair), color='r')
    pair_ax.legend(loc='upper left')
    pair_ax.set_xlabel('TimeStamp')
    pair_ax.set_ylabel("Correlation Coefficient")
    pair_ax.set_title("Correlation between the two assets " + str(pair))
    pair_ax.grid()

    # Release the per-pair intermediates before the next iteration.
    del first_close, second_close, lret_df_1, lret_df_2, two_assets, corr_time

Interestingly, the pairs have different correlation values over time, but they follow a very similar pattern. We can:
* Make the granularity at which the timestamps are grouped finer and check to what extent this still holds.
* Decrease the threshold on the correlation computed on the latest sample to include more pairs in the list and see how the pattern varies.
* In the time window where these assets-pairs have low correlations, which other assets show higher correlation to these assets.