In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let us start by looking at how the target values evolve over time for each currency.

In [None]:
import os
import numpy as np
import pandas as pd
import gc

# Column name -> dtype for train.csv; the keys also select which columns are read.
dtypes = dict(
    timestamp=np.int64,
    Asset_ID=np.int8,
    Count=np.int32,
    Open=np.float64,
    High=np.float64,
    Low=np.float64,
    Close=np.float64,
    Volume=np.float64,
    VWAP=np.float64,
    Target=np.float64,
)

# Per-asset metadata, joined onto the training rows below.
details = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv')

data = pd.read_csv(
    '/kaggle/input/g-research-crypto-forecasting/train.csv',
    usecols=list(dtypes),
    dtype=dtypes,
)
# 'timestamp' is interpreted as seconds since the Unix epoch.
data['Time'] = pd.to_datetime(data['timestamp'], unit='s')

# Left join keeps every training row and attaches the columns of `details`.
data = data.merge(details, how='left', on='Asset_ID')

print(data.head())
print(details)

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one panel per asset.
# Tot and Cols are known
Tot = len(details.Asset_ID)
Cols = 4

# Compute Rows required: ceiling division, so a partially filled last row adds
# exactly one row. (Adding the remainder `Tot % Cols` itself would allocate
# more rows than needed and shrink every panel.)
Rows = -(-Tot // Cols)

# Create a Position index (matplotlib subplot positions are 1-based)
Position = range(1, Tot + 1)

# Create main figure; no explicit figure number, so an earlier figure is never
# reused by accident.
fig = plt.figure(figsize=(20, 20))

# Iterate over the values rather than indexing details by label, so the loop
# does not depend on `details` having a 0..n-1 index.
for k, (asset_id, asset_name) in enumerate(zip(details.Asset_ID, details.Asset_Name)):
    # add every single subplot to the figure with a for loop
    tmp_df = data[data.Asset_ID == asset_id]
    ax = fig.add_subplot(Rows, Cols, Position[k])
    ax.plot(tmp_df.Time, tmp_df.Target)
    ax.set_title(asset_name)

plt.show()

# Release the last per-asset slice.
del tmp_df

In my opinion, we see very typical patterns for financial data here. Time series plots of log-returns typically look similar across assets: the arithmetic mean is close to zero, while we observe volatility clustering, which means that volatility is higher in certain time periods than in others. Let us take a look at the distributions of the target variable.

In [None]:
import matplotlib.pyplot as plt

# Subplots are organized in a Rows x Cols grid, one histogram per asset.
# Tot and Cols are known
Tot = len(details.Asset_ID)
Cols = 4

# Compute Rows required: ceiling division, so a partially filled last row adds
# exactly one row. (Adding the remainder `Tot % Cols` itself would allocate
# more rows than needed and shrink every panel.)
Rows = -(-Tot // Cols)

# Create a Position index (matplotlib subplot positions are 1-based)
Position = range(1, Tot + 1)

# Create main figure; no explicit figure number, so an earlier figure is never
# reused by accident.
fig = plt.figure(figsize=(20, 20))

# Iterate over the values rather than indexing details by label, so the loop
# does not depend on `details` having a 0..n-1 index.
for k, (asset_id, asset_name) in enumerate(zip(details.Asset_ID, details.Asset_Name)):
    # add every single subplot to the figure with a for loop
    tmp_df = data[data.Asset_ID == asset_id]
    ax = fig.add_subplot(Rows, Cols, Position[k])
    # Missing targets are dropped explicitly before binning.
    ax.hist(tmp_df.Target.dropna(), bins = 50)
    # Zoom in on the centre of the distribution.
    ax.set_xlim(-0.1, 0.1)
    ax.set_title(asset_name)

plt.show()

# Release the last per-asset slice.
del tmp_df

Most of the currencies exhibit rather symmetric distributions, yet it looks like all currencies exhibit excess kurtosis, which means that extreme events are more likely than under the assumption of a normal distribution. Now let us check whether the target values are correlated at each point in time.

In [None]:
# Sorted union of every timestamp seen for any asset; used as a common index.
all_timestamps = np.sort(data['timestamp'].unique())
targets = pd.DataFrame(index=all_timestamps)

for i, id_ in enumerate(details.Asset_ID):
    asset_name = details.Asset_Name[i]
    asset = data[data.Asset_ID == id_].set_index(keys='timestamp')
    # Align the close prices to the common index; timestamps the asset lacks become NaN.
    price = pd.Series(index=all_timestamps, data=asset['Close'])
    # Return from the close one row ahead to the close sixteen rows ahead.
    close_16_ahead = price.shift(periods=-16)
    close_1_ahead = price.shift(periods=-1)
    targets[asset_name] = (close_16_ahead / close_1_ahead) - 1

print(targets.head())

import seaborn as sns

# Pairwise correlation between the per-asset target columns, drawn as a heatmap.
target_correlations = targets.corr()
sns.heatmap(target_correlations)
plt.show()

We can see that the target values of the currencies exhibit high correlations in the cross-section. Unfortunately, this does not help us when making future predictions, as information in the cross-section is contemporaneous and not predictive. Now, let us try to find out how systematic market movements of the currencies behave. To do so, I simply normalize the "Close" values of each currency and calculate the naive (equally weighted) mean over all currencies that are available at a given point in time. Finally, I visualize this. Obviously, this is a rather rough, quick-and-dirty method for checking systematic market behavior.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# One column of close prices per asset, aligned on the common timestamp index.
closing_prices = pd.DataFrame(index=all_timestamps)

for asset_id, asset_name in zip(details.Asset_ID, details.Asset_Name):
    asset = data[data.Asset_ID == asset_id].set_index(keys='timestamp')
    # Timestamps at which this asset has no row become NaN.
    price = pd.Series(index=all_timestamps, data=asset['Close'])
    closing_prices[asset_name] = price

# Rescale every column to [0, 1]; MinMaxScaler leaves missing values as NaN.
min_max_scaler = MinMaxScaler()
closing_prices_ = min_max_scaler.fit_transform(closing_prices)

# Equally weighted mean over the assets that actually have a price at each
# timestamp. A plain np.mean would return NaN for every timestamp at which at
# least one asset is missing, blanking out most of the curve.
market_mean = np.nanmean(closing_prices_, axis = 1)

plt.plot(closing_prices.index, market_mean)
plt.xlabel('timestamp')
plt.ylabel('mean of min-max scaled Close')
plt.title('Equally weighted mean of normalized closing prices')
plt.show()

I would say that we can observe some systematic behavior that looks similar to economic cycles. This information may be further refined and used when splitting the data. Next, let us use Bitcoin as an example to check the raw data for possible predictive power.

In [None]:
# Asset_ID of the asset used as the worked example (Bitcoin, per the text above).
BTC_ASSET_ID = 1

# Filter and re-index in one chained expression: this builds a fresh frame
# instead of mutating a filtered slice of `data` in place.
btc = data[data.Asset_ID == BTC_ASSET_ID].set_index('timestamp')
btc.head()

A look at the subplots below indicates that all variables except the target variable are non-stationary, which is bad for building models that generalize over time.

In [None]:
# One stacked panel per raw column of the Bitcoin frame.
raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target']
btc.loc[:, raw_columns].plot(subplots = True)

So, I tried first differences, but even those values do not seem to be stationary, so I directly tried the first log-differences below. Strict stationarity means the (unconditional) distribution stays the same over time for a given variable. Weak stationarity means at least the mean, variance and autocovariance are the same over time.

In [None]:
# Columns whose first log-differences are examined.
feature_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']

# The logarithm is only defined for strictly positive values. Mask everything
# else to NaN first: log(0) would be -inf, and differencing it yields +/-inf or
# NaN values that poison the rolling statistics and correlations computed later.
positive_features = btc[feature_columns].where(btc[feature_columns] > 0)

# Row-to-row difference of the logs; the first row is NaN by construction.
first_log_differences = np.log(positive_features).diff()
first_log_differences.plot(subplots = True)

In the plots below, I want to check whether the mean and the standard deviation are approximately constant over time. For most of the time this seems to be the case, but there are sudden breaks at certain points in time, which could really have a negative effect if we used these variables for prediction.

In [None]:
# Rolling mean of each log-difference column over a 7200-row window, one panel per column.
# NOTE(review): 7200 rows would be 5 days if rows are one minute apart -- confirm.
rolling_mean = first_log_differences.rolling(window=7200).mean()
rolling_mean.plot(subplots = True)

In [None]:
# Rolling standard deviation of each log-difference column over a 7200-row window,
# one panel per column.
# NOTE(review): 7200 rows would be 5 days if rows are one minute apart -- confirm.
rolling_std = first_log_differences.rolling(window=7200).std()
rolling_std.plot(subplots = True)

Let us check if the feature variables (here the log-differences) exhibit any correlation to the target values.

In [None]:
# Put the target next to the log-difference features, matched on the shared index.
target_frame = btc[['Target']]
new_data = pd.merge(
    target_frame,
    first_log_differences,
    left_index = True,
    right_index = True,
)

# Same-row correlation between all columns, drawn as a heatmap.
feature_target_corr = new_data.corr()
sns.heatmap(feature_target_corr)
plt.show()

Unfortunately, this does not seem to be the case. So, I further check whether there is some correlation between past values of the feature variables and the current target variable. To do so, I write a function which approximately calculates a lagged cross-correlation. To be honest, I do not know if this is actually defined somewhere in statistics; however, what I want to find out is whether there is any (linear) relationship between past observations of the feature variables and the current target observation.

In [None]:
def cross_autocorr(df, targetname, varname, lag = 1):
    """Approximate correlation between a target and an earlier value of a variable.

    Rows in which either of the two columns is NaN are removed first. The
    target at row position ``i`` is then paired with the variable at row
    position ``i - lag`` of the remaining rows, so the lag counts rows, not
    elapsed time.

    The means and standard deviations are taken over the full cleaned columns.
    The standard deviation is the pandas default (sample, ``ddof=1``), while the
    cross-product is averaged with ``np.mean``, so the result is only an
    approximation of a correlation coefficient.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns. It is not modified.
    targetname : str
        Column holding the target.
    varname : str
        Column holding the lagged variable.
    lag : int, default 1
        Non-negative number of rows by which the variable precedes the target.

    Returns
    -------
    float
        The approximate lagged correlation, or NaN if fewer than ``lag + 1``
        complete rows are available.

    Raises
    ------
    ValueError
        If ``lag`` is negative.
    """
    if lag < 0:
        raise ValueError("lag must be a non-negative integer")

    # Work on a NaN-free copy of just the relevant columns: the caller's frame
    # stays untouched, and NaNs in unrelated columns do not discard usable rows.
    # dict.fromkeys de-duplicates while keeping order (targetname == varname).
    columns = list(dict.fromkeys([targetname, varname]))
    pair = df[columns].dropna()

    n_pairs = len(pair) - lag
    if n_pairs <= 0:
        return np.nan

    target = pair[targetname]
    variable = pair[varname]

    # Slice by explicit length instead of [:-lag], which is empty for lag == 0.
    target_dev = target.values[lag:] - target.mean()
    variable_dev = variable.values[:n_pairs] - variable.mean()

    return np.mean(target_dev * variable_dev) / (target.std() * variable.std())

# For every feature, evaluate cross_autocorr against 'Target' at lags 1..29.
feature_names = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
lags = range(1, 30, 1)

cross_autocorrs = {
    var: [cross_autocorr(new_data, 'Target', var, lag = l) for l in lags]
    for var in feature_names
}

# Rows are lags, columns are features.
sns.heatmap(pd.DataFrame(cross_autocorrs, index = lags))

As we all may have guessed, this isn't the case! So, we are forced to create features that have at least a little correlation with the target variable. Otherwise, even the best and most sophisticated model will fail to learn how to make good predictions. So, good luck everyone in finding those features, and if you find them, let me know ;)