# G-Research Crypto Forecasting Data Wrangling and Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time

## Data Wrangling

### Get the Data

The available train data includes the test data, which leads to astounding scores on the public leaderboard. In time series, the phenomenon whereby information about the future leaks into the analysis or training process is called a **lookahead**. A lookahead is a way, through data, to find out something about the future earlier than you ought to know it. Information about what will happen in the future propagates back in time in our modelling and affects how our model behaves earlier in time (it generally tends to fake an improvement in model performance). Here, data from *2021-06-13* onwards is leaky, i.e. it is data about the future that should not be used for anything other than testing models or assumptions. The `read_csv_strict()` function designed by [dataista0](https://www.kaggle.com/julian3833) helps avoid the pitfall of using future data for training models.

In [None]:
path = '../input/g-research-crypto-forecasting/'


def read_csv_strict(file_name='train.csv', cutoff='2021-06-13 00:00:00'):
    """Load a competition CSV and keep only rows strictly before ``cutoff``.

    A ``datetime`` column is derived from the ``timestamp`` column
    (interpreted as seconds since the Unix epoch) and used for the filter,
    so rows at or after the cutoff never reach the caller.

    Args:
        file_name: Name of the CSV file, appended to the module-level ``path``.
        cutoff: Exclusive upper bound compared against ``datetime``.
            Defaults to the date after which the data is considered leaky.

    Returns:
        The loaded frame, with the extra ``datetime`` column, restricted to
        rows whose ``datetime`` is strictly earlier than ``cutoff``.
    """
    df = pd.read_csv(path + file_name)
    # The conversion does not depend on the asset, so do it once for the
    # whole column instead of once per asset; this also creates the column
    # when the file holds no rows at all.
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    return df[df['datetime'] < cutoff]

In [None]:
# Load train data and asset details
# train_data: train.csv as returned by read_csv_strict(), i.e. with the date filter applied
# asset_details: the asset_details.csv table, read without any filtering
train_data = read_csv_strict()
asset_details = pd.read_csv(path+'asset_details.csv')

In [None]:
# Preview the first rows of the training frame
train_data.head()

In [None]:
# Preview the first rows of the asset details table
asset_details.head()

In [None]:
# Attach the asset details to every training row by joining on Asset_ID
train_data = train_data.merge(asset_details, on='Asset_ID')
# Reorder the columns: the first one, then the last and second-to-last
# (the two trailing columns move to the front), then everything from
# position 2 up to that trailing pair. The column at position 1 is left out.
merged_cols = list(train_data.columns)
train_data = train_data[[merged_cols[0], merged_cols[-1], merged_cols[-2], *merged_cols[2:-2]]]

### Looking for missing data

In [None]:
# By asset, which range of data do we have in the dataset (in minutes)?
# How many data points are recorded for each asset (1 data point record = 1 minute)?
# Every statistic is computed from the asset's own timestamps, so the
# start/end dates and the span differ from one asset to the next.
ts_by_asset = train_data.groupby('Asset_Name')['timestamp'].agg(['min', 'max', 'count'])
grouped_stats = pd.DataFrame({
    'start_date': pd.to_datetime(ts_by_asset['min'], unit='s'),
    'end_date': pd.to_datetime(ts_by_asset['max'], unit='s'),
    # timestamps are in seconds: whole minutes between first and last record
    'nb_minutes': (ts_by_asset['max'] - ts_by_asset['min']) // 60,
    'nb_datapoints': ts_by_asset['count'],
})
# A gap-free series spanning nb_minutes holds nb_minutes + 1 records
grouped_stats['missing_datapoints'] = grouped_stats['nb_minutes'] - grouped_stats['nb_datapoints'] + 1

In [None]:
# Display the per-asset statistics table
grouped_stats

In [None]:
# Data missing for many assets, let's see if it's really true
# For each asset, print the two most frequent spacings between consecutive
# timestamps (taken in row order).
for asset in train_data['Asset_Name'].unique():
    asset_rows = train_data.loc[train_data['Asset_Name'] == asset]
    stamps = asset_rows.set_index('timestamp').index
    spacing = stamps[1:] - stamps[:-1]
    print(asset, spacing.value_counts().head(2))
    print()

### Data Imputation

In [None]:
# Let's fill missing asset values using forward fill
# value imputed is the last valid value, this prevents any lookahead (that would have been introduced by backward fill for instance)
asset_frames = []
for asset in train_data['Asset_Name'].unique():
    df_asset = train_data[train_data['Asset_Name']==asset].set_index('timestamp')
    # Rebuild a gap-free grid from the first to the last timestamp
    # (timestamps are in seconds, hence the step of 60 for one minute)
    df_asset = df_asset.reindex(range(df_asset.index[0], df_asset.index[-1]+60, 60), method='ffill')
    if 'datetime' in df_asset.columns:
        # Forward fill copied the previous row's datetime into the inserted
        # rows; rederive it from the index so it matches each row's timestamp
        df_asset['datetime'] = pd.to_datetime(df_asset.index, unit='s')
    print(asset, (df_asset.index[1:]-df_asset.index[:-1]).value_counts().head()) # Are gaps filled correctly?
    asset_frames.append(df_asset)
# Concatenate once at the end instead of re-copying the growing frame on every iteration
train = pd.concat(asset_frames) if asset_frames else pd.DataFrame([])
train.reset_index(level=0, inplace=True)

## Exploratory Data Analysis

### Classical Methods

First we'll approach this data set with classical, non time series-specific methods.

In [None]:
# Let's visualise some of the data at hand
# One frame per asset of interest (Bitcoin, Monero, Ethereum), each indexed by timestamp
btc, mnr, eth = (
    train[train['Asset_Name'] == asset_name].set_index('timestamp')
    for asset_name in ('Bitcoin', 'Monero', 'Ethereum')
)

In [None]:
# Count NaN values per column for each asset, then report only the columns that have some
btc_nans = btc.isna().sum()
mnr_nans = mnr.isna().sum()
eth_nans = eth.isna().sum()
print(f'Bitcoin NaN values: {btc_nans[btc_nans!=0]}')
print(f'Monero NaN values: {mnr_nans[mnr_nans!=0]}')
print(f'Ethereum NaN values: {eth_nans[eth_nans!=0]}')

print('\nDropping NaN values...')
# In place on each frame: remove rows containing NaN, then turn the
# timestamp index back into a regular column
for df in (btc, mnr, eth):
    df.dropna(inplace=True)
    df.reset_index(inplace=True)

print('...done.')

In [None]:
# log of Close values is chosen because of the different price scales
# Each series is drawn against its own timestamps (converted from seconds)
# rather than its row position, so the three assets share a real time axis.
f = plt.figure(figsize=(10,7))
for label, frame in (('BTC', btc), ('MNR', mnr), ('ETH', eth)):
    plt.plot(pd.to_datetime(frame['timestamp'], unit='s'), np.log(frame['Close']), label=label)
plt.title('Closing asset value over time')
plt.xlabel('Time')
plt.ylabel('log(close)')
plt.legend();

In [None]:
# Are percentage changes in prices kind of correlated as would Close values be ?
# Each series is drawn against its own timestamps (converted from seconds)
# rather than its row position, so the three assets share a real time axis.
f = plt.figure(figsize=(10,7))
for label, frame in (('BTC', btc), ('MNR', mnr), ('ETH', eth)):
    plt.plot(pd.to_datetime(frame['timestamp'], unit='s'), frame['Close'].pct_change(), label=label)
plt.title('Percentage changes in asset closing values over time')
plt.xlabel('Time')
plt.ylabel('Close % change')
plt.legend();

In [None]:
# Histogram of Close values for each asset
# The three log(close) histograms are drawn one after another on the same figure
f = plt.figure(figsize=(10,7))
for label, frame in (('BTC', btc), ('MNR', mnr), ('ETH', eth)):
    log_close = np.log(frame['Close'])
    log_close.hist(label=label)
plt.xlabel('log(close)')
plt.ylabel('Count')
plt.title('Distribution of asset closing values')
plt.legend()
plt.grid(False);

In [None]:
# Histogram of Close value changes for each asset
# Semi-transparent bars so the overlaid distributions stay visible
f = plt.figure(figsize=(10,7))
for label, frame, opacity in (('BTC', btc, 0.5), ('MNR', mnr, 0.3), ('ETH', eth, 0.3)):
    returns = frame['Close'].pct_change()
    returns.hist(alpha=opacity, label=label)
plt.xlabel('Close % change')
plt.ylabel('Count')
plt.title('Distribution of percentage changes in closing asset values')
plt.legend()
plt.grid(False);

In [None]:
# So apparently % changes are very very centered around 0...
# Maybe it's due to the recording resolution, and such changes would appear higher with weekly sampled data

In [None]:
# Dataset for correlations between BTC and ETH
# Inner join on datetime; columns present in both frames are suffixed by
# merge() itself with _BTC / _ETH, so no name is rewritten by substring.
btc_eth = pd.merge(btc, eth, on='datetime', how='inner', suffixes=('_BTC', '_ETH'))
# Keep a single timestamp column (the BTC one) under its plain name
btc_eth.drop('timestamp_ETH', axis=1, inplace=True)
btc_eth.rename(columns={'timestamp_BTC': 'timestamp'}, inplace=True)

In [None]:
# Scatter plot of the row-to-row % change in close value: BTC on x, ETH on y
btc_returns = btc_eth['Close_BTC'].pct_change()
eth_returns = btc_eth['Close_ETH'].pct_change()
f = plt.figure(figsize=(10,7))
plt.scatter(x=btc_returns, y=eth_returns)
plt.xlabel('BTC close value % change')
plt.ylabel('ETH close value % change')
plt.title('ETH close value vs. BTC close value.');

In [None]:
# ETH and BTC seem to be very positively linearly correlated...
# Correlation matrix of the two closing-price columns (price levels, not % changes)
btc_eth[['Close_BTC', 'Close_ETH']].corr()

In [None]:
# ...but this actually doesn't teach us much.
# Indeed, a positive linear correlation reflects the fact that when the BTC close value increases, so does the ETH close value, and likewise when they decrease...
# ...but by the time BTC has increased, ETH has already increased with it: there is no predictive information here

In [None]:
# What about the correlation between percentage change in ETH close value and percentage change of the preceding record in BTC close value?
# shift(-1) moves each ETH change up by one row, so every point pairs the
# BTC change at one record with the ETH change at the following record.
f = plt.figure(figsize=(10,7))
plt.scatter(x=btc_eth['Close_BTC'].pct_change(), y=btc_eth['Close_ETH'].pct_change().shift(-1))
plt.xlabel('Lagged BTC close value % change')
plt.ylabel('ETH close value % change')
plt.title('ETH close value vs. BTC close value.');