In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install plotly

# G-Research Crypto Forecasting

Cryptocurrencies are very popular but have the disadvantage of being more volatile than classical stocks. The first cryptocurrency, Bitcoin, was created in 2008 by Satoshi Nakamoto as a decentralized system based on blockchain technology. In the following years, the number of other cryptocurrencies exploded. The expansion and democratization of these methods have opened new horizons for the stock market.


In this Kaggle competition, the aim is to predict trends in cryptocurrencies using the most well-known currencies. It is important to keep in mind that forecasting the stock market is a hard task, and forecasting trends in cryptocurrencies is even harder.


# Datasets


In this competition, 5 files are available :


A first train set,

A supplementary train set,

An asset set with information about cryptocurrencies,

An example test,

An example test submission.

Only three datasets (the two train sets & the asset set) were used in this exploration.


Configuration and Utils

In [None]:
# system libraries
import warnings
# Ignore every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
# FutureWarning is already covered by the blanket filter above.
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
# Suppress all logging calls of severity CRITICAL and below.
logging.disable(logging.CRITICAL)
import gc

# graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from functools import reduce
from datetime import datetime
import time

In [None]:
# Column dtypes handed to pd.read_csv for the price, volume and target columns.
#
# Keys follow the CSV header spelling (capitalised, as selected later in the
# notebook) so that the downcast is actually applied.
# NOTE(review): read_csv appears to skip dtype keys that match no column, which
# means the previous lowercase keys never took effect - confirm against the
# pandas documentation.
#
# float32 is used instead of float16: float16 cannot represent values above
# 65504 and keeps only about three significant digits, which is not enough for
# prices in the tens of thousands of USD or for large traded volumes.
dtype_dict = {"Open": "float32",
              "Close": "float32",
              "High": "float32",
              "Low": "float32",
              "Volume": "float32",
              "VWAP": "float32",
              "Target": "float32"}

In [None]:
def crypto_df(asset_id, data):
    """Return the rows of one asset, indexed by timestamp.

    Adapted from https://www.kaggle.com/odins0n/g-research-plots-eda

    Parameters
    ----------
    asset_id : value compared against the ``asset_name`` column (despite the
        parameter name, the match is on the name column, not on a numeric ID).
    data : DataFrame with at least ``asset_name`` and ``timestamp`` columns;
        ``timestamp`` holds epoch seconds. It is not modified.

    Returns
    -------
    A new DataFrame restricted to the requested asset, whose index is the
    ``timestamp`` column converted to datetimes.
    """
    is_asset = data["asset_name"] == asset_id
    return (
        data.loc[is_asset]
        .reset_index(drop=True)
        .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="s"))
        .set_index("timestamp")
    )


def candelstick_chart(data, title):
    """Build a Plotly candlestick figure from an OHLC frame.

    Adapted from https://www.kaggle.com/odins0n/g-research-plots-eda

    Parameters
    ----------
    data : frame with 'open', 'high', 'low' and 'close' columns; its index is
        used for the x axis.
    title : text inserted at the start of the chart title.

    Returns
    -------
    The figure object; it is not displayed here, the caller calls ``.show()``.
    """
    ohlc_trace = go.Candlestick(
        x=data.index,
        open=data['open'],
        high=data['high'],
        low=data['low'],
        close=data['close'],
    )
    figure = go.Figure(data=[ohlc_trace])

    # Centre the title near the top of the figure.
    title_layout = dict(
        text='{:} Candelstick Chart'.format(title),
        y=0.90,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )

    figure.update_xaxes(title_text='Time', rangeslider_visible=False)
    figure.update_layout(title=title_layout)
    figure.update_yaxes(title_text='Price in USD', ticksuffix='$')
    return figure


def vol_traded(data, title, color):
    """Build a Plotly area chart of the traded volume over time.

    Adapted from https://www.kaggle.com/odins0n/g-research-plots-eda

    Parameters
    ----------
    data : frame with a 'volume' column; its index is used for the x axis.
    title : text inserted at the start of the chart title.
    color : line colour of the area trace.

    Returns
    -------
    The figure object (not displayed here).
    """
    area = px.area(data_frame=data,
                   x=data.index,
                   y="volume")
    area.update_traces(line_color=color)
    area.update_xaxes(
        title_text='Time',
        rangeslider_visible=False)
    # The plotted column is the traded volume (asset units), not the number of
    # trades (that is the separate `count` column), so label the axis accordingly.
    area.update_yaxes(title_text='Volume traded every minute (asset units)')
    area.update_layout(showlegend=True,
                       title={
                           'text': '{:} Volume Traded'.format(title),
                           'y': 0.94,
                           'x': 0.5,
                           'xanchor': 'center',
                           'yanchor': 'top'})
    return area

# 1) Preprocessing

In [None]:
# Load the two train sets with the shared dtype mapping, then the asset metadata.
df, df_sup = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ("../input/g-research-crypto-forecasting/train.csv",
                     "../input/g-research-crypto-forecasting/supplemental_train.csv")
)
asset = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv")

# Preview the main train set.
df.head()

In [None]:
# (rows, columns) of the main train set, before the supplemental data is added.
df.shape

Columns of the dataset :


* timestamp - A timestamp for the minute covered by the row.
* Asset_ID - An ID code for the cryptoasset.
* Count - The number of trades that took place this minute.
* Open - The USD price at the beginning of the minute.
* High - The highest USD price during the minute.
* Low - The lowest USD price during the minute.
* Close - The USD price at the end of the minute.
* Volume - The number of cryptoasset units traded during the minute.
* VWAP - The volume weighted average price for the minute.
* Target - 15 minute residualized returns.

In [None]:
# Stack the main and the supplemental train sets into a single frame.
# pd.concat replaces DataFrame.append, which is deprecated and no longer exists
# in pandas 2.x. ignore_index=True rebuilds a fresh 0..n-1 index, so the result
# cannot contain duplicate labels and no integrity check is needed.
df = pd.concat([df, df_sup], ignore_index=True)

In [None]:
# (rows, columns) once the supplemental train set has been appended.
df.shape

In [None]:
# Attach each asset's name, keep only the columns used below (the name replaces
# the numeric ID) and lower-case the column labels.
df = (
    pd.merge(df, asset, on="Asset_ID")
    .loc[:, ["timestamp", "Asset_Name", "Count",
             "Open", "High", "Low",
             "Close", "Volume", "VWAP",
             "Target"]]
    .rename(columns=str.lower)
)
df.head()

In [None]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

# 2) Exploratory Analysis

a) Candlestick representation

In this part, the data is visualised with candlestick graphs. This kind of graph shows the open, close, high and low values of a currency over a given period of time. Here, the last 120 minutes are represented. To build the graphs, a dataframe was created for each currency; the Plotly library was then used to display them.

In [None]:
# One timestamp-indexed frame per currency of interest.
btc, eth, iota = (
    crypto_df(asset_name, data=df)
    for asset_name in ("Bitcoin", "Ethereum", "IOTA")
)

In [None]:
# Candlesticks for the last 120 rows (one row per minute) of Bitcoin.
btc_plot = candelstick_chart(btc.tail(120), title="Bitcoin")
btc_plot.show()

In [None]:
# Candlesticks for the last 120 rows (one row per minute) of Ethereum.
eth_plot = candelstick_chart(eth.tail(120), title="Ethereum")
eth_plot.show()

In [None]:
# Candlesticks for the last 120 rows (one row per minute) of IOTA.
iota_plot = candelstick_chart(iota.tail(120), title="IOTA")
iota_plot.show()

Visually, we can conclude that some currencies are more stable than others. The ups and downs seem to follow the same pattern, but the amplitudes vary. For instance, IOTA shows large differences between the OHLC values.


b) Volume exchange

Second visualisation is about volume traded during the last 60 minutes.


Bitcoin

In [None]:
# Volume over the last 60 rows of the Bitcoin frame.
vol_traded(btc.tail(60), title="Bitcoin", color="Blue")

In [None]:
# Volume over the last 60 rows of the Ethereum frame.
vol_traded(eth.tail(60), title="Ethereum", color="Green")

In [None]:
# Volume over the last 60 rows of the IOTA frame.
vol_traded(iota.tail(60), title="IOTA", color="Green")

c) Central tendency analysis

In [None]:
# Per-asset mean of every remaining column. This includes the raw `timestamp`
# column, whose mean is not a meaningful statistic.
df.groupby("asset_name").mean()

In [None]:
# Per-asset median of every remaining column (the raw `timestamp` column included).
df.groupby("asset_name").median()

In [None]:
# Per-asset standard deviation of every remaining column (the raw `timestamp` column included).
df.groupby("asset_name").std()

d) Correlation between cryptocurrencies

In [None]:
# Log close price per asset, indexed by a `date` datetime index.
#
# pd.to_datetime(unit="s") reads the epoch seconds as UTC whatever the machine's
# timezone is (datetime.fromtimestamp uses the local timezone), which keeps this
# frame consistent with the per-asset frames built by crypto_df. Both the
# conversion and the logarithm are vectorised instead of being applied row by row.
df_close = (
    df[["timestamp", "asset_name", "close"]]
    .assign(
        date=lambda frame: pd.to_datetime(frame["timestamp"], unit="s"),
        close=lambda frame: np.log(frame["close"]),
    )
    .set_index("date")
    [["asset_name", "close"]]
)

In [None]:
# Restrict to the first half of 2021 (1 January to 30 June inclusive).
# The bounds are written in ISO format: a string such as '01/06/2021' is read
# month-first, i.e. as 6 January, which would keep six days instead of six months.
# A boolean mask is used so the selection does not depend on the index being sorted.
first_half_2021 = (df_close.index >= "2021-01-01") & (df_close.index < "2021-07-01")

# One column per asset, one row per timestamp, values = log close price.
df_close_2021 = pd.pivot_table(
    df_close.loc[first_half_2021].reset_index(),
    values="close", columns="asset_name", index="date")

In [None]:
# Correlation of the row-to-row differences of the log close prices, one cell per asset pair.
fig, ax = plt.subplots(figsize=(15, 15))
log_return_corr = df_close_2021.diff().corr()
sns.heatmap(log_return_corr,
            linewidths=.5,
            annot=True,
            square=True,
            cmap="viridis",
            ax=ax)
ax.set_xlabel("Asset Name")
ax.set_ylabel("Asset Name")
ax.set_title("Correlation matrix between each cryptocurrency during the first semester (2021)")
plt.show()

In [None]:
# Drop the intermediate frames and ask the garbage collector to reclaim them.
del df_close, df_close_2021
gc.collect()

# 3) Features Engineering

In [None]:
# Frames that receive the engineered features (each one is modified in place).
dataset = [btc, eth, iota]

# Price columns for which a `log_<name>` column is added.
cols = ['open', 'close', "high", "low"]

for frame in dataset:
    for col in cols:
        # np.log works on the whole column at once; no per-element Python call is needed.
        frame["log_" + col] = np.log(frame[col])

* Stock High minus Low price (H-L)
* Stock Open minus Close price (O-C)
* Stock price’s seven days’ moving average (7 DAYS MA)
* Stock price’s fourteen days’ moving average (14 DAYS MA)
* Stock price’s twenty one days’ moving average (21 DAYS MA)
* Stock price’s standard deviation for the past seven days (7 DAYS STD DEV)

In [None]:
# Spread and rolling statistics on the log prices, added in place to each frame.
for frame in dataset:
    log_close = frame["log_close"]
    # Rows per 7 days when there is one row per minute (7 * 24 * 60 = 10080).
    # NOTE(review): windows count rows, not elapsed time - presumably rows are
    # contiguous minutes; confirm, since dropped rows would stretch the window.
    week = 7 * 24 * 60

    frame["H-L"] = frame["log_high"] - frame["log_low"]
    frame["O-C"] = frame["log_open"] - log_close
    frame["MA_7d"] = log_close.rolling(week).mean()
    frame["MA_14d"] = log_close.rolling(2 * week).mean()
    frame["MA_21d"] = log_close.rolling(3 * week).mean()
    frame["STD_7d"] = log_close.rolling(week).std()

In [None]:
# Stack the three per-asset frames (Bitcoin, then Ethereum, then IOTA), keeping
# their timestamp index. pd.concat replaces DataFrame.append, which is
# deprecated and no longer exists in pandas 2.x.
data = pd.concat([btc, eth, iota])

In [None]:
# Last three rows of the combined frame (IOTA rows, since IOTA was appended last).
data.tail(3)