In [None]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Debug switch: when is_dev is True, the train.csv and supplemental_train.csv
# loads read only the first `nrows` rows instead of the whole file.
is_dev = False
nrows = 10_000

# Data Description
Dataset documentation: https://www.kaggle.com/c/g-research-crypto-forecasting/data?select=train.csv

# train.csv

In [None]:
# train.csv
input_dir = Path('../input/g-research-crypto-forecasting')
# nrows=None is the read_csv default and loads the whole file; in dev mode the
# read is capped at the first `nrows` rows.
df_train = pd.read_csv(input_dir / 'train.csv', nrows=nrows if is_dev else None)
df_train.shape

In [None]:
# Column labels of the training frame as a plain Python list.
df_train.columns.tolist()

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Count missing values in each training column.
df_train.isna().sum()

# example_test.csv

In [None]:
# Load the example test file and report its (rows, columns).
df_test = pd.read_csv(input_dir.joinpath('example_test.csv'))
df_test.shape

In [None]:
# Preview the first five example-test rows.
df_test.head(n=5)

In [None]:
# Number of example-test rows in each group_num value.
df_test['group_num'].value_counts()

# example_sample_submission

In [None]:
# Load the example submission file and report its (rows, columns).
df_sample_submission = pd.read_csv(input_dir.joinpath('example_sample_submission.csv'))
df_sample_submission.shape

In [None]:
# Preview the first five rows of the example submission.
df_sample_submission.head(n=5)

# asset_details.csv

In [None]:
# Load the per-asset details table and report its (rows, columns).
df_asset = pd.read_csv(input_dir.joinpath('asset_details.csv'))
df_asset.shape

In [None]:
# Preview the first five asset-detail rows.
df_asset.head(n=5)

# supplemental_train.csv

In [None]:
# Load supplemental_train.csv; dev mode reads only the first `nrows` rows,
# otherwise nrows=None (the read_csv default) loads the whole file.
df_sup_train = pd.read_csv(
    input_dir / 'supplemental_train.csv',
    nrows=nrows if is_dev else None,
)
df_sup_train.shape

In [None]:
# Preview the first five supplemental-train rows.
df_sup_train.head(n=5)

In [None]:
# Count missing values in each supplemental-train column.
df_sup_train.isna().sum()

# Merge train.csv and asset_details.csv

In [None]:
# Attach the asset_details columns (minus Weight) to every training row.
# The guard keeps this cell idempotent: merging a second time would replace
# Asset_Name with suffixed Asset_Name_x / Asset_Name_y columns and break the
# later cells that reference 'Asset_Name'.
if 'Asset_Name' not in df_train.columns:
    df_train = df_train.merge(df_asset.drop('Weight', axis=1), on='Asset_ID')
# Convert the timestamp column to datetime at second resolution, then order
# rows chronologically so positional tails pick the most recent rows.
df_train['timestamp'] = df_train['timestamp'].astype('datetime64[s]')
df_train = df_train.sort_values(['timestamp'])
df_train.shape

# Plot data

In [None]:
# Asset_ID of each currency that gets its own per-asset frame.
asset_ids = {
    'binance_coin': 0,
    'bit_coin': 1,
    'bit_cash': 2,
    'eos_io': 5,
    'eth_classic': 7,
}

# One filtered frame per currency, keyed by the names above.
currencies = {
    name: df_train.query(f'Asset_ID == {asset_id}')
    for name, asset_id in asset_ids.items()
}

# Keep the individual frames available under their own names as well.
binance_coin = currencies['binance_coin']
bit_coin = currencies['bit_coin']
bit_cash = currencies['bit_cash']
eos_io = currencies['eos_io']
eth_classic = currencies['eth_classic']

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
# Use a larger default font for every figure in this notebook.
plt.rcParams.update({"font.size": 15})

In [None]:
def plot_lines(df, index, columns, title, tail=None):
    """Draw a seaborn line plot of selected columns against an index column.

    Args:
        df: DataFrame containing `index` and every name in `columns`.
        index: Name of the column to set as the frame index before plotting.
        columns: List of column names to plot.
        title: Figure title; pass None to leave the figure untitled.
        tail: If not None, only the last `tail` rows of `df` are plotted
            (positional slice `df[-tail:]`).

    Returns:
        None. A new 20x10 inch figure is created as a side effect.
    """
    # Slice without copying: set_index below already returns a new frame, so
    # duplicating the whole input just to keep its tail is wasted work.
    df_plot = df if tail is None else df[-tail:]
    df_plot = df_plot.set_index(index)[columns]
    plt.figure(figsize=(20, 10))
    if title is not None:
        plt.title(title)
    sns.lineplot(data=df_plot)

In [None]:
# Plot Target for the bit_coin frame over its last 100,000 rows.
index = 'timestamp'
columns = ['Target']
plot_lines(df=bit_coin, index=index, columns=columns, title='bit_coin', tail=100_000)

In [None]:
# Plot the High/Open/Close/Low columns for the bit_coin frame over its last 1,000 rows.
index = 'timestamp'
columns = ['High', 'Open', 'Close', 'Low']
plot_lines(df=bit_coin, index=index, columns=columns, title='bit_coin', tail=1_000)

In [None]:
def plot_line_dict(dict_df, index, columns, tail=None):
    """Draw one line-plot panel per DataFrame in a two-column subplot grid.

    Args:
        dict_df: Mapping of panel title -> DataFrame; each frame must contain
            `index` and every name in `columns`.
        index: Name of the column to set as each frame's index before plotting.
        columns: List of column names to plot in every panel.
        tail: If not None, only the last `tail` rows of each frame are plotted
            (positional slice `df[-tail:]`).

    Returns:
        None. A new 40x40 inch figure is created as a side effect; nothing is
        drawn when `dict_df` is empty.
    """
    if not dict_df:
        # A zero-row subplot grid is not valid, so there is nothing to draw.
        return

    n_cols = 2
    n_rows = int(np.ceil(len(dict_df) / n_cols))
    # squeeze=False keeps `axes` two-dimensional even for a single row; without
    # it, one or two frames yield a 1-D array and 2-D indexing fails.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(40, 40), squeeze=False)
    # Row-major flattening matches the (idx // n_cols, idx % n_cols) layout.
    flat_axes = axes.ravel()

    for ax, (key, df) in zip(flat_axes, dict_df.items()):
        # Slice without copying: set_index returns a new frame anyway.
        df_plot = df if tail is None else df[-tail:]
        df_plot = df_plot.set_index(index)[columns]
        ax.set_title(key)
        sns.lineplot(data=df_plot, ax=ax)

    # With an odd number of frames the last grid cell is unused; hide it
    # rather than showing an empty set of axes.
    for unused_ax in flat_axes[len(dict_df):]:
        unused_ax.set_visible(False)

In [None]:
# One panel per currency, reusing the notebook-level `index` and `columns`
# values currently in scope, over each frame's last 1,000 rows.
plot_line_dict(dict_df=currencies, index=index, columns=columns, tail=1_000)

## Volume

In [None]:
def plot_line_pivot(df, index, divide_col, value, tail=None):
    """Pivot a long frame to one column per category and line-plot the result.

    Args:
        df: Long-format DataFrame containing `index`, `divide_col` and `value`.
        index: Column (or columns) used as the pivot table's index.
        divide_col: Column whose distinct values become the plotted series.
        value: Column supplying the plotted values (combined with
            pivot_table's default aggregation).
        tail: If not None, only the last `tail` rows of `df` are pivoted
            (positional slice `df[-tail:]`).

    Returns:
        None. A new 20x10 inch figure is created as a side effect.
    """
    rows = df if tail is None else df[-tail:]
    wide = pd.pivot_table(rows, index=index, columns=[divide_col], values=value)
    plt.figure(figsize=(20, 10))
    sns.lineplot(data=wide)

In [None]:
# Volume per asset name over the last 10,000 rows of the merged training frame.
plot_line_pivot(df=df_train, index=index, divide_col='Asset_Name', value='Volume', tail=10_000)

## Amount

In [None]:
# Estimated_Amount is the row-wise product of Volume and Close.
df_train['Estimated_Amount'] = df_train['Volume'].mul(df_train['Close'])

In [None]:
# Mean Estimated_Amount per asset, cast to int and ordered largest first,
# shown as a bar chart.
mean_amount = df_train.groupby('Asset_Name')['Estimated_Amount'].mean()
mean_amount_sorted = mean_amount.astype(int).sort_values(ascending=False)
mean_amount_sorted.plot.bar()

In [None]:
# Estimated_Amount per asset name over the last 10,000 rows of the merged training frame.
plot_line_pivot(df=df_train, index=index, divide_col='Asset_Name', value='Estimated_Amount', tail=10_000)

## Correlations between Assets

In [None]:
# Wide table of Target: one row per timestamp, one column per asset name.
# aggfunc='max' collapses any duplicate (timestamp, asset) rows to their maximum.
df_train_pivot = pd.pivot_table(
    df_train,
    index=['timestamp'],
    columns=['Asset_Name'],
    values='Target',
    aggfunc='max',
)

In [None]:
# Display the pivoted Target table (timestamps as rows, asset names as columns).
df_train_pivot

In [None]:
# Pairwise scatter plots of Target between assets over the most recent rows.
tail_num = 1000
# Take the tail before filling: the result equals fillna(0).tail(tail_num),
# but missing values are replaced only in the rows that are actually plotted.
scatter_input = df_train_pivot.tail(tail_num).fillna(0)
array_scatter_matrix = pd.plotting.scatter_matrix(scatter_input, alpha=0.2, figsize=(30, 30))

In [None]:
# Pairwise correlation of Target between assets, drawn as an annotated heatmap.
corr = df_train_pivot.corr()
# 1.0 on and above the diagonal, 0.0 below it; the masked (nonzero) cells are
# not drawn, leaving only the lower triangle visible.
mask = np.triu(np.ones_like(corr))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 15))
    ax = sns.heatmap(corr, mask=mask, square=True, annot=True, cmap="YlGnBu", ax=ax)