# G-Research Time Series-Specific Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time

## Data Wrangling

### Get the Data

The available training data includes the test data, which leads to astounding scores on the public leaderboard. In time series, the phenomenon where information about the future leaks into our analysis or training process is called a lookahead. A lookahead is a way, through data, to find out something about the future earlier than you ought to know it. Information about what will happen in the future propagates back in time in our modelling and affects how our model behaves earlier in time (generally by faking an improvement in model performance). Here, data from `2021-06-13` onwards is leaky, i.e. it is data about the future that should not be used for anything other than testing models or assumptions. The `read_csv_strict()` function designed by [dataista0](https://www.kaggle.com/julian3833) helps avoid the pitfall of using future data for training models.

In [None]:
path = '../input/g-research-crypto-forecasting/'


def read_csv_strict(file_name='train.csv', cutoff='2021-06-13 00:00:00'):
    """Load a CSV and keep only the rows dated strictly before ``cutoff``.

    The file must contain a ``timestamp`` column holding epoch seconds. It is
    converted into a ``datetime`` column, rows at or after ``cutoff`` are
    discarded, and the raw ``timestamp`` column is removed.

    Parameters
    ----------
    file_name : str
        File to read. Relative names are resolved against the module-level
        ``path``; an absolute name is used as given.
    cutoff : str or datetime-like
        Exclusive upper bound on ``datetime``. The default is the date from
        which the data is considered leaky.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a ``datetime`` column and no ``timestamp``
        column. The original row index is preserved.
    """
    import os  # local import so the block does not depend on file-level imports

    df = pd.read_csv(os.path.join(path, file_name))
    # One vectorized conversion for the whole column; the conversion does not
    # depend on the asset, so there is no need to mask the frame per Asset_ID.
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    df = df[df['datetime'] < pd.Timestamp(cutoff)]
    # Non-inplace drop: avoids mutating a filtered slice of the loaded frame.
    return df.drop(columns='timestamp')

In [None]:
# Read the two input tables: the asset details file as-is, and the training
# rows through read_csv_strict(), which removes the leaky period.
asset_details = pd.read_csv(path + 'asset_details.csv')
train_data = read_csv_strict(file_name='train.csv')

In [None]:
# Preview the first five rows of the training data
train_data.head(n=5)

In [None]:
# Preview the first five rows of the asset details table
asset_details.head(n=5)

In [None]:
# Attach the asset details (asset names and weight) to every training row via
# the shared Asset_ID key, then use the datetime column as the index so the
# frame can be handled as a time series.
train_data = train_data.merge(asset_details, on='Asset_ID').set_index('datetime')

## Exploratory Data Analysis

### Time Series-Specific Exploratory Methods

Time series can be approached in a more specific manner by looking at values at different times in a given series. More precisely, relationships between values at different times in the same series constitute the basis of what we're going to study in the following part of the notebook.

#### Plots, plots, plots

In [None]:
# Global plot: one line per asset on a shared time axis.
# The y-axis is log-scaled for visual considerations. Scaling the axis
# (instead of plotting np.log of the values) keeps the ticks in Close units,
# so the 'Close value' label describes what is actually shown.

f, ax = plt.subplots(figsize=(20, 8))
# sort=False keeps the assets in order of first appearance, and a single
# groupby pass avoids re-masking the whole frame once per asset.
for asset, asset_rows in train_data.groupby('Asset_Name', sort=False):
    ax.plot(asset_rows.index, asset_rows['Close'], label=asset)
ax.set_yscale('log')
ax.set_title('Assets close value evolution over time')
ax.set_xlabel('Time')
ax.set_ylabel('Close value')
ax.legend();

In [None]:
# Seasonal plots
# Daily average Close: one figure per asset, one line per calendar year.
# sort=False keeps the assets in order of first appearance.
for asset, asset_rows in train_data.groupby('Asset_Name', sort=False):
    f, ax = plt.subplots(figsize=(10, 7))
    for year in asset_rows.index.year.unique():
        in_year = asset_rows[asset_rows.index.year == year]
        # Mean Close over all rows sharing the same (month, day) in this year
        daily_mean = in_year.groupby([in_year.index.month, in_year.index.day])['Close'].mean()
        # NOTE(review): the result is indexed by (month, day) pairs; pandas
        # appears to draw such a series by row position, so a year that does
        # not start on Jan 1 may not line up with the others - confirm.
        daily_mean.plot(ax=ax, label=str(year))

    # Decorate once per figure, after every yearly line has been drawn, so the
    # legend lists all years and the x label is not overwritten by later plots.
    ax.set_title(f'{asset} daily average close value yearly plot')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'{asset} daily average close value')
    ax.legend()

In [None]:
# Seasonal plots
# Monthly average Close: one figure per asset, one line per calendar year.
# sort=False keeps the assets in order of first appearance.
for asset, asset_rows in train_data.groupby('Asset_Name', sort=False):
    f, ax = plt.subplots(figsize=(10, 7))
    for year in asset_rows.index.year.unique():
        in_year = asset_rows[asset_rows.index.year == year]
        # Mean Close over all rows falling in the same month of this year
        monthly_mean = in_year.groupby([in_year.index.month])['Close'].mean()
        monthly_mean.plot(ax=ax, label=str(year))

    # Decorate once per figure, after every yearly line has been drawn, so the
    # legend lists all years and the x label is not overwritten by later plots.
    ax.set_title(f'{asset} monthly average close value yearly plot')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'{asset} monthly average close value')
    ax.legend()

#### Log returns

In [None]:
# Log return: difference between the log of a value and the log of the value
# `periods` rows earlier.
def log_return(series, periods=1):
    """Return the ``periods``-row log return of ``series``.

    Each output element is ``log(series[i]) - log(series[i - periods])``.
    Positions that have no counterpart ``periods`` rows away are NaN.
    """
    log_values = np.log(series)
    return log_values - log_values.shift(periods)

In [None]:
# Log return of consecutive Close values (periods=1), one figure per asset.
# sort=False keeps the assets in order of first appearance.
for asset, asset_rows in train_data.groupby('Asset_Name', sort=False):
    f, ax = plt.subplots(figsize=(10, 7))
    # First row has an empty return as previous value is unknown
    returns = log_return(asset_rows['Close'], periods=1).iloc[1:]
    # Explicit label: otherwise the legend entry is the series name ('Close'),
    # which does not describe the plotted quantity.
    returns.plot(ax=ax, label='log return')
    # No averaging is applied here, so the title must not claim a
    # monthly/daily aggregation.
    ax.set_title(f'{asset} close value log return')
    ax.set_xlabel('Time')
    ax.set_ylabel('log return')
    ax.legend()