In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this competition, we will be forecasting returns for 14 popular cryptocurrencies. As always, train.csv is the main data set. Since it is around 3 GB, it is a good idea to start with a smaller data set: we can either use supplemental_train.csv or generate our own sample.

In [None]:
import subprocess

# Count the lines of train.csv with `wc -l` so the large file never has to be
# loaded into memory just to learn its size.
# The command is passed as an argument list (no shell involved), and check=True
# makes a failing `wc` raise CalledProcessError right here instead of showing up
# later as an IndexError on empty output.
TRAIN_CSV  = "/kaggle/input/g-research-crypto-forecasting/train.csv"
result     = subprocess.run(["wc", "-l", TRAIN_CSV], stdout=subprocess.PIPE, check=True)
# `wc -l` prints "<count> <path>"; the first token is the line count
# (the CSV header line is counted as well).
nRecords   = int(result.stdout.split()[0])
print("Records in Train Data: {}".format(nRecords))

Now that we know the number of records in the train data, we can generate a random sample from it instead of reading the whole file.

In [None]:
import random
import pandas as pd 

# Total number of lines in train.csv (header + data rows); this is the value
# reported by the `wc -l` cell.
N_LINES = 24236807
# Number of data rows to drop at random; everything else is read.
N_SKIP = 20000000

# Sample only from lines 1..N_LINES-1 so the header (line 0) can never be skipped.
# Sampling line 0 as well and then calling skiprows.remove(0) raises ValueError
# whenever 0 happens not to be drawn. A private, seeded generator makes the
# subset reproducible without touching the global `random` state.
skiprows = random.Random(42).sample(range(1, N_LINES), N_SKIP)
train_data_sub = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv', skiprows = skiprows)

In [None]:
# Unpack the frame's dimensions once, then report rows and columns.
n_rows, n_cols = train_data_sub.shape
print('Size of supplemental Data Created Manually : {}'.format(n_rows))
print('No of Features in Train Data : {}'.format(n_cols))

We have created a subset of the train data with around 4.2 M records (almost double the size of the sample data provided here). Let's use this to explore the data.

In [None]:
def _three_decimals(value):
    """Format a number with exactly three digits after the decimal point."""
    return '%.3f' % value

pd.options.display.width = 130  # Wider console output so more features fit in a single row
pd.options.display.float_format = _three_decimals  # Print values with only 3 decimal digits

**Basic Stats** : Focus on what is already available with pandas

> Top 5 Records

In [None]:
# Preview the first five rows of the sampled training data.
train_data_sub.head()

Features Definition:
* timestamp - A timestamp for the minute covered by the row.
* Asset_ID - An ID code for the cryptoasset.
* Count - The number of trades that took place this minute.
* Open - The USD price at the beginning of the minute.
* High - The highest USD price during the minute.
* Low - The lowest USD price during the minute.
* Close - The USD price at the end of the minute.
* Volume - The number of cryptoasset units traded during the minute.
* VWAP - The volume weighted average price for the minute.
* Target - 15 minute residualized returns.

> Bottom 5 Records

In [None]:
# Preview the last five rows of the sampled training data.
train_data_sub.tail()

> Use a combination of head and tail to inspect observations in between

In [None]:
print('Check 5 records from index 1000 to 1004 \n')
# head(1005) keeps the first 1005 rows; tail(5) then leaves the last five of those.
first_1005 = train_data_sub.head(1005)
print(first_1005.tail(5))

**We see that the view when using print is different from the default view.**

> Use the sample method in pandas to inspect random observations
> This is very useful when we are visualizing data after some data analysis

In [None]:
# Fixed random_state so the same five random rows are shown on every run.
print(train_data_sub.sample(5, random_state = 42))

> Data Info

In [None]:
# Column names, non-null counts, dtypes and memory usage of the sampled frame.
train_data_sub.info()

We see that timestamp and Asset_ID have been read as int64. Asset_ID should be read as a string variable.

**Data Describe**

In [None]:
# Percentiles to report: both tails (1%, 5%, 95%, 99%) plus every decile from 10% to 90%.
# List concatenation (like extend) splices the items in; append would nest a list instead.
deciles = [step / 10 for step in range(1, 10)]
percentile_list = [0.01, 0.05] + deciles + [0.95, 0.99]

# include='all' summarises every column; transposing puts one feature per row.
summary = train_data_sub.describe(percentiles = percentile_list, include = 'all')
print(summary.T)

- Target      : Has some missing values(131K) - Observe the count to find this
- Count       : 286 trades take place per minute(Max : 165K - 47X of 99th Percentile Value)
- Open        : 50% of the times, trades open below $15(65K Max Traded Value in a Minute)
- High, Low, and Close  : Almost same distribution as of open price in a minute(Not much variation in price at minute level)
- Volume      : 289K average volume per minute. 50% of the time per minute trade volume is less than 1300. Highest volume in a minute is 596M(100X of volume at 99th percentile)
- VWAP        : Volume weighted average price is 1431.(Max : 65K)
- Target      : 99% of the time, residualized return is less than 0.02. 50% of the time, the return is in -ve

> Unique Values in each feature

In [None]:
# Number of distinct values in each column.
unique_counts = train_data_sub.nunique()
print(unique_counts)

In [None]:
# Count the missing values per column (isna is the same check as isnull).
null_counts = train_data_sub.isna().sum()
print('Null Values \n')
print(null_counts)

**Data Analysis**

We have already observed that the Open, Low, High, and Close distributions are almost identical. Let's validate this by using a **correlation plot** between these features.

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between the four price columns.
price_columns = ['Open', 'Low', 'High', 'Close']
corr = train_data_sub.loc[:, price_columns].corr()
# Colour each cell by its value so strong correlations stand out.
corr.style.background_gradient(cmap='coolwarm')

In [None]:
from itertools import combinations

# Plot every pair of price columns against each other on a 2x3 grid.
# Markers without a connecting line are used on purpose: the rows are not
# sorted by price, so a line plot would join consecutive rows and bury the
# point cloud under a scribble.
price_columns = ['Open', 'Low', 'High', 'Close']

fig, axs = plt.subplots(2,3, figsize = (14, 8))
fig.suptitle('Correlation Plots')

# combinations() yields the six pairs in the order Open-Low, Open-High,
# Open-Close, Low-High, Low-Close, High-Close; axs.flat walks the grid row by row.
for ax, (x_col, y_col) in zip(axs.flat, combinations(price_columns, 2)):
    ax.plot(train_data_sub[x_col], train_data_sub[y_col],
            linestyle = 'none', marker = '.', markersize = 1)
    ax.set_title('{} vs {}'.format(x_col, y_col))
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

# Leave room at the top for the figure-level title.
fig.tight_layout(rect = [0, 0, 1, 0.96])

It is obvious that any one of these features carries sufficient information and can represent the other three for all practical purposes.

**Box Plot for Numeric Features**

Though we have already looked at the percentile distribution of each feature, another way of achieving the same thing is using box plots, and of course, being a visual representation, they are much easier to understand.

In [None]:
import seaborn as sns
sns.set_style("darkgrid")

# One box plot per numeric feature on a 2x2 grid; ravel() lists the axes row by row.
fig, axs = plt.subplots(2, 2, figsize= (16, 10))
count_ax, close_ax, volume_ax, vwap_ax = axs.ravel()

sns.boxplot(x = train_data_sub['Count'], ax = count_ax)
sns.boxplot(x = train_data_sub['Close'], ax = close_ax)
sns.boxplot(x = train_data_sub['Volume'], ax = volume_ax)
sns.boxplot(x = train_data_sub['VWAP'], ax = vwap_ax)

There are a lot of values beyond the upper whisker (Q3 + 1.5 * IQR) : **Outliers**

IQR : Q3 - Q1

**Exploratory Data Analysis Using Existing Packages**
* What we have done so far, is to analyze the data through existing methods in pandas
* Now we will use existing packages 
> * My personal favourite is dataprep
> Documentation : https://dataprep.ai/
* Other packages : AutoViz, PandasProfiling, Lux

In [None]:
!pip install dataprep

In [None]:
from dataprep.eda import plot, plot_correlation, create_report, plot_missing

In [None]:
# dataprep overview plot for every column of the sampled training data.
plot(train_data_sub)

* While analyzing the data, we missed the distribution similarity of VWAP with Low, High, Open, and Close  
* Almost all features are skewed
* Asset_ID 1, 2, 5, 6, 7, 9 have same volume

In [None]:
import warnings
# Silence every warning so the generated report is not cluttered.
# The filter is global, so it stays active for all later cells as well.
warnings.simplefilter('ignore')
# Build dataprep's EDA report for the sampled training data.
create_report(train_data_sub)

In [None]:
# dataprep plot restricted to the single column 'Volume'.
plot(train_data_sub, 'Volume')

**Assets Details Data**

In [None]:
# Load the per-asset details file that ships with the competition data.
asset_details_path = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
asset_details_data = pd.read_csv(asset_details_path)

In [None]:
# Preview the first five rows of the asset details table.
asset_details_data.head()

In [None]:
# Build dataprep's EDA report for the asset details table.
create_report(asset_details_data)

**Add Asset Name to Feature Data**

In [None]:
# Attach the asset details (e.g. Asset_Name, Weight) to every training row.
# Columns that an earlier run of this cell already merged in are dropped first,
# so re-running the cell does not create suffixed duplicates (*_x / *_y) that
# would break later lookups of 'Asset_Name'.
asset_columns = [col for col in asset_details_data.columns if col != 'Asset_ID']
train_data_sub = (train_data_sub.drop(columns = asset_columns, errors = 'ignore')
                                .merge(asset_details_data, on = 'Asset_ID',
                                       # each Asset_ID must appear only once in the details
                                       # table, otherwise training rows would be duplicated
                                       validate = 'many_to_one'))

In [None]:
# Preview the first five rows after the asset details were merged in.
train_data_sub.head()

* Distribution of weight by asset name

In [None]:
def plot_barh(df, xvar, yvar, title = None):
    """Draw a bar chart of ``df[yvar]`` for each category in ``df[xvar]``.

    Despite the name, the bars are vertical (``Axes.bar``); the name is kept
    so existing calls keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    xvar : str
        Column used for the bar positions / x tick labels.
    yvar : str
        Column used for the bar heights.
    title : str, optional
        Title of the chart. Defaults to "<yvar> by <xvar>".

    Returns
    -------
    None
        The chart is drawn on a new 10x5 inch figure.
    """
    _, ax = plt.subplots(figsize = (10, 5))
    ax.bar(df[xvar], df[yvar])
    # Label the axes and add a title so the figure can be read on its own.
    ax.set_xlabel(xvar)
    ax.set_ylabel(yvar)
    ax.set_title('{} by {}'.format(yvar, xvar) if title is None else title)
    # Rotate the tick labels after plotting so the setting does not depend on
    # which ticks existed before the bars were drawn.
    ax.tick_params(axis = 'x', labelrotation = 90)

In [None]:
# Print the assets ordered from the largest to the smallest weight.
assets_by_weight = asset_details_data.sort_values(by = 'Weight', ascending = False)
print(assets_by_weight)
# The chart is drawn from the original (unsorted) frame.
plot_barh(asset_details_data, 'Asset_Name', 'Weight')

**Top 5 currencies by Weight:**
* Bitcoin
* Ethereum
* Cardano
* Binance Coin
* Dogecoin


In [None]:
# Share of rows per asset (normalize=True returns proportions instead of counts).
train_data_sub['Asset_Name'].value_counts(normalize = True)

**Top 5 currencies by # records:**
* Ethereum
* Litecoin
* EOS.IO
* Bitcoin
* Bitcoin Cash

> Each of the top 5 coins accounts for about 8.1% of the records

**Total Traded Volume of Each Crypto**

> In the below code section, we will also see an example of operator chaining in pandas

In [None]:
# Total and average traded volume per asset, built as one pandas method chain
# and ordered by the average volume (largest first).
volume_traded_by_asset = (
    train_data_sub
    .groupby('Asset_Name')['Volume']
    .agg(['sum', 'mean'])
    .reset_index()
    .rename(columns = {'sum' : 'TradedVolume', 'mean' : 'AvgVolTraded'})
    .sort_values(by = 'AvgVolTraded', ascending = False)
)
print(volume_traded_by_asset)

# One bar chart per aggregated metric.
for metric in ('TradedVolume', 'AvgVolTraded'):
    plot_barh(volume_traded_by_asset, 'Asset_Name', metric)

**Derived Features**

We can also form hypotheses about the data and validate them by creating features. These features will help us get a lift in the modeling phase.
Features:
1. Units Sold per Minute - Volume/Count
2. 


**Units Sold per Minute**

In [None]:
# Volume is the number of units traded in the minute and Count the number of
# trades in that minute, so the ratio is the average number of units per trade.
# NOTE(review): the column name says "per minute" - confirm the intended meaning.
volume, trade_count = train_data_sub['Volume'], train_data_sub['Count']
train_data_sub['UnitsSoldPerMinute'] = volume.div(trade_count)

WIP....