In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://www.cointribune.com/wp-content/uploads/2021/06/to-the-moon-and-bitcoin-stock-growth-concept-strong-increase-of-bitcoin-prices-shown-at-candlestick-bull-market-chart-lightning-hitting-the-bitcoin-investment-in-cryptocurrency-world-stockpack-deposit-photos-scaled.jpg)

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from lightgbm import LGBMRegressor

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and the same object is returned.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when the column's min and
      max lie strictly inside that type's finite range, otherwise to float64.
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes) is left untouched.

    NOTE(review): float16 keeps only about three significant decimal digits,
    so the float downcast is lossy; confirm that is acceptable for the
    columns passed in.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    small_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are downcast; comparing
        # min/max of bools, datetimes or extension dtypes against numeric
        # limits is either meaningless or raises.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_ints:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in small_floats:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for empty / all-NaN columns, where the
                # comparisons above are all False.
                df[col] = df[col].astype(np.float64)

    return df

## Files
train.csv - The training set

> timestamp - A timestamp for the minute covered by the row.

> Asset_ID - An ID code for the cryptoasset.

> Count - The number of trades that took place this minute.

> Open - The USD price at the beginning of the minute.

> High - The highest USD price during the minute.

> Low - The lowest USD price during the minute.

> Close - The USD price at the end of the minute.

> Volume - The number of cryptoasset units traded during the minute.

> VWAP - The volume weighted average price for the minute.

> Target - 15 minute residualized returns. See the 'Prediction and Evaluation' section of this notebook for details of how the target is calculated.

example_test.csv - An example of the data that will be delivered by the time series API.

example_sample_submission.csv - An example of the data that will be delivered by the time series API. The data is just copied from train.csv.

asset_details.csv - Provides the real name of the cryptoasset for each Asset_ID and the weight each cryptoasset receives in the metric.

gresearch_crypto - An unoptimized version of the time series API files for offline work. You may need Python 3.7 and a Linux environment to run it without errors.

supplemental_train.csv - After the submission period is over this file's data will be replaced with cryptoasset prices from the submission period. In the Evaluation phase, the train, train supplement, and test set will be contiguous in time, apart from any missing data. The current copy, which is just filled with approximately the right amount of data from train.csv, is provided as a placeholder.

In [None]:
# Load the training set from the competition's input directory.
data = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')
# Show the first rows as the cell output.
data.head()

In [None]:
# Load the asset metadata file.
asset_id = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
# Same table, indexed and sorted by Asset_ID so it reads as a lookup table.
asset = asset_id.set_index('Asset_ID').sort_index()
asset

In [None]:
# Number of training rows per Asset_ID.
plt.figure(figsize=(12,5))
# Name the column through `x=`: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x='Asset_ID', data=data, palette='cool')

### Let's have a look at the most famous crypto - BITCOIN

In [None]:
# Keep only the rows for Asset_ID 1 and index them by timestamp.
btc = data.loc[data['Asset_ID'].eq(1)].set_index("timestamp")
# The full series is too long to plot, so keep just the last 500 rows.
btc_range = btc.tail(500)
btc_range.head()

In [None]:
# Candlestick chart of the selected BTC window; each OHLC keyword is filled
# from the identically named (capitalised) column.
fig = go.Figure(
    data=[
        go.Candlestick(
            x=btc_range.index,
            **{side: btc_range[side.capitalize()] for side in ('open', 'high', 'low', 'close')},
        )
    ]
)
print('Use the slider to Adjust and Zoom')
fig.show()

### Let's have a look at another crypto - ETHEREUM

In [None]:
# Asset_ID 6 is Ethereum in asset_details.csv (Asset_ID 1, used above, is Bitcoin).
eth = data[data['Asset_ID'] == 6].set_index("timestamp")
eth_range = eth.iloc[-500:]  # The full ETH series is too long to plot, so keep the latest 500 points.


# Candlestick chart of the selected ETH window.
fig = go.Figure(data=[go.Candlestick(x=eth_range.index, 
                                     open=eth_range['Open'], 
                                     high=eth_range['High'], 
                                     low=eth_range['Low'], 
                                     close=eth_range['Close'])])
fig.show()

### Data Preprocessing

In [None]:
# Count the missing values in each column of the training data.
data.isna().sum()

Many missing values in target variable.

In [None]:
# Count the missing values in each column of the BTC frame.
btc.isna().sum()

Missing asset data, for a given minute, is not represented by NaN's, but instead by the absence of those rows. We can check the timestamp difference between consecutive rows to see if there is missing data.

In [None]:
# Difference between each index value and the one before it, tallied by size;
# head() shows the most frequent step sizes.
(btc.index[1:]-btc.index[:-1]).value_counts().head()

Notice that there are many gaps in the data. 

Let's now see Ethereum.

In [None]:
# Count the missing values in each column of the ETH frame.
eth.isna().sum()

In [None]:
# Difference between each index value and the one before it, tallied by size;
# head() shows the most frequent step sizes.
(eth.index[1:]-eth.index[:-1]).value_counts().head()

As expected it also has large gaps.

To work with most time series models, we should preprocess our data into a format without time gaps. 

To fill the gaps, we can use the `.reindex()` method for forward filling, filling gaps with the previous valid value.

In [None]:
# Rebuild each index as every 60-unit step from the first to the last label
# (the +60 makes the last label inclusive). method='pad' fills each newly
# created row with the values of the previous existing row, for every column.
btc = btc.reindex(range(btc.index[0],btc.index[-1]+60,60),method='pad')
eth = eth.reindex(range(eth.index[0],eth.index[-1]+60,60),method='pad')

In [None]:
# A cell only displays its last expression, so put both step-size tallies
# side by side in one table instead of silently discarding the BTC result.
pd.concat(
    {
        'btc': (btc.index[1:]-btc.index[:-1]).value_counts(),
        'eth': (eth.index[1:]-eth.index[:-1]).value_counts(),
    },
    axis=1,
).head()

Clearly, there is no time gap now.

### Feature Engineering

In [None]:
def upper_shadow(df):
    """Return High minus the larger of Close and Open (the upper wick)."""
    body_top = np.maximum(df["Close"], df["Open"])
    return df["High"] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open minus Low (the lower wick)."""
    body_bottom = np.minimum(df["Close"], df["Open"])
    return body_bottom - df["Low"]

In [None]:
def get_features(dataframe, row=False):
    """Append candlestick, ratio, calendar and log features to OHLCV data.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        A frame of bars (``row=False``) or one bar as a Series (``row=True``).
        Must provide ``timestamp`` (parsed as Unix seconds), ``Asset_ID``,
        ``Count``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and
        ``VWAP``; a row must also provide ``row_id``.
    row : bool, default False
        Set to True when ``dataframe`` is a single-row Series.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A copy of the input with the new features appended in a fixed order
        and ``Asset_ID`` removed (``row_id`` as well in row mode). The input
        is not modified.
    """
    df_feat = dataframe.copy()
    prices = ["Open", "High", "Low", "Close"]
    # A frame is reduced across its columns; a Series has only one axis.
    reduce_kwargs = {} if row else {"axis": 1}

    df_feat["Upper_Shadow"] = upper_shadow(df_feat)
    df_feat["Lower_Shadow"] = lower_shadow(df_feat)

    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"]
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]

    df_feat["Mean"] = df_feat[prices].mean(**reduce_kwargs)
    df_feat["High/Mean"] = df_feat["High"] / df_feat["Mean"]
    df_feat["Low/Mean"] = df_feat["Low"] / df_feat["Mean"]
    # +1 keeps the denominator non-zero when Count is 0.
    df_feat["Volume/Count"] = df_feat["Volume"] / (df_feat["Count"] + 1)

    # `unit="s"` fully defines the parse; the deprecated
    # `infer_datetime_format` flag only applies to string inputs.
    times = pd.to_datetime(df_feat["timestamp"], unit="s")
    # A scalar Timestamp exposes the fields directly, a Series through `.dt`.
    calendar = times if row else times.dt
    df_feat["hour"] = calendar.hour
    df_feat["dayofweek"] = calendar.dayofweek
    df_feat["day"] = calendar.day

    df_feat["Median"] = df_feat[prices].median(**reduce_kwargs)
    df_feat["High/Median"] = df_feat["High"] / df_feat["Median"]
    df_feat["Low/Median"] = df_feat["Low"] / df_feat["Median"]

    for col in prices + ["VWAP"]:
        df_feat[f"Log_1p_{col}"] = np.log1p(df_feat[col])

    # Identifier fields are not features.
    if row:
        return df_feat.drop(["row_id", "Asset_ID"])
    return df_feat.drop(columns=["Asset_ID"])

## Model Building

In [None]:
def model_building(df_data, asset_id):
    """Train a LightGBM regressor on the engineered features of one asset.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Training data containing an ``Asset_ID`` column, a ``Target`` column
        and the columns required by ``get_features``.
    asset_id : int
        Asset whose rows are used for training.

    Returns
    -------
    X_train
        Standardized training features as returned by the scaler (the first
        75% of the cleaned rows, in their original order).
    y_train : pandas.Series
        Targets aligned with ``X_train``.
    model
        Pipeline of the fitted ``StandardScaler`` followed by the fitted
        ``LGBMRegressor``. Its ``predict`` takes *unscaled* features and
        applies the training-time scaling itself, so callers that pass raw
        engineered features get predictions consistent with training.
    """
    data_set = df_data[df_data['Asset_ID'] == asset_id]
    df = get_features(data_set)
    # Treat +/-inf (e.g. from a ratio with a zero denominator) as missing and
    # drop every row with a missing value, including rows with a missing Target.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(how="any")

    df = reduce_memory_usage(df)

    X = df.drop(['Target'], axis=1)
    y = df["Target"]

    # shuffle=False keeps row order, so the training part is the first 75% of
    # rows; the held-out tail is not used in this function.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.25, random_state=24, shuffle=False)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    regressor = LGBMRegressor(
        n_estimators=1500, 
        num_leaves=700,
        objective="regression",
        metric="rmse",
        boosting_type="gbdt",
        learning_rate=0.01,
        random_state=24,
        verbose=0,
        force_col_wise=True,
    )
    regressor.fit(X_train, y_train)

    # Bundle the already-fitted scaler with the regressor so inference cannot
    # skip the scaling the trees were trained on.
    model = make_pipeline(scaler, regressor)

    return X_train, y_train, model

In [None]:
# Per-asset training features, targets and fitted models, keyed by Asset_ID.
Xs = {}
ys = {}
models = {}
print('Training Starting...')

# Iterate under a dedicated name so the `asset` lookup table built earlier in
# the notebook is not overwritten by the loop variable.
for asset_code in asset_id["Asset_ID"]:
    Xs[asset_code], ys[asset_code], models[asset_code] = model_building(data, asset_code)
    
print('Training Completed !!!')

## Prediction

In [None]:
import gresearch_crypto

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows and the prediction frame to fill.
for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        
        # Pick the model trained for this row's asset.
        model = models[row["Asset_ID"]]
        x_test = get_features(row, row=True)
        # Training maps +/-inf to NaN before fitting; apply the same mapping
        # here so a degenerate ratio is handled as a missing value.
        x_test = x_test.replace([np.inf, -np.inf], np.nan)
        y_pred = model.predict([x_test])[0]
        
        df_pred.loc[df_pred["row_id"] == row["row_id"], "Target"] = y_pred

    # Send submissions
    env.predict(df_pred)