## Data
We are given the training file `train.csv` with the following columns:

* timestamp - A timestamp for the minute covered by the row.
* Asset_ID - An ID code for the cryptoasset.
* Count - The number of trades that took place this minute.
* Open - The USD price at the beginning of the minute.
* High - The highest USD price during the minute.
* Low - The lowest USD price during the minute.
* Close - The USD price at the end of the minute.
* Volume - The number of cryptoasset units traded during the minute.
* VWAP - The volume weighted average price for the minute.
* Target - 15 minute residualized returns. 

In [None]:


import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import GridSearchCV
import traceback
import datatable as dt
import gresearch_crypto
from lightgbm import LGBMRegressor


import datetime
import seaborn as sns
cmap = sns.color_palette()

In [None]:
# Directory that the input CSV files are read from (paths are built by string concatenation).
data_folder = "../input/g-research-crypto-forecasting/"


In [None]:
# Raise the pandas display limits so frames are shown with up to 500 columns and 500 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
# Load the whole training file into a single in-memory DataFrame.
df_train = pd.read_csv(data_folder + 'train.csv')


In [None]:
# Report the number of rows, then display the first rows as the cell output.
print(f"There are {df_train.shape[0]} rows in the given dataset")
df_train.head()

In [None]:
# df_train["date_time"] = df_train.timestamp.apply(lambda x: datetime.datetime.fromtimestamp(x))

In [None]:
# Load the per-asset metadata table; the bare name on the last line displays it as the cell output.
df_asset_details = pd.read_csv(data_folder + 'asset_details.csv')
df_asset_details

In [None]:
# Express each asset's weight as a percentage of the summed weights.
total_weight = df_asset_details["Weight"].sum()
df_asset_details["weight_percentage"] = (df_asset_details["Weight"] / total_weight) * 100
# Display the table ordered from the heaviest to the lightest asset (the frame itself is not reordered).
df_asset_details.sort_values("Weight", ascending=False)

In [None]:
# Pie chart of the weight_percentage column, with one slice per Asset_Name.
fig = px.pie(df_asset_details, values='weight_percentage', names='Asset_Name', title='Weights given to each cryptocurrency')
fig.show()

### Start & End Time
Let us now look at the start and end time of each of the given crypto currencies.

In [None]:
# Lookup from asset name to its Asset_ID, built from the asset details table.
asset_names_dict = dict(zip(df_asset_details["Asset_Name"], df_asset_details["Asset_ID"]))

# Assets to report on, in the order they appear in the resulting table.
asset_names = [
    'Bitcoin', 'Ethereum', 'Cardano', 'Binance Coin', 'Dogecoin',
    'Bitcoin Cash', 'Litecoin', 'Ethereum Classic', 'Stellar', 'TRON',
    'Monero', 'EOS.IO', 'IOTA', 'Maker',
]

# One [name, start, end] entry per asset. The first and last rows of the
# asset's slice are used, so this relies on df_train's row order.
time_list = []
for coin in asset_names:
    is_coin = df_train["Asset_ID"] == asset_names_dict[coin]
    coin_df = df_train[is_coin].set_index("timestamp")
    first_ts, last_ts = coin_df.index[0], coin_df.index[-1]
    # The integer timestamps are reinterpreted as datetimes with second resolution.
    start_time = first_ts.astype('datetime64[s]')
    end_time = last_ts.astype('datetime64[s]')
    time_list.append([coin, start_time, end_time])
time_df = pd.DataFrame(time_list, columns=["Asset_Name", "Start_Time", "End_Time"])
time_df

**Inference:**
* The earliest start date in the data is Jan 1, 2018 and most coins have that start datetime.
* The data is available till Sep 21, 2021 and all the coins have the same end datetime.
* Dogecoin has the least historical information available of the given coins and is captured only from April 2019.

### Missing data

Now let us check the missing data in each of these assets / coins.

In [None]:
# Number of missing values per column over the whole training frame, largest count first.
df_train.isnull().sum().sort_values(ascending = False)

In [None]:
# One row per asset: [name, number of rows, missing-value count of every column].
missing_list = []
for coin in asset_names:
    is_coin = df_train["Asset_ID"] == asset_names_dict[coin]
    coin_df = df_train[is_coin].set_index("timestamp")
    null_counts = coin_df.isna().sum().tolist()
    missing_list.append([coin, coin_df.shape[0], *null_counts])
missing_df = pd.DataFrame(missing_list)

missing_df

In [None]:
# Label the columns of missing_df: the asset name, its row count, then one
# missing-value count per column. The list must supply exactly one name per column.
missing_df.columns = ["Asset_Name", "TotalRows", 'Missing_Asset_ID', 'Missing_Count', 'Missing_Open', 
                      'Missing_High', 'Missing_Low', 'Missing_Close', 'Missing_Volume', 'Missing_VWAP', 'Missing_Target']

# Extracting time features

In [None]:


# df_train['year'] = pd.DatetimeIndex(df_train['date_time']).year
# df_train['month'] = pd.DatetimeIndex(df_train['date_time']).month
# df_train['month_day'] = pd.DatetimeIndex(df_train['date_time']).day
# df_train['weekday'] = pd.DatetimeIndex(df_train['date_time']).weekday   # Monday is 0 and Sunday is 6



In [None]:
def hlco_ratio(df):
    """Return the High-Low range divided by the Close-Open move."""
    price_range = df['High'] - df['Low']
    body = df['Close'] - df['Open']
    # NOTE(review): the denominator is zero whenever Close equals Open; with
    # pandas/numpy floats this presumably yields inf or NaN rather than
    # raising - confirm that downstream code copes with such values.
    return price_range / body


def upper_shadow(df):
    """Return High minus the larger of Close and Open."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """Return the smaller of Close and Open minus Low."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# Utility that builds the model features from the original frame.
# It also works on a single row (a Series), so it can be reused at prediction time.
def get_features(df):
    """Return a copy of the raw feature columns plus three derived columns.

    The derived columns are added in this order: 'Upper_Shadow',
    'Lower_Shadow' and 'hlco_ration' (the key is spelled that way on
    purpose here; it is the name used as a feature label).
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    features = df[base_columns].copy()
    derived = (
        ('Upper_Shadow', upper_shadow),
        ('Lower_Shadow', lower_shadow),
        ('hlco_ration', hlco_ratio),
    )
    for column, builder in derived:
        features[column] = builder(features)
    return features


In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
def get_xy_and_model_for_asset(df_train, asset_id, device="gpu"):
    """Fit a LightGBM regressor on the rows of a single asset.

    Parameters
    ----------
    df_train : DataFrame
        Training data; must contain 'Asset_ID', 'Target' and the columns
        read by ``get_features``.
    asset_id :
        Value compared against the 'Asset_ID' column to select the rows.
    device : str, optional
        Forwarded to ``LGBMRegressor(device=...)``. Defaults to ``'gpu'``
        (the previous hard-coded value); pass ``'cpu'`` to train on a
        machine without a GPU-enabled LightGBM build.

    Returns
    -------
    tuple
        ``(x, y, model)``: the feature frame, the target series and the
        fitted regressor.
    """
    df = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    df_proc = get_features(df)
    df_proc['y'] = df['Target']
    # Drop every row that has a missing feature or a missing target.
    df_proc = df_proc.dropna(how="any")

    x = df_proc.drop("y", axis=1)
    y = df_proc["y"]
    model = LGBMRegressor(device=device)
    model.fit(x, y)
    return x, y, model



In [None]:
# df_train.drop(['timestamp'], axis=1, inplace=True)

In [None]:
# Per-asset training results, keyed by Asset_ID.
xs, ys, models = {}, {}, {}

asset_pairs = zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name'])
for asset_id, asset_name in asset_pairs:
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    x, y, model = get_xy_and_model_for_asset(df_train, asset_id)
    xs[asset_id] = x
    ys[asset_id] = y
    models[asset_id] = model
#     try:
#         x, y, model = get_xy_and_model_for_asset(df_train, asset_id)    
#         xs[asset_id], ys[asset_id], models[asset_id] = x, y, model
#     except:         
#         xs[asset_id], ys[asset_id], models[asset_id] = None, None, None 

In [None]:
# Check the model interface: build the features of a single row and run them
# through the model stored under key 0.
# NOTE(review): this presumes that row 1 of df_train belongs to asset 0 - confirm.
x_ = get_features(df_train.iloc[1])
# predict() is given a one-row DataFrame built from the feature Series.
y_predict = models[0].predict(pd.DataFrame([x_]))
y_predict[0]

In [None]:
del df_train # drop the name bound to the training frame to release memory

In [None]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# For every test batch, fill df_pred['Target'] one row at a time and submit
# the batch through env.predict().
for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():
        # Rows of df_pred that belong to the current test row (computed once).
        row_mask = df_pred['row_id'] == row['row_id']

        # .get() makes an asset without an entry in `models` fall back to 0
        # instead of raising KeyError and aborting the whole submission.
        model = models.get(row['Asset_ID'])
        if model is None:
            df_pred.loc[row_mask, 'Target'] = 0
            continue

        try:
            x_test = get_features(row)
            y_pred = model.predict(pd.DataFrame([x_test]))[0]
        except Exception:
            # Best effort: report the failure and predict 0 for this row.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            traceback.print_exc()
            y_pred = 0
        df_pred.loc[row_mask, 'Target'] = y_pred
    env.predict(df_pred)