In [None]:
# Necessary Imports

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# File/data manipulation
import gc
import pathlib
from tqdm.auto import tqdm
import joblib
import pathlib
import json
import glob
import time
import datetime
from scipy import stats
from multiprocessing import Pool, cpu_count

# Visualization
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib_venn import venn2, venn3
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')
import plotly.express as px
import plotly.graph_objects as go

# Model
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Filepaths
# INPUT_DIR: directory the competition CSV files are read from (joined with file names via os.path.join).
# OUTPUT_DIR: directory the fitted model pickle is written to; './' is the current working directory.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting/'
OUTPUT_DIR = './'

In [None]:
# Function to reduce memory usage
# Thanks fellow Kaggle User
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so the function can be
    used with ``DataFrame.pipe``. Only plain NumPy signed-integer,
    unsigned-integer and float columns are touched; bool, datetime, object and
    pandas extension dtypes (category, string, nullable ints, ...) are left
    exactly as they are instead of being coerced to floats or raising on the
    min/max comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits are
        stored as float16. Half precision keeps only an 11-bit significand, so
        large values are rounded coarsely; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not NumPy dtypes; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime, timedelta, object, ... are not downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No narrower float fits (out of range, infinite or all-NaN column):
            # fall back to float64. Integer columns are left unchanged.
            if kind == 'f':
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame that reports zero bytes does not print "nan%".
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Load the training data and downcast its numeric columns right away.
train_csv_path = os.path.join(INPUT_DIR, 'train.csv')
train = reduce_mem_usage(pd.read_csv(train_csv_path))
print(train.shape)
train.head()

In [None]:
# Load the per-asset metadata and store Asset_ID as int8.
asset_details = (
    pd.read_csv(os.path.join(INPUT_DIR, 'asset_details.csv'))
    .astype({'Asset_ID': np.int8})
)
print(asset_details.shape)
asset_details

In [None]:
# Look at G-Research's example_sample_submission
sample_submission_path = os.path.join(INPUT_DIR, 'example_sample_submission.csv')
example_sample_submission = pd.read_csv(sample_submission_path)
print(example_sample_submission.shape)
example_sample_submission.head()

In [None]:
# Get "test" data. Note: this is just an example of the data that G-Research's API will deliver to test the model.
# %%time

# Load the example test file and downcast it with the same helper as the training data.
example_test_path = os.path.join(INPUT_DIR, 'example_test.csv')
test_df = reduce_mem_usage(pd.read_csv(example_test_path))
print(test_df.shape)
test_df.head()

# Look at some of the data

In [None]:
# Print pandas' built-in summary (columns, dtypes, memory usage) of the training frame.
train.info()

In [None]:
# Count the missing (NaN) entries in each column of the training frame.
train.isna().sum()

In [None]:
# Count the rows whose Target is missing once, then report the count and its share of all rows.
null_target_rows = train["Target"].isnull().sum()
print("Total Null Target Rows = " , null_target_rows)
print("Percentage of NUll rows in Training Data = {:.2f}%".format(null_target_rows * 100 / train.shape[0]))

In [None]:
# Print pandas' built-in summary (columns, dtypes, memory usage) of the example submission frame.
example_sample_submission.info()

In [None]:
# Rows per asset, counted in a single pass over `train` and aligned to the
# Asset_ID order that also drives the x-axis names. This avoids a hard-coded
# number of assets and keeps each bar paired with the right name.
assets_sorted = asset_details.sort_values("Asset_ID")
asset_count = (
    train["Asset_ID"]
    .value_counts()
    .reindex(assets_sorted["Asset_ID"].to_numpy(), fill_value=0)  # assets with no rows get 0
    .tolist()
)

fig = px.bar(x = assets_sorted["Asset_Name"],
             y = asset_count ,
             color = asset_count ,
             color_continuous_scale="Emrld")
fig.update_xaxes(title="Assets")
fig.update_yaxes(title = "Number of Rows")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Data Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Disabled plot: the per-asset Target histogram grid below is wrapped in a string
# literal, so none of it executes; the cell only evaluates (and echoes) the string.
'''fig, ax = plt.subplots(3, 5, figsize=(20, 12), sharex=True)
ax = ax.flatten()
for i, asset in enumerate(train['Asset_ID'].unique()):
    train.query('Asset_ID == @asset')['Target'].hist(bins=30, color='k', alpha=0.7, ax=ax[i])
    asset_name = asset_details.query('Asset_ID == @asset')['Asset_Name'].values[0]
    weight = asset_details.query('Asset_ID == @asset')['Weight'].values[0]
    ax[i].set_title(f'{asset_name}\n(weight={weight})')
    
ax[-1].axis('off')
plt.tight_layout()'''

In [None]:
# select train and validation period

# auxiliary function, from a "MM/DD/YYYY" date string to a Unix timestamp
def totimestamp(s):
    """Return the Unix timestamp (seconds) of midnight UTC on the date ``s``.

    Parameters
    ----------
    s : str
        Date formatted as ``"%m/%d/%Y"``, e.g. ``"06/13/2021"``.

    Returns
    -------
    numpy.int32
        Seconds since the Unix epoch for 00:00:00 UTC on that date.

    Raises
    ------
    ValueError
        If ``s`` does not match the ``"%m/%d/%Y"`` format.
    """
    # The date is pinned to UTC explicitly. time.mktime() would interpret it in
    # the kernel's local timezone, shifting the window bounds on any machine
    # that is not configured for UTC.
    midnight_utc = datetime.datetime.strptime(s, "%m/%d/%Y").replace(tzinfo=datetime.timezone.utc)
    return np.int32(midnight_utc.timestamp())

# Window bounds as Unix timestamps: [start, end] for the training and validation periods.
# NOTE(review): train_window is defined here but not referenced again in this cell;
# every row outside valid_window ends up flagged as training data.
train_window = [totimestamp(day) for day in ("01/01/2018", "06/12/2021")]
valid_window = [totimestamp(day) for day in ("06/13/2021", "09/21/2021")]
#train_window = [totimestamp("01/01/2018"), totimestamp("09/21/2020")]
#valid_window = [totimestamp("09/22/2020"), totimestamp("09/21/2021")]

# Index by timestamp so the validation rows can be selected with a label slice.
train = train.set_index("timestamp")
beg_, end_ = (train.index[pos].astype('datetime64[s]') for pos in (0, -1))
print('>> data goes from ', beg_, 'to ', end_, 'shape=', train.shape)

# drop rows without target
train = train.dropna(subset=['Target'])

# add train flag: 1 everywhere, then 0 inside the (inclusive) validation window
valid_start, valid_end = valid_window
train['train_flg'] = 1
train.loc[valid_start:valid_end, 'train_flg'] = 0

In [None]:
def add_asset_details(train, asset_details):
    """Left-join the asset metadata onto the training rows by ``Asset_ID``.

    A merge on a column discards the left frame's index. When ``train`` carries
    a named index (the notebook indexes it by ``timestamp``), that index is
    first moved back into a regular column so its values survive the merge.
    An unnamed index is dropped, as before.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows to enrich; must contain (or be indexed by) ``Asset_ID``.
    asset_details : pandas.DataFrame
        One row per asset, keyed by ``Asset_ID``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index, the rows of ``train`` in their
        original order, and the columns of ``asset_details`` appended. ``train``
        itself is not modified.
    """
    index_name = train.index.name
    if index_name is not None and index_name not in train.columns:
        # reset_index returns a copy, so the caller's frame is left untouched.
        train = train.reset_index()

    return train.merge(
        asset_details
        , how='left'
        , on='Asset_ID'
    )

# merge asset_details
# Rebinds `train` to the merged frame, which gains the asset_details columns matched on Asset_ID.
train = add_asset_details(train, asset_details)

In [None]:
def get_row_feats(df):
    """Add per-row price/volume ratio features to ``df`` in place and return it.

    Reads the ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and ``Count``
    columns and appends ten derived columns, in a fixed order. The same frame
    object that was passed in is returned.
    """
    open_close = df[['Close', 'Open']]
    body_top = open_close.max(axis=1)
    body_bottom = open_close.min(axis=1)

    ohlc = df[['Open', 'High', 'Low', 'Close']]
    mean_price = ohlc.mean(axis=1)
    median_price = ohlc.median(axis=1)

    # Every feature depends only on the raw input columns, so all of them can
    # be computed up front; the dict order fixes the order of the new columns.
    row_feats = {
        'upper_shadow': df['High'] / body_top,
        'lower_shadow': body_bottom / df['Low'],
        'open2close': df['Close'] / df['Open'],
        'high2low': df['High'] / df['Low'],
        'high2mean': df['High'] / mean_price,
        'low2mean': df['Low'] / mean_price,
        'high2median': df['High'] / median_price,
        'low2median': df['Low'] / median_price,
        'volume2count': df['Volume'] / (df['Count'] + 1),  # +1 keeps the divisor non-zero when Count is 0
        'opensubclose': df['Open'] - df['Close'],
    }
    for feat_name, feat_values in row_feats.items():
        df[feat_name] = feat_values

    return df

In [None]:
%%time

# feature engineering
# get_row_feats adds its columns to the frame it receives and returns that same
# object, so `feature_df` and `train` refer to one DataFrame after this call.
feature_df = get_row_feats(train)

print(feature_df.shape)
feature_df.tail()

In [None]:
# Column roles: what is predicted, what is excluded from the model inputs,
# and which inputs LightGBM should treat as categorical.
target = 'Target'
drops = ['timestamp', 'Asset_Name', 'Weight', 'train_flg', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
categoricals = ['Asset_ID']

# Model inputs are every remaining column of `train`, in frame order.
excluded = set(drops) | {target}
features = [col for col in train.columns if col not in excluded]

print('{:,} features: {}'.format(len(features), features))

In [None]:
# train (full model)
# `subsample` is LightGBM's alias for `bagging_fraction`. The original call set
# both (subsample=0.72 and bagging_fraction=0.4); when the canonical name and an
# alias are both given, LightGBM keeps the canonical one and warns that the alias
# is ignored. The dead `subsample=0.72` is therefore dropped so the row-sampling
# rate has a single value (0.4, applied every `subsample_freq`=4 iterations).
model = LGBMRegressor(n_estimators=10000,      # upper bound; early stopping at fit time picks the actual count
                      objective='regression',
                      metric='rmse',
                      boosting_type='gbdt',
                      max_depth=-1,            # no depth limit
                      learning_rate=0.01,
                      subsample_freq=4,
                      feature_fraction=0.4,
                      bagging_fraction=0.4,
                      lambda_l1=1,
                      lambda_l2=1,
                      seed=46,)

In [None]:
def _split_xy(flag):
    """Return (feature frame, target array) for the rows whose train_flg equals ``flag``."""
    part = feature_df.query('train_flg == {}'.format(flag))
    return part[features], part[target].values


# Filter each split once instead of re-querying the full frame for X and again for y.
X_train, y_train = _split_xy(1)
X_valid, y_valid = _split_xy(0)

# NOTE(review): `verbose` and `early_stopping_rounds` as fit() keyword arguments are
# believed to be removed in LightGBM 4.x (replaced by callbacks) — confirm against the
# installed version before upgrading.
model.fit(X_train,
          y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=-1,
          early_stopping_rounds=100,
          categorical_feature=categoricals,)

# The split copies are not needed once the model is fitted.
del X_train, y_train, X_valid, y_valid

# save model
joblib.dump(model, os.path.join(OUTPUT_DIR, 'lgb_model_val.pkl'))
print('lgb model saved!')

# feature importance (total gain per feature, in the order of `features`)
fi_df = pd.DataFrame()
fi_df['features'] = features
fi_df['importance'] = model.booster_.feature_importance(importance_type="gain")

In [None]:
# Plot feature importance as horizontal bars, largest importance first.
fig, ax = plt.subplots(1, 1, figsize=(7, 15))
fi_sorted = fi_df.sort_values(by=['importance'], ascending=False)
sns.barplot(data=fi_sorted, x='importance', y='features', ax=ax)

In [None]:
# https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/291845

def weighted_correlation(a, b, weights):
    """Return the weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened to 1-D arrays first. The weighted means,
    variances and covariance are each normalised by the sum of the weights.
    """
    w, x, y = (np.ravel(values) for values in (weights, a, b))

    total_w = np.sum(w)
    x_bar = np.sum(x * w) / total_w
    y_bar = np.sum(y * w) / total_w

    x_var = np.sum(w * np.square(x - x_bar)) / total_w
    y_var = np.sum(w * np.square(y - y_bar)) / total_w

    # Covariance as E_w[xy] - E_w[x] * E_w[y].
    covariance = np.sum(x * y * w) / total_w - x_bar * y_bar

    return covariance / np.sqrt(x_var * y_var)

# Evaluating the model by computing the weighted correlation on the validation rows
model = joblib.load(os.path.join(OUTPUT_DIR, 'lgb_model_val.pkl'))
val_df = train.query('train_flg == 0').copy()
val_df['Prediction'] = model.predict(val_df[features])

# Per-asset scores; sort=False visits the assets in order of first appearance.
for asset, tmp in val_df.groupby('Asset_ID', sort=False):
    coin = tmp['Asset_Name'].values[0]
    corr = weighted_correlation(tmp['Prediction'], tmp['Target'], tmp['Weight'])
    print('')
    print('- {}: Validation Score (weighted correlation) = {:.4f}'.format(coin, corr))

# Overall score across all validation rows, weighted by each row's asset weight.
corr = weighted_correlation(val_df['Prediction'], val_df['Target'], val_df['Weight'])
print('=> Overall Validation Score (weighted correlation) = {:.4f}'.format(corr))

In [None]:
# Submission loop: for every batch yielded by the competition API, build the row
# features, predict with `model`, and hand the filled-in frame back via env.predict.
# Note that the loop variable reuses the name `test_df`, replacing the example test
# frame loaded earlier in the notebook.
import gresearch_crypto
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # feature engineering
    test_df = get_row_feats(test_df)
    
    # inference
    # NOTE(review): predictions are assigned positionally, so this presumably relies on
    # test_df and sample_prediction_df having their rows in the same order — confirm.
    sample_prediction_df['Target'] = model.predict(test_df[features])  # make your predictions here
    
    # register your predictions
    env.predict(sample_prediction_df)