<!-- ![../input/gresearchpic/gresearch.png](attachment:8f9e16fc-c54c-4f53-8a06-5e816ea1b10b.png)  

<p style="text-align:center;"><span style="font-size:80px;"><span style="color:orange;"> <i>Crypto Forecasting</i> </span></span></p> -->
  
<span style="font-size:18px;"><span style="font-family:cursive;">
  <b>Author: Vincent Weimer <br>
      Date: 2022-01-22</b>
    </span></span>
  
<hr>

In [None]:
%%capture

## Import Libraries
import numpy as np # linear algebra
from numpy.random import seed 
import math 
from math import sqrt, log

from scipy.stats import normaltest

import pandas as pd # data processing 
# Display options: show wide frames in full instead of truncating them.
pd.options.display.max_rows = 100
pd.options.display.max_seq_items = 2000
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None (not -1) is the supported way to disable column-width truncation;
# negative values were deprecated in pandas 1.0 and are rejected by newer versions.
pd.set_option('display.max_colwidth', None)

import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mpl_dates
plt.rcParams.update({'font.size': 14})
import seaborn as sns
plt.style.use('seaborn')
sns.set_style('whitegrid')

# !pip install talib-binary # install talib for feature engineering 
# import talib
# from talib import RSI, BBANDS, MACD, ATR

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline

import warnings # Supress warnings 
warnings.filterwarnings('ignore')

# import statsmodels as sm

# import joblib

# Fix seed for reproducible results
SEED = 42
np.random.seed(SEED)

In [None]:
## Useful Functions

def read_dataset(folder, name, types=None):
    """Load ``<folder><name>.csv`` into a DataFrame.

    ``folder`` and ``name`` are concatenated as-is (no separator is inserted),
    so ``folder`` must already end with a path separator. ``types`` is
    forwarded to ``pandas.read_csv`` as its ``dtype`` argument.
    """
    csv_path = "".join((folder, name, ".csv"))
    return pd.read_csv(csv_path, dtype=types)

def resample_timeseries(df, offset='H'):
    """Resample a per-asset OHLCV time series to the frequency given by ``offset``.

    The frame is grouped by ``Asset_ID`` and ``Ticker`` and resampled on the
    ``timestamp`` column (obtained by resetting the index), with bins anchored
    at the first timestamp (``origin='start'``). Bins containing a missing
    aggregate are dropped. The result is indexed by ``timestamp`` and keeps
    ``Asset_ID`` and ``Ticker`` as regular columns.
    """
    # How each column is collapsed within one resampling bin.
    bin_rules = {
        "Open": "first",
        "Close": "last",
        "Low": "min",
        "High": "max",
        "Volume": "sum",
        "VWAP": "max"
    }
    column_order = ['Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']

    per_asset = df.reset_index().groupby(['Asset_ID', 'Ticker'])
    binned = per_asset.resample(offset, on='timestamp', origin='start').agg(bin_rules)
    binned = binned.dropna()[column_order]

    return binned.reset_index().set_index('timestamp')

def generate_crypto_dfs(df):
    """Split ``df`` into one frame per ticker.

    Returns a dict keyed by every entry of the module-level ``tickers`` list;
    each value holds the rows of ``df`` whose ``Ticker`` column equals that key.
    """
    frames_by_ticker = {}
    for ticker in tickers:
        frames_by_ticker[ticker] = df.query("Ticker == @ticker")
    return frames_by_ticker

def find_missing_rows(df, start, end, asset):
    """Print how many rows ``df`` has and how many minute timestamps it lacks.

    The expected timestamps are every minute from ``start`` to ``end``; those
    absent from ``df.index`` are counted as missing. Nothing is returned.
    """
    expected_minutes = pd.date_range(start=start, end=end, freq='min')
    num_missing_rows = len(expected_minutes.difference(df.index))
    num_rows = len(df.index)

    print(f"{asset} total number of records: {BOLD}{num_rows}{END}"
          f", Total number of missing minute records: {BOLD}{num_missing_rows}{END}")
    
def populate_missing_rows(df):
    """Reindex one asset's frame to a gap-free minute grid, forward-filling gaps.

    Missing minute rows are filled with the values of the last preceding row
    (``method='ffill'``), not with nulls. The number of missing rows is printed
    before and after filling.

    Parameters: ``df`` is a non-empty, time-indexed frame with an
    ``Asset_Name`` column; the first row's value is used in the messages.

    Returns the reindexed frame; the input frame itself is not modified.
    """
    asset, start_date, end_date = df.iloc[0]['Asset_Name'], df.index[0], df.index[-1]
    print(f"{asset} start date: {start_date}, end date: {end_date}")
    find_missing_rows(df, start_date, end_date, asset)

    # 'min' is the same minute alias find_missing_rows uses; the older 'T'
    # alias is deprecated in recent pandas releases.
    df = df.asfreq(freq='min', method='ffill')  # forward fill missing time series data
    print(f"Populating missing {asset} data...")
    find_missing_rows(df, start_date, end_date, asset)
    print('\n')

    return df

def close_close_volatility(df, window_size, N=365):  # N=365 because crypto markets don't close
    """Return the rolling close-to-close volatility scaled by ``sqrt(N)``.

    Side effect: a ``Log_Returns`` column (log of each close over the previous
    close) is written to ``df``. The returned Series is the rolling standard
    deviation of that column over ``window_size`` rows, times ``sqrt(N)``.
    """
    closes = df['Close']
    df['Log_Returns'] = np.log(closes / closes.shift(1))

    rolling_std = df['Log_Returns'].rolling(window_size).std()
    return rolling_std * np.sqrt(N)

def calc_daily_features(df):
    """Add daily technical-indicator columns to ``df`` and return the derived columns.

    ``df`` is modified in place. The return value is every column from
    position 8 onward (``df.iloc[:, 8:]``), so the result depends on the
    column order of the input frame.

    NOTE(review): RSI, BBANDS and ATR are not defined or imported anywhere in
    the visible file (the talib import is commented out), so this function
    raises NameError unless they are provided elsewhere - confirm.
    """
    
    # Integer date key in YYYYMMDD form, derived from the index.
    df['Date'] = pd.to_datetime(df.index.date)
    df['Date'] = df['Date'].dt.strftime('%Y%m%d').astype(int)
    
    # Momentum 
    df['Rsi_14d'] = RSI(df.Close, timeperiod=14)
    df['Bbands_upper'], df['Bbands_middle'], df['Bbands_lower'] = BBANDS(df.Close, timeperiod=10, nbdevup=2, nbdevdn=2)
    
    # SMA
    df['Sma_10d'] = df.Close.rolling(window=10).mean()
    df['Sma_20d'] = df.Close.rolling(window=20).mean()
    df['Sma_30d'] = df.Close.rolling(window=30).mean()
    
    # Volume moving average
    df['Avg_volume_10d'] = df['Volume'].rolling(window=10).mean()
    df['Avg_volume_20d'] = df['Volume'].rolling(window=20).mean()
    df['Avg_volume_30d'] = df['Volume'].rolling(window=30).mean()

    # Volatility
    # close_close_volatility also writes a 'Log_Returns' column to df, so that
    # column lands in the positional slice returned below.
    df['Close_close_vol_10d'] = close_close_volatility(df,10)
    df['Close_close_vol_20d'] = close_close_volatility(df,20)
    df['Close_close_vol_30d'] = close_close_volatility(df,30)
    df['Atr_14d'] = ATR(df.High, df.Low, df.Close, timeperiod=14)
    
    # Presumably skips the 8 columns produced by resample_timeseries
    # (Asset_ID, Ticker, Open, High, Low, Close, Volume, VWAP) so that 'Date'
    # is the first returned column - TODO confirm against the caller's frame.
    return df.iloc[:, 8:] 

def compute_feature_importance(model):
    """Plot the gain-based feature importance of a fitted LightGBM model.

    ``model`` must expose ``booster_.feature_importance`` (as a fitted
    ``LGBMRegressor`` does). Feature labels come from the module-level
    ``features`` list, so that list must have exactly one entry per model
    input, in training order; otherwise the column assignment fails on a
    length mismatch. Draws a horizontal bar chart sorted by importance and
    returns nothing.
    """
    fi_df = pd.DataFrame()
    fi_df['features'] = features
    fi_df['importance'] = model.booster_.feature_importance(importance_type="gain")

    # plot feature importance
    fig, ax = plt.subplots(1, 1, figsize=(7, 15))
    sns.barplot(
        x='importance',
        y='features',
        data=fi_df.sort_values(by=['importance'], ascending=False),
        ax=ax)

In [None]:
%%time 

def _smallest_int_dtype(c_min, c_max, candidates):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df): ## Copied directly from other Kagglers! ##
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness that holds their min and max (bounds inclusive).
    Float columns are cast to float16 or float32 when their range fits,
    otherwise float64. Every other dtype (object, bool, datetime, category,
    pandas extension types) is left untouched.

    NOTE: the float16/float32 downcast only checks the value range, not
    precision, so values may be rounded.

    Prints the memory usage before and after, and returns the same ``df``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are downcast. Sending bool or
        # unsigned columns through the float branch would turn them into
        # floats, and datetime columns cannot be compared with float limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            target = _smallest_int_dtype(c_min, c_max, candidates)
            # target is None when min/max are NaN (empty column): keep the dtype.
            if target is not None:
                df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# ANSI escape sequences used to colour / emphasise console output
text_formats = dict(
    PURPLE='\033[95m',
    CYAN='\033[96m',
    DARKCYAN='\033[36m',
    BLUE='\033[94m',
    GREEN='\033[92m',
    YELLOW='\033[93m',
    RED='\033[91m',
    BOLD='\033[1m',
    UNDERLINE='\033[4m',
    END='\033[0m',
)

# Module-level shortcuts for the codes used in print statements
RED, BOLD, END = (text_formats[key] for key in ('RED', 'BOLD', 'END'))

In [None]:
%%time 
## Import data sets
# Loads the example submission, example test set, asset details and raw
# training data from the competition folder, and builds the Asset_ID lookup
# dictionaries used by later cells.

# folder
data_folder = "../input/g-research-crypto-forecasting/"
!ls $data_folder

# tickers 
# NOTE(review): the list position is used as the Asset_ID key in
# mapping_tickers below, so the order must match the Asset_ID numbering
# in asset_details.csv - confirm.
tickers = ['BNB', 'BTC', 'BCH', 'ADA', 'DOGE', 'EOS', 'ETH', 'ETC', 'IOTA', 'LTC', 'MKR', 'XMR', 'XLM', 'TRX']

# data types
# Explicit dtypes for reading train.csv; 'timestamp' is read as a string here.
dtypes = {
    'timestamp': str,
    'Asset_ID': np.int8,
    'Count': np.int32,
    'Open': np.float64,
    'High': np.float64,
    'Low': np.float64,
    'Close': np.float64,
    'Volume': np.float64,
    'VWAP': np.float64,
    'Target': np.float64,
}

# data sets 
# example submission
example_submission_df = read_dataset(data_folder, "example_sample_submission")

# test df
test_df = read_dataset(data_folder, "example_test")
# timestamps are interpreted as seconds since the Unix epoch
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], unit='s') 

# asset details 
assets_df = read_dataset(data_folder, "asset_details")
assets_df = assets_df.sort_values(by='Asset_ID')

# Lookups keyed by Asset_ID: asset name, asset weight and ticker symbol.
mapping_name = dict(assets_df[['Asset_ID', 'Asset_Name']].values)
mapping_weight = dict(assets_df[['Asset_ID', 'Weight']].values)
mapping_tickers = dict(enumerate(tickers))  # {0: 'BNB', 1: 'BTC', ...}
assets_df['Ticker'] = assets_df["Asset_ID"].map(mapping_tickers)

# train df
train_df_raw = read_dataset(data_folder, "train", dtypes)

print(f"Shape example submission file: {example_submission_df.shape}")
print(f"Shape test data file: {test_df.shape}")
print(f"Shape asset details file: {assets_df.shape}")
print(f"Shape train data file: {train_df_raw.shape}")

train_df_raw.head()

In [None]:
# assets_df.head()

In [None]:
# test_df.head()

<b><i>Important! The test data starts at 2021-06-13, while the training data runs until 2021-09-21, so the two overlap. To prevent data leakage, train_df must be filtered to rows before the start of the test data.</i></b>

## Prepare first train data
Taking the raw train data as input, we first wrangle it with the following steps:
* Transform the Unix timestamp to a regular timestamp
* Extract simple date values such as date, year, and month
* Add columns from the asset details, including the ticker, which makes the DataFrames easier to filter
* Rename count to interval_trade_count

In [None]:
# %%time

# def prepare_train_data(df):
#     """This function takes in the raw training data and wrangles the data in the shape and types 
#     required for further research"""
    
#     CUTOFF_DATE = '2021-06-13 00:00:00'
    
#     df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')       # convert from unix to datetime timestamp
#     df = df.set_index('timestamp')                                    # set date as index column
    
#     dates = df.index.get_level_values('timestamp')
#     df['Date'] = pd.to_datetime(dates.date)
#     df['Date'] = df['Date'].dt.strftime('%Y%m%d').astype(int)         # SLOWS IT DOWN BY 3 MINS! NEED STH BETTER
#     df['Year'] = dates.year
#     df['Month'] = dates.month
    
#     df["Asset_Name"] = df["Asset_ID"].map(mapping_name)
#     df["Asset_Weight"] = df["Asset_ID"].map(mapping_weight)
#     df["Ticker"] = df["Asset_ID"].map(mapping_tickers)
    
#     df.sort_index()
#     df = df[(df.index < CUTOFF_DATE)]
    
#     total_num_records = df.shape[0]
#     print(f"Total number of records in train dataset: {total_num_records}")
    
#     return df

# df_train = prepare_train_data(train_df_raw)
# print(df_train.dtypes)
# df_train.head()

## Populate missing time series rows 

In [None]:
# %%time
# crypto_dfs = list()
    
# for i in list(range(14)):
#     symbol_df = df_train[df_train['Asset_ID'] == i]
#     crypto_dfs.append(populate_missing_rows(symbol_df))

# df_train = pd.concat(crypto_dfs).sort_index() 
# df_train.head()

In [None]:
# %%time
# # all_cryptos = generate_crypto_dfs(df_train)
# btc = df_train.query("Ticker == 'BTC'")
# btc.head()

In [None]:
# print(len(df_train.index))

In [None]:
%%time

### Import preprepped train data
folder_train_prepped = "../input/g-research/"
!ls $data_folder_prepped
df_train_prepped = pd.read_csv(folder_train_prepped + "df_train_prep.csv").set_index('timestamp') 
df_train_prepped.head()

In [None]:
# Show the dtype pandas inferred for each column of the pre-prepared training frame.
print(df_train_prepped.dtypes)

In [None]:
# for ticker in tickers:
#     ticker_df = df_train_prepped.query("Ticker == @ticker")
#     print(f"{ticker} contains how many inf VWAP values: {np.isinf(ticker_df.VWAP).values.sum()}")
#     ticker_df['VWAP']= ticker_df['VWAP'].replace(np.inf, ticker_df.VWAP.mean())
#     print(f"{ticker} contains how many inf VWAP values: {np.isinf(ticker_df.VWAP).values.sum()}")

# Feature Engineering

Features considered:

* Price transformation features
* RSI
* Historical Volatility
* Momentum features
* Rolling Average Volume

In [None]:
# Feature columns fed to the model. Every name must match a column created by
# calc_features_submission exactly: selecting an unknown name via
# pd.DataFrame(..., columns=...) silently yields an all-NaN column.
features_submission = ['VWAP', 'Avg_price', 'Weighted_close', 'Typical_price', 'Median_price', \
                      'Dollar_volume', 'Volume_per_trade', 'Dollar_volume_per_trade', 'Upper_shadow', 'Lower_shadow']

def calc_features_submission(df):
    """Add the price, volume and candle-shadow features to ``df`` and return it.

    ``df`` is modified in place and must provide ``Open``, ``High``, ``Low``,
    ``Close``, ``Volume`` and ``Count`` entries. The same object is returned.
    """
    open_, high, low, close = df['Open'], df['High'], df['Low'], df['Close']
    volume, trade_count = df['Volume'], df['Count']

    # Price transformation features
    df['Avg_price'] = (close + open_ + low + high) / 4
    df['Weighted_close'] = ((close * 2) + high + low) / 4  # extra weight on the close price
    df['Typical_price'] = (high + low + close) / 3
    df['Median_price'] = (high + low) / 2

    # Volume features
    df['Dollar_volume'] = close * volume
    df['Volume_per_trade'] = volume / trade_count
    df['Dollar_volume_per_trade'] = df['Dollar_volume'] / trade_count

    # Candle shadows: distance from the candle body to the high / low
    body_top = np.maximum(close, open_)
    body_bottom = np.minimum(close, open_)
    df['Upper_shadow'] = high - body_top
    df['Lower_shadow'] = body_bottom - low

    return df

In [None]:
def calc_features(df, daily=True):
    """Compute price features on ``df`` plus daily indicator features.

    ``df`` is modified in place: the price-transformation and volume columns
    are always added to it, even when only the daily features are returned.

    Parameters:
        df: time-indexed frame accepted by ``resample_timeseries``, with
            ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and ``Count``.
        daily: if True, return only the daily feature frame produced by
            ``calc_daily_features``. If False, left-join those daily features
            onto ``df`` via the ``Date`` column, restore the original
            timestamps as index, and drop rows without ``Close_close_vol_30d``.

    NOTE(review): the ``daily=False`` path merges on ``Date``, which this
    function does not add to ``df`` - the caller's frame must already contain
    it; confirm.
    """
    # Daily bars are built from the frame as passed in, before the feature
    # columns below are added to it.
    df_daily = resample_timeseries(df, 'D')

    # Price transformation features
    df['Avg_price'] = (df['Close'] + df['Open'] + df['Low'] + df['High']) / 4
    df['Weighted_close'] = ((df['Close'] * 2) + df['High'] + df['Low']) / 4  # extra weight on the close price
    df['Typical_price'] = (df['High'] + df['Low'] + df['Close']) / 3
    df['Median_price'] = (df['High'] + df['Low']) / 2

    df['Dollar_volume'] = df['Close'].mul(df['Volume'])
    df['Volume_per_trade'] = df['Volume'].div(df['Count'])
    df['Dollar_volume_per_trade'] = df['Dollar_volume'].div(df['Count'])

    df_features_daily = calc_daily_features(df_daily)

    if daily:
        return df_features_daily

    # Join daily features back to the original df, which contains minute data.
    # merge() resets the index, so the timestamps are kept in a helper column
    # and re-applied afterwards.
    df['timestamp'] = df.index
    df = df.merge(df_features_daily, on='Date', how='left').set_index(df['timestamp']).drop(columns='timestamp')
    df = df.dropna(subset=['Close_close_vol_30d'])
    return df

In [None]:
# btc_feat = calc_features(btc, False) 
# btc_feat.head()

# Modeling

In [None]:
%%time
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

# Feature labels used by compute_feature_importance
features = [
    'vwap',
    'weighted_close',
    'avg_price',
    'median_price',
    'typical_price',
    'sma_10d',
    'sma_20d',
    'rsi_14d',
    'atr_14d',
    'close_close_vol_10d',
    'close_close_vol_20d',
]

# Keyword arguments passed to lgb.LGBMRegressor
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    max_depth=-1,
    learning_rate=0.01,
)

def calc_Xy_and_model_for_ticker(df):
    """Build the feature matrix and target for one asset and fit a model on them.

    Parameters:
        df: non-empty frame for a single asset containing ``Asset_Name``,
            ``Target`` and the columns required by ``calc_features_submission``.
            The caller's frame is not modified.

    Returns:
        ``(X, y, model)`` where ``X`` holds the unscaled ``features_submission``
        columns, ``y`` the matching targets, and ``model`` is a fitted
        scikit-learn ``Pipeline`` (StandardScaler followed by LGBMRegressor).
        Because the scaler lives inside the pipeline, ``model.predict`` takes
        raw, unscaled feature rows and applies the same scaling as in training.
        The bare regressor is available as ``model.named_steps['lgbm']``.
    """
    name = df.iloc[0]['Asset_Name']

    # TODO: Try different features here!
    # Work on a copy so the caller's (possibly sliced) frame is left untouched.
    df_feat = calc_features_submission(df.copy())
    df_feat['y'] = df_feat['Target']

    # Replace inf values (Maker has a few inf VWAP values). Done after the
    # feature step so infs created by the ratio features are cleaned as well,
    # then drop every row with a missing value (including null targets).
    df_feat = df_feat.replace([np.inf, -np.inf], np.nan).dropna(how="any")

    X = pd.DataFrame(df_feat, columns=features_submission)
    y = df_feat["y"]

    # Scaling does not change the shape, so X.shape is the scaled shape too.
    print(f"Shape {name} X_scaled: {X.shape}")
    print(f"Shape {name} y: {y.shape}")

    # TODO: Try different models here!
    # Scaler and regressor are fitted and applied together; fitting the
    # regressor on scaled data while predicting on raw data would feed it
    # values on a different scale than it was trained on.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('lgbm', lgb.LGBMRegressor(**params, n_estimators=10)),
    ])
    print(f"Building model for {name}..")
    model.fit(X, y)

#     compute_feature_importance(model.named_steps['lgbm'])

    return X, y, model

In [None]:
# Per-ticker training artefacts: feature frames, targets and fitted models.
Xs, ys, models = {}, {}, {}

# The loop variable is named `symbol`, not `ticker`, so it does not rebind the
# module alias `ticker` (matplotlib.ticker) imported at the top of the notebook.
for symbol in assets_df['Ticker']:

    asset_df = df_train_prepped.query("Ticker == @symbol")
    Xs[symbol], ys[symbol], models[symbol] = calc_Xy_and_model_for_ticker(asset_df)

In [None]:
# btc = df_train.query("Ticker == 'BTC'")
# btc_ticker = btc.iloc[0]['Ticker']

# Sanity check: list the tickers for which a model was stored.
print(models.keys())

In [None]:
import gresearch_crypto

# Create the competition environment and its test iterator. The submission
# loop below unpacks each item of iter_test as a (df_test, df_pred) pair and
# hands the filled-in predictions back through env.predict.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()


# Submission

In [None]:

import traceback  # needed by the error handler below; not imported elsewhere in the notebook

# For every test batch, predict a Target per row with that row's ticker model
# and submit the batch. Rows without a model, or whose prediction fails, get 0.
for df_test, df_pred in iter_test:

    df_test['Ticker'] = df_test["Asset_ID"].map(mapping_tickers)

    for _, row in df_test.iterrows():

        row_mask = df_pred['row_id'] == row['row_id']

        # .get() so an unmapped or unknown ticker falls back to 0 instead of
        # raising KeyError outside the try block.
        model = models.get(row['Ticker'])
        if model is None:
            df_pred.loc[row_mask, 'Target'] = 0
            continue

        try:
            x_test = calc_features_submission(row)
            x_test = pd.DataFrame([x_test], columns=features_submission)
            y_pred = model.predict(x_test)[0]
        except Exception:
            # Best effort: log the failure and submit a neutral prediction
            # rather than aborting the whole submission.
            traceback.print_exc()
            y_pred = 0

        df_pred.loc[row_mask, 'Target'] = y_pred

    env.predict(df_pred)