In [None]:
import pandas as pd
import numpy as np
import dateutil
import os
import time
import datetime
import xgboost as xgb
import time
import pickle

# 0. Functions

In [None]:
def todatetime(t):
    """
    Convert a Unix timestamp (seconds since the epoch) to a naive UTC datetime.

    The conversion is pinned to UTC so the result does not depend on the
    local time zone of the machine running the notebook. The value is
    returned without tzinfo because the date filters in this notebook
    compare against naive ``datetime.datetime`` objects.

    Args:
        t: Unix timestamp in seconds.

    Returns:
        datetime.datetime: Naive datetime holding the UTC wall-clock time.
    """
    utc_time = datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc)
    # Drop tzinfo so the value stays comparable with naive datetimes.
    return utc_time.replace(tzinfo=None)

def reindex_by_date(df):
    """
    Put a time series on a regular grid with a step of 60.

    The grid runs from the first index label to the last one (inclusive).
    Labels missing from ``df`` are filled with the most recent earlier row
    (``method='pad'``).

    Args:
        df: Frame with a sorted integer index
            (presumably Unix seconds - confirm against the caller).

    Returns:
        The frame reindexed on the regular grid.
    """
    step = 60
    first_label = df.index[0]
    last_label = df.index[-1]
    # Stop one step past the last label so the last label itself is included.
    regular_grid = range(first_label, last_label + step, step)
    return df.reindex(regular_grid, method='pad')

def apply_ta_strategy(data, ta_strategy):
    """
    Run a technical-indicator strategy separately for every asset.

    Args:
        data: Frame with an ``Asset_ID`` column.
        ta_strategy: Strategy object handed to ``DataFrame.ta.strategy``.

    Returns:
        One frame with the per-asset results stacked, index renumbered.
    """
    per_asset_frames = []

    for asset_id in data['Asset_ID'].unique():
        belongs_to_asset = data['Asset_ID'] == asset_id
        asset_frame = data[belongs_to_asset].copy(deep=True)
        # NOTE(review): ``.ta`` is not a built-in pandas accessor; presumably
        # registered by the pandas_ta package, which this file does not import.
        asset_frame.ta.strategy(ta_strategy)
        per_asset_frames.append(asset_frame)

    return pd.concat(per_asset_frames, ignore_index=True)

def build_fourier_time_features(df : pd.DataFrame, 
                                time_levels: list, 
                                max_levels: list, 
                                drop_columns = False):
    """
    Add cyclical (sine) encodings of the calendar parts of ``df['timestamp']``.

    For every supported level a column ``<level>_sin`` holding
    ``sin(2 * pi * value / max_level)`` is written into ``df``, so the
    encoding completes one full period every ``max_level`` units.

    Args:
        df: Frame with a datetime ``timestamp`` column; modified in place.
        time_levels: Levels to encode. Supported: 'month', 'day', 'hour',
            'minute'. Other values are reported by the print and skipped.
        max_levels: Period length for each entry of ``time_levels``
            (e.g. 24 for 'hour'); paired positionally.
        drop_columns: Currently unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame: The same ``df`` object, for convenient chaining.
    """
    supported_levels = ('month', 'day', 'hour', 'minute')

    for time_level, max_level in zip(time_levels, max_levels):

        print("Generating Transformation for", time_level)

        if time_level not in supported_levels:
            continue

        # e.g. df['timestamp'].dt.hour for time_level == 'hour'
        values = getattr(df['timestamp'].dt, time_level)
        # The angle must scale with the value (2*pi*x/T); adding 2*pi to x/T
        # would just yield sin(x/T), which is not periodic in T.
        df[f"{time_level}_sin"] = np.sin(2 * np.pi * values / max_level)

    return df
    
def lag_features(data, n_lags, ref_variable):
    """
    Add lagged copies of a column, shifted within each asset.

    Args:
        data: Frame with an ``Asset_ID`` column; new columns are written
            into it.
        n_lags: Iterable of shift sizes, or None to add nothing.
        ref_variable: Name of the column to lag.

    Returns:
        tuple: ``(data, names)`` where ``names`` lists the created columns
        (``<ref_variable>_lag_<lag>``) in creation order.
    """
    created_columns = []

    # Nothing requested: hand the frame back untouched.
    if n_lags is None:
        return data, created_columns

    for shift_by in n_lags:
        lagged_name = f'{ref_variable}_lag_{shift_by}'
        grouped_values = data.groupby(['Asset_ID'])[ref_variable]
        # Shift inside each asset so values never leak across assets.
        data.loc[:, lagged_name] = grouped_values.transform(
            lambda series: series.shift(shift_by))
        created_columns.append(lagged_name)

    return data, created_columns


def calculate_returns(data, variable, lags, binary_lags, outlier_cutoff):
    """
    Add per-asset, per-period return features for ``variable``.

    For every lag the percentage change over ``lag`` rows is computed within
    each asset (rows ordered by timestamp), winsorised at the
    ``outlier_cutoff`` / ``1 - outlier_cutoff`` quantiles of the whole
    column, and converted to a per-row geometric average
    ``(1 + r) ** (1 / lag) - 1``.

    Args:
        data: Frame with ``timestamp`` and ``Asset_ID`` columns. It is not
            modified; a new frame is returned.
        variable: Column the returns are computed on.
        lags: Iterable of positive integer lags.
        binary_lags: If true, store 1 where the return is > 0 and 0
            otherwise (missing returns become 0) instead of the value.
        outlier_cutoff: Quantile used for clipping on both tails.

    Returns:
        pandas.DataFrame: Input rows (original order) plus one column
        ``<variable>_return_<lag>m`` per lag, with every row containing a
        missing value in any column dropped and the index reset.
    """
    indexed = data.set_index(['timestamp', 'Asset_ID'])

    # Sorting by (timestamp, Asset_ID) orders every asset's rows by time.
    series = indexed[variable].sort_index()

    return_frames = []
    for lag in lags:
        # Group by asset: on the sorted series neighbouring rows belong to
        # different assets, so an ungrouped pct_change would mix assets.
        pct = series.groupby(level='Asset_ID').pct_change(lag)

        lower = pct.quantile(outlier_cutoff)
        upper = pct.quantile(1 - outlier_cutoff)

        period_return = (pct.clip(lower=lower, upper=upper)  # cut off outliers
                            .add(1)
                            .pow(1 / lag)                    # lag-th root
                            .sub(1))

        if binary_lags:
            # NaN > 0 is False, so missing returns map to 0.
            period_return = (period_return > 0).astype(int)

        return_frames.append(period_return.to_frame(f'{variable}_return_{lag}m'))

    if return_frames:
        # Index-aligned left join keeps the row order of the input frame.
        indexed = indexed.join(pd.concat(return_frames, axis=1))

    return indexed.dropna().reset_index()

def rsi(x: pd.Series, periods = 13):
    """
    Relative Strength Index of a series.

    Gains and losses of the one-step difference are smoothed with an
    exponentially weighted mean and combined as ``100 - 100 / (1 + RS)``,
    where RS is average gain divided by average loss.

    Args:
        x: Input series.
        periods: Smoothing length; also the minimum number of observations
            before a value is produced.

    Returns:
        pandas.Series: RSI values, NaN until enough observations exist.
    """
    change = x.diff(1)
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    # com = periods - 1 is the same as alpha = 1 / periods.
    smoothing = dict(com=periods - 1, min_periods=periods, adjust=False)
    average_gain = gains.ewm(**smoothing).mean()
    average_loss = losses.ewm(**smoothing).mean()

    relative_strength = (average_gain / average_loss).abs()
    return 100 - 100 / (1 + relative_strength)

def _per_asset_rolling(df, column, window, stat):
    """Rolling ``stat`` ('mean' or 'std') of ``column`` over ``window`` rows, within each Asset_ID."""
    return (df.groupby(['Asset_ID'])[column]
              .transform(lambda x: getattr(x.rolling(window=window), stat)()))


def _per_asset_ewm(df, column, alpha, stat):
    """Exponentially weighted ``stat`` ('mean' or 'std') of ``column``, within each Asset_ID."""
    return (df.groupby(['Asset_ID'])[column]
              .transform(lambda x: getattr(x.ewm(alpha=alpha), stat)()))


def calculate_rolling_features(df: pd.DataFrame, 
                            target_variable: str, 
                            windows: list, 
                            alpha: float = 0.8,
                            indicator = None, ):
    """Calculate rolling features for a target variable, per asset.

    All statistics are computed within each ``Asset_ID`` group and written
    into ``df`` as new columns.

    Args:
        df (pandas.DataFrame): input frame with an ``Asset_ID`` column;
            modified in place.
        target_variable (str): column the features are computed on.
        windows (list): rolling window sizes used by 'sma', 'std', 'cv' and
            'bbands'. ``None`` is treated as "no windows".
        alpha (float): smoothing factor for the exponentially weighted
            features ('wma_mean', 'wma_std').
        indicator (list): any of 'sma', 'std', 'wma_mean', 'wma_std', 'cv'
            and 'bbands'. ``None`` (default) adds nothing.
            'cv' is std / sma per window. 'bbands' ignores
            ``target_variable``: it builds ``Typical_Price`` from the
            ``High``, ``Low`` and ``Close`` columns and adds
            ``bblow_<w>`` / ``bbhigh_<w>`` at two standard deviations.

    Returns:
        pandas.DataFrame: the same frame with the calculated features added.
    """
    # Fresh containers instead of shared mutable defaults.
    indicator = [] if indicator is None else indicator
    windows = [] if windows is None else windows

    if 'sma' in indicator:
        for w in windows:
            df[f'{target_variable}_sma_{w}'] = _per_asset_rolling(
                df, target_variable, w, 'mean')

    if 'std' in indicator:
        for w in windows:
            df[f'{target_variable}_std_{w}'] = _per_asset_rolling(
                df, target_variable, w, 'std')

    if 'wma_mean' in indicator:
        df[f'{target_variable}_wma_mean'] = _per_asset_ewm(
            df, target_variable, alpha, 'mean')

    if 'wma_std' in indicator:
        df[f'{target_variable}_wma_std'] = _per_asset_ewm(
            df, target_variable, alpha, 'std')

    if 'cv' in indicator:
        # The coefficient of variation needs both sma and std columns;
        # build them here unless both were already requested above.
        if 'sma' not in indicator or 'std' not in indicator:
            for w in windows:
                df[f'{target_variable}_sma_{w}'] = _per_asset_rolling(
                    df, target_variable, w, 'mean')
                df[f'{target_variable}_std_{w}'] = _per_asset_rolling(
                    df, target_variable, w, 'std')
        for w in windows:
            df[f'{target_variable}_cv_{w}'] = (df[f'{target_variable}_std_{w}']
                                               / df[f'{target_variable}_sma_{w}'])

    if 'bbands' in indicator:
        # Bollinger Bands on the typical price.
        # Ref: https://www.investopedia.com/terms/b/bollingerbands.asp
        price = 'Typical_Price'
        df[price] = ((df['High'] + df['Low'] + df['Close']) / 3)

        for w in windows:
            df[f'{price}_sma_{w}'] = _per_asset_rolling(df, price, w, 'mean')
            df[f'{price}_std_{w}'] = _per_asset_rolling(df, price, w, 'std')

            df[f'bblow_{w}'] = df[f'{price}_sma_{w}'] - 2 * df[f'{price}_std_{w}']
            df[f'bbhigh_{w}'] = df[f'{price}_sma_{w}'] + 2 * df[f'{price}_std_{w}']

    return df

def momemtum_features(data, variable_returns, return_lags):
    """
    Generate momentum features as differences between return columns.

    The first two entries of ``return_lags`` act as base lags. For every
    lag greater than a base lag, the column ``momentum_<base>_<lag>`` is
    set to ``<variable_returns>_return_<lag>m`` minus
    ``<variable_returns>_return_<base>m``.

    Args:
        data: Frame holding the return columns; modified in place.
        variable_returns: Prefix of the return columns (e.g. 'Close').
        return_lags: Lags for which return columns exist. Fewer than two
            entries are allowed; there are then fewer (or no) base lags.

    Returns:
        The same frame with the momentum columns added.
    """
    # Slicing (not indexing) keeps short lag lists from raising IndexError.
    base_lags = return_lags[:2]

    for lag in return_lags:
        for base in base_lags:
            if lag > base:
                longer = data[f'{variable_returns}_return_{lag}m']
                shorter = data[f'{variable_returns}_return_{base}m']
                data[f'momentum_{base}_{lag}'] = longer.sub(shorter)

    return data

class MultipleTimeSeriesCV:
    """
    Generates tuples of train_idx, test_idx pairs.

    Assumes the index of ``X`` contains a date level named ``date_idx``.
    Splits are built backwards from the most recent date, and a gap derived
    from ``lookahead`` separates train and test periods so that overlapping
    outcomes are purged.

    Args:
        n_splits: Number of train/test pairs to generate.
        train_period_length: Number of distinct dates per training period.
        test_period_length: Number of distinct dates per test period.
        lookahead: Forecast horizon in dates. ``None`` is treated as 1,
            i.e. the training period ends where the test period starts.
        date_idx: Name of the index level that holds the dates.
        shuffle: If true, the training positions are returned in random
            order (drawn from numpy's global random state).
    """
    
    def __init__(self,
                n_splits=3,
                train_period_length=126,
                test_period_length=21,
                lookahead=None,
                date_idx = 'date',
                shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx
    
    def split(self, X, y=None, groups=None):
        """Yield ``(train_positions, test_positions)`` as numpy arrays of row positions in ``X``."""
        # Without this fallback the default lookahead=None fails on the
        # integer arithmetic below.
        lookahead = 1 if self.lookahead is None else self.lookahead

        unique_dates = X.index.get_level_values(self.date_idx).unique()
        # Most recent date first, so offset 0 is the end of the newest test set.
        days = sorted(unique_dates, reverse=True)
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + lookahead - 1
            train_start_idx = train_end_idx + self.train_length + lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                             test_start_idx, test_end_idx])
        
        dates = X.reset_index()[[self.date_idx]]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates[self.date_idx] > days[train_start])
                             & (dates[self.date_idx] <= days[train_end])].index
            test_idx = dates[(dates[self.date_idx] > days[test_start])
                            & (dates[self.date_idx] <= days[test_end])].index

            train_positions = train_idx.to_numpy()
            if self.shuffle:
                # permutation returns a shuffled copy; shuffling a temporary
                # list would leave the yielded positions untouched.
                train_positions = np.random.permutation(train_positions)
            yield train_positions, test_idx.to_numpy()
    
    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; the arguments are ignored."""
        return self.n_splits
    
def feature_selection(df, train_range, test_range, X_columns, forecast_variable):
    """
    Split a frame into train/test windows and pick feature and target columns.

    Args:
        df: Frame whose index levels include ``timestamp`` and ``Asset_ID``.
            Its index is reset in place, so the caller's frame is modified.
        train_range: ``[start, end]``; rows with start <= timestamp < end.
        test_range: ``[start, end]``; rows with start <= timestamp <= end.
        X_columns: Feature column names; names missing from the frame are
            silently ignored.
        forecast_variable: Name of the target column.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``, each indexed by
        (timestamp, Asset_ID).
    """
    # Move the index levels back into columns so timestamp can be filtered.
    df.reset_index(inplace=True)

    index_levels = ['timestamp', 'Asset_ID']
    stamps = df['timestamp']

    # The training window is half-open, the test window closed on both ends.
    in_train = (stamps >= train_range[0]) & (stamps < train_range[1])
    in_test = (stamps >= test_range[0]) & (stamps <= test_range[1])

    train = df[in_train].copy(deep=True).set_index(index_levels)
    test = df[in_test].copy(deep=True).set_index(index_levels)

    target_columns = [forecast_variable]
    X_train, Y_train = train.filter(X_columns), train.filter(target_columns)
    X_test, Y_test = test.filter(X_columns), test.filter(target_columns)

    return X_train, X_test, Y_train, Y_test

def convert_to_float32(df):
    """
    Cast every column of ``df`` to float32, in place.

    Args:
        df: Frame whose columns can all be cast to float32.

    Returns:
        The same frame, with all columns stored as float32.
    """
    for col in df.columns:
        # Replace the column outright. An in-place ``df.loc[:, col] = ...``
        # write keeps the existing dtype on recent pandas versions, which
        # silently undoes the cast.
        df[col] = df[col].astype('float32')

    return df

# 1. Data Preprocessing

In [None]:
# Root folder of the competition data set
data_path = "/kaggle/input/g-research-crypto-forecasting/"

# First two columns of the asset details file, indexed by Asset_ID
df_weights = (
    pd.read_csv(os.path.join(data_path, 'asset_details.csv'), usecols=[0, 1])
      .set_index('Asset_ID')
)

# The complete asset details table
df_asset_details = pd.read_csv(
    os.path.join(data_path, "asset_details.csv")
)

# Supplemental training data
df_supp_train = pd.read_csv(
    os.path.join(data_path, "supplemental_train.csv")
)

In [None]:
# Training set and example test set, read from the same data folder
data, df_test = (
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in ("train.csv", "example_test.csv")
)

The example test dataset covers 13 June 2021, so the models must be trained only on data from before that date.

In [None]:
# Show the distinct timestamps of the example test set, converted from
# Unix time to datetime
print(pd.Series(df_test['timestamp'].unique()).apply(todatetime))

In [None]:
# Fill gaps per asset: reindex every asset onto the regular grid built by
# reindex_by_date, padding missing rows with the previous one
data = (
    data.set_index('timestamp')
        .groupby(['Asset_ID'], group_keys=False)
        .apply(reindex_by_date)
        .reset_index()
)

# Unix timestamp -> datetime
data['timestamp'] = data['timestamp'].apply(todatetime)

# Number of days of history kept before the test date (2021-06-13)
historical_days = 5

# Window kept for the rest of the notebook: from `historical_days` before
# the test date up to (not including) `latest_date`.
# Alternative: anchor the window at latest_date = 2021-06-13 and subtract
# `historical_days` from it.
latest_date = datetime.datetime(2021, 11, 1)
date_filter = (
    datetime.datetime(2021, 6, 13)
    - dateutil.relativedelta.relativedelta(days=historical_days)
)

# Keep rows with date_filter <= timestamp < latest_date
data = data[
    data['timestamp'].ge(date_filter) & data['timestamp'].lt(latest_date)
].copy(deep=True)

In [None]:
# Earliest timestamp left in the filtered data
data['timestamp'].min()

In [None]:
# Number of rows per asset (length of the Close column in each group)
data.groupby('Asset_ID').agg({'Close':len})

In [None]:
# Historical data: independent snapshot of the preprocessed frame, taken
# before any feature columns are added to `data`
df_hist = data.copy(deep = True)

# 2. Feature Engineering

In [None]:
def get_technical_indicators(data, technical_indicators):
    """
    Add the configured technical indicators to ``data``.

    Only the 'rsi' indicator is implemented; other keys are ignored.

    Args:
        data: Frame with an ``Asset_ID`` column; modified in place.
        technical_indicators: Mapping of indicator name to its settings.
            For 'rsi' the settings need 'variable' (source column) and
            'periods'.

    Returns:
        The same frame; for 'rsi' a column ``rsi_<periods>`` is added,
        computed separately for each asset.
    """
    for name, settings in technical_indicators.items():
        if name != 'rsi':
            continue

        source_column = settings['variable']
        periods = settings['periods']

        values_by_asset = data.groupby(['Asset_ID'])[source_column]
        data[f'rsi_{periods}'] = values_by_asset.transform(
            lambda x: rsi(x, periods=periods))

    return data

def get_features(data, technical_indicators, n_lags, 
                ref_variable_lags, outlier_cutoff, 
                return_lags, binary_lags, variable_returns, 
                rolling_features_params):
    """
    Build the complete feature set.

    Steps, in order: minute/hour time features, technical indicators, lag
    features, return features, rolling features, momentum features.

    Args:
        data: Frame with a datetime ``timestamp`` column and ``Asset_ID``.
        technical_indicators: Settings for ``get_technical_indicators``.
        n_lags: Lags for ``lag_features``.
        ref_variable_lags: Column that gets lagged.
        outlier_cutoff: Clipping quantile for the return features.
        return_lags: Lags for the return and momentum features.
        binary_lags: Whether returns are stored as 0/1 flags.
        variable_returns: Column the returns are computed on.
        rolling_features_params: Mapping of column name to a dict with the
            keys 'windows', 'alpha' and 'indicator'.

    Returns:
        The frame with all feature columns added.
    """
    # Time features
    stamp_parts = data['timestamp'].dt
    data['minute'] = stamp_parts.minute
    data['hour'] = stamp_parts.hour

    # Technical indicators
    data = get_technical_indicators(data, technical_indicators)

    # Lag features (the list of created column names is not needed here)
    data, _ = lag_features(data, n_lags, ref_variable_lags)

    # Return features
    data = calculate_returns(data, variable_returns, return_lags,
                             binary_lags, outlier_cutoff)

    # Rolling features, one configuration per target column
    for target_variable, settings in rolling_features_params.items():
        data = calculate_rolling_features(df=data,
                                          target_variable=target_variable,
                                          windows=settings['windows'],
                                          alpha=settings['alpha'],
                                          indicator=settings['indicator'])

    # Momentum features
    return momemtum_features(data, variable_returns, return_lags)

In [None]:
# Technical indicators: RSI on Close with 15 periods
technical_indicators = {
    'rsi': {'periods': 15, 'variable': 'Close'},
}

# Lag features
ref_variable_lags = 'Close'
n_lags = [5, 15, 30]

# Return features
variable_returns = 'Close'
return_lags = [1, 15, 30, 60]
outlier_cutoff = 0.01
binary_lags = False

# Rolling features, configured per column
rolling_features_params = {
    'Close': {
        'windows': [15],
        'alpha': 0.8,
        'indicator': ['wma_mean', 'wma_std', 'bbands'],
    },
    'Volume': {
        'windows': None,
        'alpha': 0.8,
        'indicator': ['wma_mean', 'wma_std'],
    },
    'VWAP': {
        'windows': None,
        'alpha': 0.8,
        'indicator': ['wma_mean', 'wma_std'],
    },
}

In [None]:
# Build all features on the filtered data with the parameters defined above
data = get_features(
    data,
    technical_indicators=technical_indicators,
    n_lags=n_lags,
    ref_variable_lags=ref_variable_lags,
    outlier_cutoff=outlier_cutoff,
    return_lags=return_lags,
    binary_lags=binary_lags,
    variable_returns=variable_returns,
    rolling_features_params=rolling_features_params,
)

In [None]:
# Columns available after feature engineering
data.columns

# 3. Model Training 

In [None]:
# Column the models predict
forecast_variable = 'Target'

# Model inputs, grouped by the step that produces them
train_columns = [
    # raw candle columns
    'Count', 'Open', 'High', 'Low',
    # time features
    'minute', 'hour',
    # technical indicator
    'rsi_15',
    # lag features
    'Close_lag_5', 'Close_lag_15', 'Close_lag_30',
    # return features
    'Close_return_1m', 'Close_return_15m',
    'Close_return_30m', 'Close_return_60m',
    # rolling features on Close (incl. Bollinger Bands on the typical price)
    'Close_wma_mean', 'Close_wma_std',
    'Typical_Price', 'Typical_Price_sma_15', 'Typical_Price_std_15',
    'bblow_15', 'bbhigh_15',
    # rolling features on Volume and VWAP
    'Volume_wma_mean', 'Volume_wma_std',
    'VWAP_wma_mean', 'VWAP_wma_std',
    # momentum features
    'momentum_1_15', 'momentum_1_30', 'momentum_15_30',
    'momentum_1_60', 'momentum_15_60',
]


In [None]:
# Distinct asset ids present in the feature frame
assets = data['Asset_ID'].unique()

In [None]:
# First timestamp of the test window; training data ends right before it
test_date = datetime.datetime(2021, 6, 13)

# Historical days for model training
training_days = 5
# Use the value defined in this cell rather than a variable set in an
# earlier cell, so the training window is controlled from one place.
# datetime.timedelta is sufficient for fixed day/hour offsets.
train_range = [test_date - datetime.timedelta(days = training_days), test_date]

# Hours of data used for model testing, starting at test_date
hours_test_range = 1
test_range = [test_date, test_date + datetime.timedelta(hours = hours_test_range)]
 

In [None]:
# Load pre-trained models (one pickle per asset) unless training is enabled
train_models = False
models = {}
if not train_models:

    input_folder_models = '/kaggle/input/cryptomodels'

    for asset_id in assets:

        model_loc = os.path.join(input_folder_models, f'model_asset_{asset_id}.pkl' )
        # NOTE: unpickling executes code stored in the file; only load
        # model files from a trusted source.
        # The context manager closes the file handle after loading.
        with open(model_loc, 'rb') as model_file:
            models[asset_id] = pickle.load(model_file)

In [None]:
# Collect actual and predicted targets for every asset
test = []
pred = []
start = time.time()

for asset_id in assets:

    # Rows of this asset only. The boolean filter already produces a new
    # frame, so the whole data set does not need to be copied first.
    df_train = data[data['Asset_ID'] == asset_id].copy(deep = True)
    df_train.set_index(['timestamp', 'Asset_ID'], inplace = True)
    df_train.dropna(inplace=True)

    # Split into train/test windows and select feature / target columns
    X_train, X_test, Y_train, Y_test  = feature_selection(df_train, 
                                        train_range, 
                                        test_range, 
                                        train_columns, 
                                        forecast_variable, 
                                        )
    
    model = None
    
    if train_models:
        
        print('Training model for asset', asset_id)
        
        # NOTE(review): model_params is not defined in the visible part of
        # this notebook; it must exist before enabling train_models.
        learning_rate = model_params[asset_id]['learning_rate']
        max_depth = model_params[asset_id]['max_depth']
        n_estimators = model_params[asset_id]['n_estimators']

        xgb_model = xgb.XGBRegressor(learning_rate = learning_rate, 
                                max_depth = max_depth, 
                                n_estimators = n_estimators)

        # Fit the regressor with the fixed hyper-parameters above
        model = xgb_model.fit(X_train, Y_train)

        models[asset_id] = model
    else:
        # Use the previously loaded model for this asset
        model = models[asset_id]

    # Predict the test window
    Y_pred = model.predict(X_test)

    # Give the predictions the same index as the actual values
    Y_pred = pd.DataFrame({'Y_pred': Y_pred,} ,index = Y_test.index)

    # Keep actual and predicted values for the evaluation
    test.append(Y_test)
    pred.append(Y_pred)
    
# Elapsed time in minutes
runtime = np.round((time.time() - start)/60, 3)

print("Runtime is:",  runtime)

In [None]:
# Check Feature Importance for Bitcoin Model
asset_id = 1
# Loaded models expose the fitted regressor as `best_estimator_`; models
# trained in this notebook are plain regressors without that attribute,
# so fall back to the model object itself.
Importance = getattr(models[asset_id], 'best_estimator_',
                     models[asset_id]).feature_importances_
df_importance = pd.DataFrame({'Variable': train_columns, 'Importance':Importance})
df_importance.sort_values(by = ['Importance'], ascending = False, inplace = True)
# Ten most important features
df_importance.head(10)

In [None]:
# Stack actual values and predictions of all assets and line them up by index
df_test = pd.concat(test)
df_pred = pd.concat(pred)
df_master = df_test.join(df_pred).reset_index()

# Optionally restrict the evaluation to the first three minutes of 2021-06-13
check_validation_test_data = True
if check_validation_test_data:
    df_master = df_master[
        df_master['timestamp'].isin(
            [datetime.datetime(2021, 6, 13, 0, minute, 0) for minute in range(3)]
        )
    ].copy(deep=True)

# Pearson correlation between target and prediction, per cryptocurrency
df_performance = (
    df_master
    .groupby(['Asset_ID'])
    .apply(lambda x: np.corrcoef(x['Target'].values, x['Y_pred'].values)[0, 1])
    .to_frame("Pearson")
)

In [None]:
# Per-asset Pearson correlation
df_performance

In [None]:
# Attach the asset weights and compute the weight-averaged correlation
df_performance = df_performance.join(df_weights)
weighted_performance = (
    df_performance['Pearson'].mul(df_performance['Weight']).sum()
    / df_performance['Weight'].sum()
)

print(" Weighted performance is :", np.round(weighted_performance, 4))

# 4. Model Testing

In [None]:
# Append new candle (minute) data to historical
def append_new_data(_df_hist, test_df, max_hist_hold, minute_test):
    """
    Append new minute data to historical data.

    Args:
        _df_hist: Historical frame with ``timestamp`` and ``Asset_ID``.
        test_df: New rows for ``minute_test``; must contain ``row_id``.
        max_hist_hold: Oldest timestamp to keep in the result.
        minute_test: Timestamp of the new data; historical rows at or after
            it are discarded before appending.

    Returns:
        pandas.DataFrame: New frame sorted by (timestamp, Asset_ID), limited
        to timestamps >= ``max_hist_hold``, without the ``row_id`` column.
        The inputs are not modified.
    """
    # Make sure to filter out future data
    past_rows = _df_hist[_df_hist['timestamp'] < minute_test]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([past_rows, test_df])

    combined = combined.sort_values(by = ['timestamp', 'Asset_ID'])

    # Keep only the most recent history
    combined = combined[combined['timestamp'] >= max_hist_hold].copy(deep = True)

    # row_id only exists on the new rows and is not a feature
    combined.drop(columns = ['row_id'], inplace = True)

    return combined

In [None]:
# Preview the first rows of the feature frame
data.head()

In [None]:
import gresearch_crypto

env = gresearch_crypto.make_env()   # initialize the environment

iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

# df holding historical crypto data
_df_hist = df_hist.drop(columns = ['Target']).copy(deep = True)

# max. historical minutes to hold for _df_hist
historical_minutes = 70

for (test_df, sample_prediction_df) in iter_test:
    
    # Asset_ID -> row_id of this batch, used to place each prediction
    asset_id_row_map = dict(zip(test_df['Asset_ID'].values, test_df['row_id'].values))

    # Transform new candle (minute) data to expected format.
    # The column is replaced outright: an in-place ``.loc[:, col] = ...``
    # write would try to keep the original integer dtype.
    test_df['timestamp'] = test_df['timestamp'].apply(todatetime)

    # Get Test Minute 
    minute_test = pd.to_datetime(test_df['timestamp'].values[0])

    # Set max of historical data to hold
    # to generate rolling features
    # (only needed by the disabled online pipeline below)
    max_hist_hold = minute_test - dateutil.relativedelta.relativedelta(minutes = historical_minutes)

    # Disabled online pipeline: append the new minute to the history and
    # recompute the features for it.
    #_df_hist = append_new_data(_df_hist, test_df, max_hist_hold, minute_test)

    # Generate Features
    #X_feat = get_features(_df_hist.copy(deep=True), technical_indicators, n_lags, 
    #                    ref_variable_lags, outlier_cutoff, 
    #                    return_lags, binary_lags, variable_returns, 
    #                    rolling_features_params)

    # Filter for features generated only for test minute
    #X_feat = (X_feat[X_feat['timestamp'] == minute_test]
    #        .filter(['timestamp', 'Asset_ID'] + train_columns )
    #        .copy(deep = True))
    
    # Test with already generated features
    X_feat = (data[data['timestamp'] == minute_test]
            .filter(['timestamp', 'Asset_ID'] + train_columns )
            .copy(deep = True))
    
    for asset_id in assets:

        asset_feat = X_feat[X_feat['Asset_ID'] == asset_id].copy(deep = True)
        asset_feat.set_index(['timestamp', 'Asset_ID'], inplace = True)
        
        # Use last fitted model on training data < 2021.06.13
        try:
            asset_model = models[asset_id]
            pred = asset_model.predict(asset_feat)[0]
            #print("Predicting for minute", minute_test, "predicting for asset", asset_id, ":", pred)
        except Exception as exc:
            # Best effort: fall back to 0 so the submission loop keeps
            # running, but report which asset failed and why. Catching
            # Exception (not a bare except) lets KeyboardInterrupt through.
            print("Model could not predict for asset", asset_id, "-", repr(exc))
            pred = 0
            
        # Write the prediction into the row that belongs to this asset;
        # assets missing from the batch match no row and change nothing.
        sample_prediction_df.loc[:,'Target'] = np.where( sample_prediction_df['row_id'] == asset_id_row_map.get(asset_id), 
                                                  pred, 
                                                  sample_prediction_df['Target'])

    # add prediction
    env.predict(sample_prediction_df) 

