In [143]:
import os
import gc
import traceback
import numpy as np
import pandas as pd
import seaborn as sns
# import gresearch_crypto
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from tensorflow.keras import layers
# import tensorflow_probability as tfp # -> Error : annot import name 'naming' from 'tensorflow.python.autograph.core' 
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
pd.set_option('display.max_columns', None)

# relative imports
# from get_feats import get_features
from helperfuncs import get_time_fractions

#basics 
from datetime import datetime

In [147]:
def get_features(DATA_ROOT, nrows):
    df = pd.read_csv(DATA_ROOT+"train.csv", nrows=nrows)

    assets = pd.read_csv(DATA_ROOT+"asset_details.csv")
    df_grouped = df.groupby("Asset_ID")

    # we will work with other 12 assets
    btc = df_grouped.get_group(1)
    eth = df_grouped.get_group(6)

    btc.set_index("timestamp", inplace=True)
    btc = btc.drop(columns=["Target", "Asset_ID"])
    btc = btc.add_suffix("_btc")
    eth.set_index("timestamp", inplace=True)
    eth = eth.drop(columns=["Target", "Asset_ID"])
    eth = eth.add_suffix("_eth")

    # working(1/3:15)
    # btc_eth = pd.concat([btc, eth], join="outer", axis=1)
    # btc_eth_index = btc_eth.index.unique()


    
    #filling nans for now(reindex and drop later)
    # getting rid of filling nans
    # btc_eth.fillna(method="ffill", inplace=True)

    df_features = btc_eth.copy()
    suffixes = ["_btc", "_eth"]
    for suffix in suffixes:
        df_features["logprice"+suffix] = np.log(df_features["Close"+suffix]) 
        df_features["Volatility"+suffix] = np.log(df_features["High"+suffix])\
            - np.log(df_features["Close"+suffix])
        df_features = df_features.drop(columns=["Close"+suffix, "High"+suffix,\
            "Low"+suffix, "Open"+suffix, "VWAP"+suffix])

    datetimes = pd.Series(df_features.index).astype("datetime64[s]")
    df_features["frac_day"], df_features["frac_week"], df_features['frac_of_month'], \
        df_features['frac_of_year'] = zip(*datetimes.map(get_time_fractions))


    # calculate 2-asset targets(not 14 asset target)
    df_logprices = df_features[["logprice_btc", "logprice_eth"]]
    # 𝑅𝑎(𝑡)=𝑙𝑜𝑔(𝑃𝑎(𝑡+16) / 𝑃𝑎(𝑡+1))=𝑙𝑜𝑔(𝑃𝑎(𝑡+16)−𝑙𝑜𝑔(𝑃𝑎(𝑡+1)
    df_returns = df_logprices.shift(-16) - df_logprices.shift(-1)
    for suffix in suffixes:
        df_returns.rename(columns={"logprice"+suffix : "R"+suffix}, inplace=True)

    # find a better way to write next line
    assets =  assets[(assets["Asset_ID"] == 1) | (assets["Asset_ID"] == 6)]
    assets = assets.sort_values(by=["Asset_ID"])
    weights = assets["Weight"].to_numpy()
    weights = weights.reshape(len(weights), 1)


    R = df_returns.to_numpy()# to array
    weights_sum = np.sum(weights)
    M = np.dot(R, weights) / weights_sum # weighted average => log_btc*w_btc + log_eth*w_eth
    df_M = pd.DataFrame(data=M, index=df_returns.index, columns=["M"])
    R.shape,weights.shape, M.shape


    df_R_M = df_returns.copy()
    for col in df_R_M.columns:
        df_R_M[col] = df_R_M[col] * df_M["M"] # calculated R・M here
    for suffix in suffixes:
        df_R_M.rename(columns={"R"+suffix:"R_M"+suffix}, inplace=True)
    df_R_M_rolling = df_R_M.rolling(window=3750).mean()


    # creating M^2 
    df_M2 = df_M ** 2
    df_M2.rename(columns={"M" : "M2"}, inplace = True)
    df_M2_rolling = df_M2.rolling(window=3750).mean()
    df_betas = df_R_M_rolling.copy()    
    for col in df_betas.columns: # columns = [R_M_btc	R_M_eth]   
        df_betas[col] = df_betas[col] / df_M2_rolling["M2"] # caculating <R・M>/<M^2> here
    for suffix in suffixes: # beta = <R・M>/<M^2> 
        df_betas.rename(columns={"R_M"+suffix : "beta"+suffix}, inplace = True)
    df_targets = df_returns.copy()
    for suffix in suffixes:
        df_targets["R"+suffix] -= df_betas["beta"+suffix] * df_M["M"] # R^a - β^a
        df_targets.rename(columns={"R"+suffix: "Target"+suffix}, inplace=True)

    df_features_targets = pd.concat([df_features, df_betas, df_targets], axis=1)
    df_features_targets = df_features_targets.iloc[3750:-16] # drop nan rows

    df_features_targets["datetime"] = df_features_targets.index
    df_features_targets["datetime"] = df_features_targets["datetime"].apply(datetime.fromtimestamp)

    return df_features_targets

# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        
        if col_type not in ['object', 'category', 'datetime64[ns, UTC]', 'datetime64[ns]']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def reindex(df, index):
    df = df.reindex(range(index[0], index[-1]+60), method="nearest")
    df = df.fillna(method="ffill").fillna(method="bfill")
    return df

In [144]:
DATA_ROOT = "../data/"



DEVICE = "CPU"
SEED = 42
EPOCHS = 10
DEBUG = True
N_ASSETS = 14
WINDOW_SIZE = 15
BATCH_SIZE = 1024
PCT_VALIDATION = 10

In [148]:
# load data

week_secs = 60*60*24*7
month_secs = 60*60*24*31
day_secs = 60*60*24
# dataset is formed by 14 assets
if DEBUG:
    df = get_features(DATA_ROOT, nrows=week_secs)
else:
    df = get_features(DATA_ROOT, nrows=None)

df = reduce_mem_usage(df)

# getting rid of "beta_eth", "Target_eth" 
df = df[['Count_btc', 'Volume_btc', 'Count_eth', 'Volume_eth', 'logprice_btc',
       'Volatility_btc', 'logprice_eth', 'Volatility_eth', 'frac_day',
       'frac_week', 'frac_of_month', 'frac_of_year', 'beta_btc',
       'Target_btc']]

# adding datetime column
if DEBUG:
    df_vis = df.copy()
    df_vis["datetime"] = df_vis.index
    df_vis.datetime = df_vis.datetime.apply(datetime.fromtimestamp)

Memory usage of dataframe is 9.82 MB
Memory usage after optimization is: 3.27 MB
Decreased by 66.7%


In [138]:
# about training
LOAD_STRICT = True

INC2021 = 0
INC2020 = 0
INC2019 = 0
INC2018 = 0
INC2017 = 0
INCCOMP = 1
INCSUPP = 0

train = df
sample_prediction = pd.read_csv(DATA_ROOT+"example_sample_submission.csv")

# the notebooks uses upper/lower_shadow, open_sub_close, seasonality

In [151]:
train = train.sort_index()
index = train.index.unique()
print(train.shape)
train = train.apply(reindex).reset_index(0, drop=True).sort_index()
gc.collect()
print(train.shape)

(1952639, 14)


TypeError: reindex() missing 1 required positional argument: 'index'