In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt # Load data faster
import seaborn as sns 
import matplotlib.pyplot as plt

Loading the data:

In [None]:
# Asset metadata, indexed by Asset_ID; the sorted names drive the load order below.
asset_details = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv', index_col='Asset_ID')
names = asset_details.sort_index().Asset_Name.values


def _load_asset_frame(name):
    """Read one preprocessed asset file and attach the helper columns used by the feature functions."""
    frame = dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-preprocessing-2/{name.lower().replace(' ', '_')}.jay").to_pandas()

    # Row-to-row change of the close price (first row is NaN).
    frame['diffs'] = frame['Close'].diff()
    # Direction (+1 / 0 / -1) of the row-to-row VWAP change.
    frame['signs_diffs'] = np.sign(frame['VWAP'].diff())
    return frame


# One frame per asset, in ascending Asset_ID order.
asset_dfs = [_load_asset_frame(name) for name in names]

market = dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-preprocessing-2/market.jay").to_pandas()
asset_dfs[1].head(10) # Bitcoin

In [None]:
def _float_fits(column, datatype):
    """Return True when every value of ``column`` can be stored in ``datatype``.

    The check is deliberately conservative: the smallest non-zero magnitude must
    exceed the type's approximate resolution and the largest magnitude must not
    exceed the type's maximum. Zeros, NaNs and infinities are ignored because
    they are representable in every float type.
    """
    info = np.finfo(datatype)
    magnitudes = column.abs()
    magnitudes = magnitudes[(magnitudes > 0) & np.isfinite(magnitudes)]
    if magnitudes.empty:
        return True
    return bool(magnitudes.min() > info.resolution and magnitudes.max() <= info.max)


def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Signed integer columns (int16/int32/int64) are converted to the smallest
    signed integer type whose range contains the column's min and max.
    float64 columns are converted to float32 when ``_float_fits`` accepts them;
    the test is made on magnitudes, so columns with zero or negative values are
    eligible too. All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    verbose : bool, default False
        Print one line per converted column.

    Returns
    -------
    None
    """
    # Candidate target types per source dtype, narrowest first.
    narrower_ints = {
        'int64': [np.int8, np.int16, np.int32],
        'int32': [np.int8, np.int16],
        'int16': [np.int8],
    }

    for col in df.columns:
        column = df[col]
        col_dtype = column.dtype
        target = None

        int_candidates = next(
            (types for dtype_name, types in narrower_ints.items() if col_dtype == dtype_name),
            None,
        )
        if int_candidates is not None:
            # min/max do not change between candidates, so compute them once.
            lowest, highest = column.min(), column.max()
            for datatype in int_candidates:
                limits = np.iinfo(datatype)
                if lowest >= limits.min and highest <= limits.max:
                    target = datatype
                    break
        elif col_dtype == 'float64':
            if _float_fits(column, np.float32):
                target = np.float32

        if target is not None:
            df[col] = column.astype(target)
            if verbose:
                print(f'Convert {col} to {target}.')

Some of the more complex features go into separate functions:

In [None]:
def ichimoku(df, period_length=60):
    """Compute the five Ichimoku lines from the High/Low/Close columns of ``df``.

    ``period_length`` is the number of rows that make up one Ichimoku period,
    so every window and shift below is expressed in rows as ``n * period_length``.

    Returns the tuple
    ``(tenkan_sen, kijun_sen, senkou_span_a, senkou_span_b, chikou_span)``.
    """
    def _midpoint(n_periods):
        # Mean of the rolling highest High and rolling lowest Low over n periods.
        rows = n_periods * period_length
        highest = df['High'].rolling(window=rows).max()
        lowest = df['Low'].rolling(window=rows).min()
        return (highest + lowest) / 2

    # Both leading spans are pushed forward by 26 periods.
    displacement = 26 * period_length

    tenkan_sen = _midpoint(9)    # conversion line
    kijun_sen = _midpoint(26)    # base line
    senkou_span_a = ((tenkan_sen + kijun_sen) / 2).shift(displacement)
    senkou_span_b = _midpoint(52).shift(displacement)

    # Lagging span: the negative shift places the Close from 22 periods LATER
    # on the current row (some definitions use 26 instead of 22).
    # NOTE(review): this column contains future prices - confirm before using it as a model input.
    chikou_span = df['Close'].shift(-22 * period_length)

    return tenkan_sen, kijun_sen, senkou_span_a, senkou_span_b, chikou_span

In [None]:
def Stochastic(df, periods=14):
    """Return the fast (%K) and slow (%D) stochastic oscillator of ``df['Close']``.

    %K is the position of the close inside its rolling ``periods``-row
    min/max range, scaled to 0-100. Rows where %K is undefined (warm-up rows,
    or a flat window where max == min) are set to 100. %D is the 3-row rolling
    mean of %K.

    Note that the range is taken from the Close column only, not from High/Low.
    """
    close = df['Close']
    window = close.rolling(window=periods)
    highest_close = window.max()
    lowest_close = window.min()

    # Fast line, with undefined entries replaced by 100.
    raw_k = ((close - lowest_close) / (highest_close - lowest_close)) * 100
    stochastic_k = raw_k.fillna(100)

    # Slow line: smoothed fast line.
    stochastic_d = stochastic_k.rolling(window=3).mean()

    return stochastic_k, stochastic_d

In [None]:
def MoneyFlow(df, periods=14):
    """Return a money flow index (0-100) over a rolling window of ``periods`` rows.

    VWAP is used as the price; the flow of each row is
    ``signs_diffs * VWAP * Volume``, so its sign follows ``signs_diffs``.
    Rows where the positive/negative ratio is undefined get a ratio of 1,
    which maps to an index of 50.
    """
    price = df['VWAP'].to_numpy()

    # Signed raw money flow per row, split into its positive and negative parts
    # (the negative part is stored as a non-negative magnitude).
    raw_flow = df['signs_diffs'] * price * df['Volume']
    inflow = raw_flow.clip(lower=0)
    outflow = raw_flow.clip(upper=0).mul(-1)

    # Ratio of summed inflow to summed outflow over the window.
    ratio = inflow.rolling(periods).sum() / outflow.rolling(periods).sum()
    ratio = ratio.fillna(1)

    return 100 - (100 / (1 + ratio))

In [None]:
def rsi(df, periods=14):
    """Return ``(gain_mean, loss_mean, rsi)`` computed from ``df['diffs']``.

    ``periods`` is the number of rows used for the smoothing. Gains are the
    positive part of ``diffs`` and losses the magnitude of the negative part.
    Each is smoothed with a running mean that is seeded with the plain mean of
    the first evaluated window and then updated as
    ``(previous * (periods - 1) + newest) / periods``. Rows where the RSI is
    undefined are filled with 100.
    """
    diffs = df['diffs']

    def _smoothed_mean(series):
        # Each series gets its own running state. The recurrence depends on
        # rolling.apply evaluating the windows one after another in row order.
        state = {'mean': None}

        def _step(window):
            previous = state['mean']
            if previous is None:
                current = window.mean()
            else:
                current = (previous * (len(window) - 1) + window[-1]) / periods
            state['mean'] = current
            return current

        return series.rolling(periods).apply(_step, raw=True)

    gain_mean = _smoothed_mean(diffs.clip(lower=0))
    loss_mean = _smoothed_mean(-1 * diffs.clip(upper=0))

    relative_strength = gain_mean / loss_mean
    index_values = 100 - 100 / (1 + relative_strength)
    return gain_mean, loss_mean, index_values.fillna(100)
    
    

In [None]:
def obv_change(df):
    """Return the per-row signed volume: ``signs_diffs`` multiplied by ``Volume``."""
    direction = df['signs_diffs']
    return direction.mul(df['Volume'])

In [None]:
def ad_line(df):
    """Return the per-row money flow volume (``Volume`` times the money flow multiplier).

    Side effect: the multiplier is written to ``df['MFM']``. Rows where
    High equals Low divide by zero and therefore yield a non-finite multiplier.
    """
    close, high, low = df['Close'], df['High'], df['Low']
    # Position of the close inside the High-Low range, from -1 (at Low) to +1 (at High).
    df['MFM'] = ((close - low) - (high - close)) / (high - low)
    return df['Volume'] * df['MFM']

In [None]:
def beta(df, market_df, periods):
    """Rolling beta of an asset against the market, as calculated in the tutorial for the target.

    The asset return is the log ratio of ``Close`` to its value 15 rows earlier;
    the market return is read from ``market_df['log_ret']``. Beta is the rolling
    mean (over ``periods`` rows) of their product divided by the rolling mean
    of the squared market return.
    """
    market_ret = market_df['log_ret']
    close = df['Close']
    asset_ret = np.log(close / close.shift(15))

    mean_cross = (market_ret * asset_ret).rolling(periods).mean()
    mean_market_sq = (market_ret ** 2).rolling(periods).mean()
    return mean_cross / mean_market_sq
    

Now we define a function to engineer the features for each dataframe:

In [None]:
def engineer_features(df, warmup_rows=3750 + 15):
    """Build the model features for one asset frame.

    The input frame is read but never modified: features are written to a new
    frame that starts as ``df`` without its raw price/volume columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Asset frame containing the columns 'Count', 'High', 'Low', 'Open',
        'Close', 'Volume', 'VWAP', 'diffs' and 'signs_diffs'. Any other
        columns are carried over to the result unchanged.
    warmup_rows : int, default 3750 + 15
        Number of leading rows removed from the result. Rolling features are
        NaN until their window is filled; the default matches the longest lag
        period plus the 15-row return horizon.

    Returns
    -------
    pandas.DataFrame
        The carried-over columns followed by the engineered features, without
        the first ``warmup_rows`` rows.
    """
    raw_columns = ['Count', 'High', 'Low', 'Open', 'Close', 'Volume', 'VWAP', 'diffs', 'signs_diffs']
    close = df['Close']

    # drop() returns a new frame, so the caller's frame stays untouched.
    # 'MFM' is a helper column that ad_line() may have written earlier; it is
    # removed when present but is not required.
    features = df.drop(columns=raw_columns).drop(columns=['MFM'], errors='ignore')

    # Trend: exponential moving averages and MACD crossover relative to its signal line.
    ema_24 = close.ewm(span=24).mean()
    ema_52 = close.ewm(span=52).mean()
    macd_signal = (ema_24 - ema_52).ewm(span=18).mean()
    features['ema_24'] = ema_24
    features['ema_52'] = ema_52
    features['MACD_signal'] = macd_signal
    features['MACD_crossover_norm'] = (ema_24 - ema_52 - macd_signal) / macd_signal

    # Momentum oscillators.
    features['gain_mean'], features['loss_mean'], features['RSI'] = rsi(df, periods=28)
    stochastic_k, stochastic_d = Stochastic(df)
    features['stochastic_k'] = stochastic_k
    features['stochastic_d'] = stochastic_d
    features['stochastic_crossover'] = stochastic_k - stochastic_d

    # Log returns over 1 row and cumulated over longer horizons.
    log_ret1 = np.log(close / close.shift(1))
    features['log_ret1'] = log_ret1
    features['log_ret30'] = log_ret1.rolling(30).sum()
    features['log_ret240'] = log_ret1.rolling(240).sum() # 4 hours
    features['log_ret1440'] = log_ret1.rolling(1440).sum() # 1 day

    features['mfi'] = MoneyFlow(df, periods=28)
#     df['MFI_divergence'] = df['log_ret30'] - np.log(df['mfi'] / df['mfi'].shift(1)).rolling(30).sum() # Cum. Log returns compared to cum. log MFI change over 30 mins

    #df['beta_3750'] = beta(df, market, 3750)
#     df['sin_240'] = np.sin(np.linspace(0, 2*np.pi*(len(df)/240), len(df))) # 4 hours
#     df['sin_1440'] = np.sin(np.linspace(0, 2*np.pi*(len(df)/240), len(df))) # A day

    # Remove the first n rows where n is the maximum lag period, otherwise we have nans for those.
    return features.iloc[warmup_rows:]

Let's test this on bitcoin data:

In [None]:
# Work on a copy so the loaded Bitcoin frame (position 1 in asset_dfs) is not altered by this trial run.
bitcoin = engineer_features(asset_dfs[1].copy())

In [None]:
# Rows that still contain at least one missing value. The mask is reduced per
# row so that each affected row is listed exactly once.
bitcoin[bitcoin.isnull().any(axis=1)]

In [None]:
# Preview the first 50 rows of the engineered Bitcoin features.
bitcoin.head(50)

**One remark: here we loop over all observations at once, whereas at test time we will receive one observation after the other. In that case we do not want to loop over the whole dataset every time, but only compare the newly received observation with the previous one.**

In [None]:
# def obv(df):
#     df['Obv'] = df['Volume'][0]
#     for i in range(len(df)):
#         if i > 0:
#             j = i - 1
#             if df['Close'][i] > df['Close'][j]:
#                 df['Obv'][i] = df['Obv'][j] + df['Volume'][i]
#             elif df['Close'][i] < df['Close'][j]:
#                 df['Obv'][i] = df['Obv'][j] - df['Volume'][i]
#             else:
#                 df['Obv'][i] = df['Obv'][j]
        
#     return df
            
#obv_test = obv(asset_dfs[1])

In [None]:
# sns.set_theme()
# plt.plot(bitcoin['24h OBV change MA'], label='24h obv change')
# plt.legend()
# plt.show()

In [None]:
# plt.plot(bitcoin['1h sine'][:5*60], label='1h sine')
# plt.legend(loc='lower right')
# plt.show()

In [None]:
# plt.plot(bitcoin['MACD'], label='MACD')
# plt.legend(loc='lower right')
# plt.show()

In [None]:
# plt.plot(bitcoin['beta_3750'], label='3750m beta')
# plt.legend(loc='lower right')
# plt.show()

In [None]:
# def ad_line(df):
#     df['MFM'] = ((df['Close'] - df['Low']) - (df['High'] - df['Close'])) / (df['High'] - df['Low'])
#     df['MFV'] = df['Volume'] * df['MFM']
#     df['AD_line'] = df['MFV'][0]
#     for i in range(len(df)):
#         if i > 0:
#             df['AD_line'][i] = (df['AD_line'][i-1] + df['MFV'][i])
    
#     return df
# #ad_line_test = ad_line(asset_dfs[1])

In [None]:
# sns.set_theme()
# #plt.plot(ad_line_test['AD_line'], label='AD_line')
# plt.legend()
# plt.show()

In [None]:
# sns.set_theme()
# plt.plot(bitcoin['HLCO ratio'], label='HLCO ratio')
# plt.legend()
# plt.show()

In [None]:
# plt.figure(figsize=(20, 10))
# for period in periods:
#     plt.plot(bitcoin[f'{period}h log returns'], label=f'{period}h log returns')
# plt.legend(loc='lower right')
# plt.show()

In [None]:
# plt.figure(figsize=(20, 10))
# plt.plot(bitcoin['MACD_crossover_norm'], label='MACD crossover')
# plt.legend(loc='lower right')
# plt.show()

In [None]:
# Engineer, shrink and save the features of every asset as '<asset_name>.jay'.
# The position in asset_dfs is used as the Asset_ID for the name lookup.
for asset_id, raw_frame in enumerate(asset_dfs):
    features = engineer_features(raw_frame).reset_index()
    asset_name = asset_details.loc[asset_id, 'Asset_Name']
    name = asset_name.lower().replace(' ', '_')
    reduce_mem_usage(features, verbose=True)
    dt.Frame(features).to_jay(f'{name}.jay')