In [1]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

log_pr = pd.read_pickle("../data/log_price.df")
volu = pd.read_pickle("../data/volume_usd.df")

## Feature Engineering

In [2]:
features = log_pr.reset_index().melt(id_vars=['timestamp'])
features.columns = ['timestamp', 'stock', 'log_pr']
features = features.set_index(['timestamp', 'stock']).sort_index()

### backward returns

In [3]:
for i in [3, 5, 10, 20]:
    features['log_pr_{}'.format(i)] = -features.groupby(level='stock').log_pr.diff(i)

In [253]:
log_pr.diff(30).iloc[-1]

0   -0.001984
1    0.001329
2   -0.003673
3   -0.000100
4    0.000413
5   -0.000011
6    0.002487
7   -0.000455
8    0.000299
9   -0.000356
Name: 2021-12-31 23:59:00, dtype: float64

In [256]:
-(log_pr.iloc[-1] - log_pr.iloc[-31])

0    0.001984
1   -0.001329
2    0.003673
3    0.000100
4   -0.000413
5    0.000011
6   -0.002487
7    0.000455
8   -0.000299
9    0.000356
dtype: float64

In [5]:
temp = log_pr.rolling(10).mean()
temp.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-07-01 00:00:00,,,,,,,,,,
2021-07-01 00:01:00,,,,,,,,,,
2021-07-01 00:02:00,,,,,,,,,,
2021-07-01 00:03:00,,,,,,,,,,
2021-07-01 00:04:00,,,,,,,,,,
2021-07-01 00:05:00,,,,,,,,,,
2021-07-01 00:06:00,,,,,,,,,,
2021-07-01 00:07:00,,,,,,,,,,
2021-07-01 00:08:00,,,,,,,,,,
2021-07-01 00:09:00,-0.000918,0.000536,0.001157,-0.000431,-0.003427,0.000505,-0.003746,0.000405,0.002177,-4e-05


### log volume and volume backward diffrence

In [6]:
volu_df = volu.reset_index().melt(id_vars=['timestamp'])
volu_df.columns = ['timestamp', 'stock', 'volu']
volu_df = volu_df.set_index(['timestamp', 'stock']).sort_index()
volu_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,volu
timestamp,stock,Unnamed: 2_level_1
2021-07-01,0,147506.9
2021-07-01,1,580811.6
2021-07-01,2,18600.58
2021-07-01,3,160135.3
2021-07-01,4,276596.0
2021-07-01,5,291628.9
2021-07-01,6,854425.5
2021-07-01,7,180150.9
2021-07-01,8,4151507.0
2021-07-01,9,7568141.0


In [7]:
log_fn = lambda x: np.log(x+1)
features['log_volu'] = volu_df.groupby(level='stock').volu.apply(log_fn)

In [8]:
for i in [1]:
    features['log_volu_{}'.format(i)] = features.groupby(level='stock').log_volu.diff(i)
    # features['volu_{}',format(i)] = volu_df.groupby(level='stock').volu.diff(i)

### Exponential Moving average


In [12]:
for i in [10, 30, 50]:
    ema = lambda x: x.ewm(span=i).mean()
    features['pr_ema_{}'.format(i)] = features.groupby(level='stock').log_pr.apply(ema)

### RSI

In [23]:
def rsi(close_delta, periods=20, ema=True):
    """
    Returns a pd.Series with the relative strength index.
    """
    close_delta = close_delta.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    if ema == True:
	    # Use exponential moving average
        ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
        ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    else:
        # Use simple moving average
        ma_up = up.rolling(window = periods, adjust=False).mean()
        ma_down = down.rolling(window = periods, adjust=False).mean()
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

In [26]:
features['rsi_20'] = features.groupby(level='stock').log_pr.apply(rsi)
features['rsi_30'] = features.groupby(level='stock').log_pr.apply(rsi, periods=30)
features['rsi_50'] = features.groupby(level='stock').log_pr.apply(rsi, periods=50)

### Moving std. and Range of Oscillation

In [27]:
# std_5 = lambda x: x.rolling(5).std()
std_10 = lambda x: x.rolling(10).std()
features['pr_3_std_10'] = features.groupby(level='stock').log_pr_3.apply(std_10)

In [60]:
# def adr(log_pr, window=10, min_periods=10):
#     ma_max = lambda x: x.rolling(window=window, min_periods=min_periods).max()
#     ma_min = lambda x: x.rolling(window=window, min_periods=min_periods).min()

#     ma = lambda x: x.rolling(window=window, min_periods=min_periods).mean()
    
#     high = log_pr.apply(ma_max)
#     low = log_pr.apply(ma_min)

#     # average over window
#     ma_high = high.groupby(level='stock').apply(ma)
#     ma_low = low.groupby(level='stock').apply(ma)

#     ratio = ma_high/ma_low

#     return 100 * (ratio - 1)
# features['pr_adr_20'] = features.groupby(level='stock').log_pr.apply(adr)

In [64]:
k_period = 14
d_period = 3
ma_max = lambda x: x.rolling(k_period).max()
ma_min = lambda x: x.rolling(k_period).min()
mak = lambda x: x.rolling(k_period).mean()
mad = lambda x: x.rolling(d_period).mean()

features['pr_min_14'] = features.groupby(level='stock').log_pr.apply(ma_min)
features['pr_max_14'] = features.groupby(level='stock').log_pr.apply(ma_max)
features['pr_ma_14'] = features.groupby(level='stock').log_pr.apply(mak)

features['pr_so_14'] = (features['pr_ma_14'] - features['pr_min_14'])*100 / (features['pr_max_14'] - features['pr_min_14'])
features['pr_so_14d3'] = features.groupby(level='stock').pr_so_14.apply(mad)

In [218]:
log_pr.apply(ma_max).iloc[-3:].values

array([[ 0.20522693, -0.95368836,  1.82801342, -0.01300574, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641],
       [ 0.20522693, -0.95368836,  1.82801342, -0.01305542, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641],
       [ 0.20522693, -0.95346183,  1.82801342, -0.01305542, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641]])

In [220]:
log_pr.rolling(k_period).max().iloc[-d_period:].values

array([[ 0.20522693, -0.95368836,  1.82801342, -0.01300574, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641],
       [ 0.20522693, -0.95368836,  1.82801342, -0.01305542, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641],
       [ 0.20522693, -0.95346183,  1.82801342, -0.01305542, -0.45457584,
        -0.43331493, -0.24657942, -0.44382189,  0.05167313, -0.09703641]])

### z score of volume

In [35]:
# roll_rank_fn = lambda x: x.rolling(240, min_period=20).apply(lambda x: pd.Series(x).rank(pct=True)[0])
zscore_fn = lambda x: (x - x.rolling(window=240, min_periods=20).mean()) / x.rolling(window=240, min_periods=20).std()
features['volu_z_score'] = volu_df.groupby(level='stock').volu.apply(zscore_fn)

In [36]:
volu_df.groupby(level='stock').volu.rolling(200, min_periods=20).max()

stock  timestamp            stock
0      2021-07-01 00:00:00  0                 NaN
       2021-07-01 00:01:00  0                 NaN
       2021-07-01 00:02:00  0                 NaN
       2021-07-01 00:03:00  0                 NaN
       2021-07-01 00:04:00  0                 NaN
                                         ...     
9      2021-12-31 23:55:00  9        6.669354e+07
       2021-12-31 23:56:00  9        6.669354e+07
       2021-12-31 23:57:00  9        6.669354e+07
       2021-12-31 23:58:00  9        6.669354e+07
       2021-12-31 23:59:00  9        6.669354e+07
Name: volu, Length: 2649600, dtype: float64

### Transform back to wide format

In [37]:
feature_df = features.reset_index(level=['stock']).sort_index()

In [41]:
temp = feature_df.pivot(columns ='stock')
temp.columns = temp.columns.get_level_values(0) + '_' +  [str(x) for x in temp.columns.get_level_values(1)]

In [43]:
temp.iloc[30:].isnull().any().all()

False

### Chaikin's Money Flow

In [None]:

def calculate_money_flow_volume_series(df: pd.DataFrame) -> pd.Series:
    """
    Calculates money flow series
    """
    mfv = df['volume'] * (2*df['close'] - df['high'] - df['low']) / \
                                    (df['high'] - df['low'])
    return mfv
features['pr_mf_']


### Organize Code

In [216]:
def remove_outliers(dta):
    # Compute the mean and interquartile range
    mean = dta.mean(0)
    iqr = dta.quantile([0.25, 0.75], axis=0).diff().T.iloc[:, 1]
    
    # Replace entries that are more than 10 times the IQR
    # away from the mean with NaN (denotes a missing entry)
    mask = np.abs(dta) > mean + 10 * iqr
    treated = dta.copy()
    treated[mask] = np.nan

    return treated

In [None]:
def rsi(close_delta, periods=20, ema=True):
    """
    Returns a pd.Series with the relative strength index.
    """
    close_delta = close_delta.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    if ema == True:
	    # Use exponential moving average
        ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
        ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    else:
        # Use simple moving average
        ma_up = up.rolling(window = periods, adjust=False).mean()
        ma_down = down.rolling(window = periods, adjust=False).mean()
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

In [271]:
def wide_format(df):
    df_= df.reset_index(level=['stock']).sort_index()
    df_ = df_.pivot(columns ='stock')
    df_.columns = df_.columns.get_level_values(0) + '_' +  [str(x) for x in df_.columns.get_level_values(1)]

    return df_


def get_feature_train(log_pr, volu, x_begin_idx, x_end_idx, y_begin_idx, 
                        grp_idx=None, rm_outlier=False, print_cor=True):
    """
    Input:
    log_pr (pdSeries): train set
    volu (pdSeries): train set
    x_begin_idx (pdIndex): to truncate the NaNs
    grp_idx (dict): key is group idx, value is list of stock idx

    Returns:
    feature_dict (dict): key is group idx, value is a tuple of feature matrix and response
    """

    log_pr_df = log_pr.reset_index().melt(id_vars=['timestamp'])
    log_pr_df.columns = ['timestamp', 'stock', 'log_pr']
    log_pr_df = log_pr_df.set_index(['timestamp', 'stock']).sort_index()

    volu_df = volu.reset_index().melt(id_vars=['timestamp'])
    volu_df.columns = ['timestamp', 'stock', 'volu']
    volu_df = volu_df.set_index(['timestamp', 'stock']).sort_index()

    features = pd.DataFrame(index=log_pr_df.index)
    # features['trend'] = np.ones(log_pr_df.shape[0])

    # log_pr feature
    for i in [30]:
        features['log_pr_{}'.format(i)] = -log_pr_df.groupby(level='stock').log_pr.diff(i)

    k_period = 40
    d_period = 3
    ma_max = lambda x: x.rolling(k_period).max()
    ma_min = lambda x: x.rolling(k_period).min()
    mad = lambda x: x.rolling(d_period).mean()
    # msd = lambda x: x.rolling(d_period).sum()

    features['pr_min_40'] = log_pr_df.groupby(level='stock').log_pr.apply(ma_min)
    features['pr_max_40'] = log_pr_df.groupby(level='stock').log_pr.apply(ma_max)

    features['pr_so_40'] = (log_pr_df.log_pr - features['pr_min_40'])*100 / (features['pr_max_40'] - features['pr_min_40'])
    features['pr_so_40d3'] = features.groupby(level='stock').pr_so_40.apply(mad)

    # STD of log price
    for i in [10]:
        std = lambda x: x.rolling(i).std()
        features['log_pr_std_{}'.format(i)] = log_pr_df.groupby(level='stock').log_pr.apply(std)

    # RSI
    # features['rsi_20'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi)
    features['rsi_30'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi, periods=30)
    # features['rsi_50'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi, periods=50)

    # volume feature
    log_fn = lambda x: np.log(x+1)
    features['log_volu'] = volu_df.groupby(level='stock').volu.apply(log_fn)

    # stdised volume in 2 hours backward rolling windows
    zscore_fn = lambda x: (x - x.rolling(window=240, min_periods=20).mean()) / x.rolling(window=240, min_periods=20).std()
    features['volu_z_score'] = volu_df.groupby(level='stock').volu.apply(zscore_fn)


    # drop min, max features
    features = features.drop(columns=['pr_min_40', 'pr_max_40', 'pr_so_40'])

    response = log_pr.diff(30)

    if grp_idx is not None:
        feature_dict = {}
        for key, idx_lis in grp_idx.items():
            feature_df_dropped = wide_format(features.loc[pd.IndexSlice[:,idx_lis],:])
            # transform back to wide format
            feature_dict[key] = (feature_df_dropped.iloc[x_begin_idx:x_end_idx], 
                                            response[idx_lis].iloc[y_begin_idx:])
        return feature_dict
    else:
        # transform back to wide format
        feature_df_dropped = wide_format(features).iloc[x_begin_idx:x_end_idx]
        # feature_df_dropped = feature_df[x_begin_idx:x_end_idx]
    
        if print_cor:
            for i in range(10):
                
                feature_train_0 = features.xs(i, level='stock').iloc[x_begin_idx:x_end_idx]
                print(feature_train_0.corrwith(response[i]))
                print(feature_train_0.isnull().sum())

        return feature_df_dropped, response.iloc[y_begin_idx:]

In [264]:
a = np.random.normal(0,1,(2,3))
print(a)
print(a.mean(0))

[[-0.2515334   1.09632419 -1.98167815]
 [ 1.77589998 -0.69245367  3.15644929]]
[0.76218329 0.20193526 0.58738557]


In [265]:
def rsi_test(log_pr, periods=20):
    """
    Returns a pd.Series with the relative strength index.
    """
    close_delta = log_pr.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    # Use exponential moving average
    ma_up = up.ewm(com=periods-1, adjust=True, min_periods=periods).mean().iloc[-1]
    ma_down = down.ewm(com=periods-1, adjust=True, min_periods=periods).mean().iloc[-1]
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

In [280]:
def wide_format_test(df):
    df_= df.reset_index()
    df_ = df_.pivot(columns ='index').apply(lambda s: s.dropna().reset_index(drop=True))
    df_.columns = df_.columns.get_level_values(0) + '_' +  [str(x) for x in df_.columns.get_level_values(1)]

    return df_

def get_feature_test(log_pr, volu, grp_idx=None):
    """
    Input: 
    log_pr (pdSeries): 1 day of log pr 
    volu (pdSeries): 1 day of volume

    Output:
    test data frame
    """
    features = pd.DataFrame(index=log_pr.columns)
    print(log_pr.index[-1])
    features['trend'] = np.ones(log_pr.shape[1])
    # backward return
    # print(-(log_pr.iloc[-1] - log_pr.iloc[-30]).values)
    # for i in [30]:
    features['log_pr_30'] = -(log_pr.iloc[-1] - log_pr.iloc[-31]).values
    
    # Oscilator
    k_period = 40
    d_period = 3
    pr_min_40 = log_pr.rolling(k_period).min().iloc[-d_period:].values
    pr_max_40 = log_pr.rolling(k_period).max().iloc[-d_period:].values
    pr_so_40 = (log_pr.iloc[-d_period:].values - pr_min_40)*100 / (pr_max_40 - pr_min_40)
    features['pr_so_40d3']  = pr_so_40.mean(0)

    # backward rolling std
    # features['log_pr_std_10'] = log_pr.iloc[-10:].std(0).values
    features['log_pr_std_30'] = log_pr.iloc[-30:].std(0).values
    
    # RSI
    features['rsi_30'] = log_pr.apply(rsi_test, periods=30)

    # volume features
    features['log_volu'] = np.log(volu.iloc[-1].values + 1)
    features['volu_z_score'] = ((volu.iloc[-1] - volu.iloc[-240:].mean())/volu.iloc[-240:].std()).values

    print(volu.iloc[-240:].mean())

    if grp_idx is None:
        return wide_format_test(features)
    else:
        df_dict = {}
        for key, idx_lis in grp_idx.items():
            df_dict[key] = wide_format_test(features.loc[idx_lis])
        return df_dict

In [228]:
       
    # # Chaikin's money flow
    # features['mf_40'] = volu_df.volu * ((2*log_pr_df.log_pr - features['pr_min_40'])
    #                             / (features['pr_max_40'] - features['pr_min_40']))
    # features['mf_40_ma'] = (features.groupby(level='stock').mf_40.apply(msd) / 
    #                         volu_df.groupby(level='stock').volu.apply(msd))
       
#    if rm_outlier:
#         feature_df_dropped = remove_outliers(feature_df)
#         na_idx = feature_df_dropped.isnull().any(1)
#         response_dropped[na_idx] = np.nan

#         return feature_df_dropped.dropna(), response_dropped.dropna()

In [272]:
t_train = log_pr.index[-87841]
log_pr_train = log_pr[:t_train]
volu_train = volu[:t_train]

x_begin_idx = 41
x_end_idx = -30
y_begin_idx = 71

grp_idx = {i:[i] for i in range(10)}
# grp_idx = {0:[1,5,6,8], 1:[2,4,7], 2:[0,3,9]}
feature_dict = get_feature_train(log_pr_train, volu_train, x_begin_idx, x_end_idx, y_begin_idx, grp_idx=None)

trend                 NaN
log_pr_30       -1.000000
pr_so_40d3       0.686567
log_pr_std_10    0.294357
log_pr_std_30    0.302889
rsi_30           0.772883
log_volu         0.137940
volu_z_score     0.157187
dtype: float64
trend            0
log_pr_30        0
pr_so_40d3       0
log_pr_std_10    0
log_pr_std_30    0
rsi_30           0
log_volu         0
volu_z_score     0
dtype: int64
trend                 NaN
log_pr_30       -1.000000
pr_so_40d3       0.576741
log_pr_std_10    0.353351
log_pr_std_30    0.381090
rsi_30           0.680246
log_volu         0.138583
volu_z_score     0.188721
dtype: float64
trend            0
log_pr_30        0
pr_so_40d3       0
log_pr_std_10    0
log_pr_std_30    0
rsi_30           0
log_volu         0
volu_z_score     0
dtype: int64
trend                 NaN
log_pr_30       -1.000000
pr_so_40d3       0.655153
log_pr_std_10    0.308078
log_pr_std_30    0.380294
rsi_30           0.747689
log_volu         0.087897
volu_z_score     0.168670
dtype: float64
t

In [273]:
grp_idx = {i:[i] for i in range(10)}

temp = get_feature_test(log_pr_train.iloc[:1440], volu_train.iloc[:1440], grp_idx)

2021-07-01 23:59:00
0    2.240485e+05
1    3.834046e+05
2    1.886689e+04
3    1.785163e+05
4    4.754693e+04
5    3.309183e+05
6    4.358940e+05
7    1.688329e+05
8    3.870659e+06
9    1.149428e+07
dtype: float64


In [275]:
temp[1]

Unnamed: 0,trend_1,log_pr_30_1,pr_so_40d3_1,log_pr_std_10_1,log_pr_std_30_1,rsi_30_1,log_volu_1,volu_z_score_1
0,1.0,0.003507,25.188736,0.000627,0.0014,42.94084,12.586386,-0.319731
