# OLS

The linear method uses log backward returns (differences of log prices) to predict forward returns.

Training:
1) Ridge regression on the 30 features
2) Principal component regression: PCA on the 30 features, followed by OLS

Features: 10 stocks, each with 3 backward returns (e.g. 3 min, 7 min, 10 min; choose the horizons by inspecting the correlations)

Response: 10 stocks' 30min forward return. 

Groups: [1,4,5,6,8],[0,2,3,7,9]

## Data Preparation

In [1]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

# Load the pickled pandas objects holding the log prices and the USD volumes.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - only use it on trusted files.
log_pr = pd.read_pickle("../data/log_price.df")
volu = pd.read_pickle("../data/volume_usd.df")

## Feature Engineering

In [68]:
def wide_format(df):
    """
    Pivot a long frame indexed by (..., 'stock') into wide format.

    The 'stock' index level is moved into the columns, giving one column per
    (feature, stock) pair named '<feature>_<stock>'; the remaining index
    level(s) stay as the row index, sorted.
    """
    long_df = df.reset_index(level=['stock'])
    long_df = long_df.sort_index()
    wide = long_df.pivot(columns='stock')
    # Flatten the (feature, stock) column MultiIndex into plain string labels.
    wide.columns = [name + '_' + str(stock) for name, stock in wide.columns]
    return wide

def rsi(close_delta, periods=20, ema=True):
    """
    Return the relative strength index (RSI) of a price series.

    Despite its name, `close_delta` is the price series itself: the one-step
    differences are taken inside this function.

    Input:
    close_delta (pd.Series): prices ordered in time
    periods (int): look-back length of the moving averages
    ema (bool): if truthy, average gains/losses with an exponentially
        weighted mean (com = periods - 1); otherwise use a simple rolling
        mean over `periods` observations

    Returns:
    pd.Series with the RSI, 100 - 100 / (1 + avg_gain / avg_loss). Entries are
    NaN until `periods` differences are available, and wherever both averages
    are zero.
    """
    delta = close_delta.diff()

    # Split the moves into gains and (positive) losses.
    up = delta.clip(lower=0)
    down = -1 * delta.clip(upper=0)

    if ema:
        # Exponential moving average of gains and losses.
        ma_up = up.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
        ma_down = down.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
    else:
        # Simple moving average. `adjust` is an ewm() option; rolling() does
        # not accept it, so it must not be passed here.
        ma_up = up.rolling(window=periods).mean()
        ma_down = down.rolling(window=periods).mean()

    rs = ma_up / ma_down
    return 100 - (100 / (1 + rs))

def get_feature_train(log_pr, volu, x_begin_idx, x_end_idx, y_begin_idx, 
                        grp_idx=None, rm_outlier=False, print_cor=True):
    """
    Build the training feature matrix (wide format) and the matching response.

    Input:
    log_pr: log prices of the train set; it is melted with
        id_vars=['timestamp'] and column-indexed by stock id below
        (presumably a wide DataFrame, one column per stock, index named
        'timestamp' - TODO confirm)
    volu: volumes of the train set, same layout as log_pr
    x_begin_idx (int): positional start of the feature rows that are kept
        (used to truncate the leading NaNs of the rolling features)
    x_end_idx (int): positional end of the feature rows that are kept
    y_begin_idx (int): positional start of the response rows that are kept
    grp_idx (dict): key is group idx, value is list of stock idx
    rm_outlier (bool): accepted but not used in this function
    print_cor (bool): only used when grp_idx is None; prints, for stocks
        0..9, the correlation of each feature with the response and the
        per-feature NaN counts

    Returns:
    feature_dict (dict): when grp_idx is given; key is group idx, value is a
        tuple of feature matrix and response.
        When grp_idx is None, a single (feature matrix, response) tuple for
        all stocks is returned instead.
    """

    # Reshape both inputs to long format, indexed by (timestamp, stock).
    log_pr_df = log_pr.reset_index().melt(id_vars=['timestamp'])
    log_pr_df.columns = ['timestamp', 'stock', 'log_pr']
    log_pr_df = log_pr_df.set_index(['timestamp', 'stock']).sort_index()

    volu_df = volu.reset_index().melt(id_vars=['timestamp'])
    volu_df.columns = ['timestamp', 'stock', 'volu']
    volu_df = volu_df.set_index(['timestamp', 'stock']).sort_index()

    features = pd.DataFrame(index=log_pr_df.index)

    # log_pr feature: negated i-step backward difference of the log price, per stock
    for i in [20, 30]:
        features['log_pr_{}'.format(i)] = -log_pr_df.groupby(level='stock').log_pr.diff(i)

    # # EMA
    # ema = lambda x: x.ewm(span=i).mean()
    # for i in [10, 30, 50]:
    #     features['pr_ema_{}'.format(i)] = log_pr_df.groupby(level='stock').log_pr.apply(ema)

    # # MA
    # for i in [10, 30, 50]:
    #     ma = lambda x: x.rolling(i).mean()
    #     features['pr_ma_{}'.format(i)] = log_pr_df.groupby(level='stock').apply(ma)

    # Rolling helpers: 40-step min/max of the price and 3-step mean / sum.
    # `msd` is only referenced by the commented-out money-flow block below.
    k_period = 40
    d_period = 3
    ma_max = lambda x: x.rolling(k_period).max()
    ma_min = lambda x: x.rolling(k_period).min()
    mad = lambda x: x.rolling(d_period).mean()
    msd = lambda x: x.rolling(d_period).sum()

    features['pr_min_40'] = log_pr_df.groupby(level='stock').log_pr.apply(ma_min)
    features['pr_max_40'] = log_pr_df.groupby(level='stock').log_pr.apply(ma_max)

    # Position of the current price inside its 40-step range, scaled to 0..100,
    # and the 3-step rolling mean of that value.
    features['pr_so_40'] = (log_pr_df.log_pr - features['pr_min_40'])*100 / (features['pr_max_40'] - features['pr_min_40'])
    features['pr_so_40d3'] = features.groupby(level='stock').pr_so_40.apply(mad)

    # STD of log price
    for i in [30]:
        # the lambda reads `i` when called; it is applied within the same iteration
        std = lambda x: x.rolling(i).std()
        features['log_pr_std_{}'.format(i)] = log_pr_df.groupby(level='stock').log_pr.apply(std)

    # RSI
    # features['rsi_20'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi)
    features['rsi_30'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi, periods=30)
    # features['rsi_50'] = log_pr_df.groupby(level='stock').log_pr.apply(rsi, periods=50)

    # volume feature
    log_fn = lambda x: np.log(x+1)
    features['log_volu'] = volu_df.groupby(level='stock').volu.apply(log_fn)

    # standardised volume over backward rolling windows
    # NOTE(review): the mean uses a 30-step window but the std a 240-step
    # window - confirm the mismatch is intended.
    zscore_fn = lambda x: (x - x.rolling(window=30, min_periods=20).mean()) / x.rolling(window=240, min_periods=20).std()
    features['volu_z_score'] = volu_df.groupby(level='stock').volu.apply(zscore_fn)

    # # Chaikin's money flow
    # features['mf_40'] = volu_df.volu * ((2*log_pr_df.log_pr - features['pr_min_40'])
    #                             / (features['pr_max_40'] - features['pr_min_40']))
    # features['mf_40_ma'] = (features.groupby(level='stock').mf_40.apply(msd) / 
    #                         volu_df.groupby(level='stock').volu.apply(msd))

    # feature_dropped = features.iloc[x_begin_idx:x_end_idx]
    # 30-step backward difference of the wide log prices. It only acts as a
    # forward return because the response rows are sliced with a later start
    # than the feature rows (presumably y_begin_idx = x_begin_idx + 30 and
    # x_end_idx = -30 - verify against caller).
    response = log_pr.diff(30)
    # print(features.shape)
    # print(feature_dropped.shape)
    # print(response_dropped.shape)

    if grp_idx is not None:
        feature_dict = {}
        for key, idx_lis in grp_idx.items():
            # keep only this group's stocks, then transform back to wide format
            feature_df_dropped = wide_format(features.loc[pd.IndexSlice[:,idx_lis],:])
            # transform back to wide format
            feature_dict[key] = (feature_df_dropped.iloc[x_begin_idx:x_end_idx], 
                                            response[idx_lis].iloc[y_begin_idx:])
        return feature_dict
    else:
        # transform back to wide format
        feature_df_dropped = wide_format(features).iloc[x_begin_idx:x_end_idx]
        # feature_df_dropped = feature_df[x_begin_idx:x_end_idx]

        if print_cor:
            # diagnostics for the hard-coded stock ids 0..9
            for i in range(10):

                feature_train_0 = features.xs(i, level='stock').iloc[x_begin_idx:x_end_idx]
                print(feature_train_0.corrwith(response[i]))
                print(feature_train_0.isnull().sum())

        return feature_df_dropped, response.iloc[y_begin_idx:]

In [71]:
# One group per stock id 0..9, i.e. every stock gets its own feature matrix.
grp_idx = {i: [i] for i in range(10)}

# Positional row bounds passed to get_feature_train: the response slice starts
# 30 rows after the feature slice and the feature slice stops 30 rows early.
x_begin_idx = 41
x_end_idx = -30
y_begin_idx = 71

# Timestamp sitting 87841 rows before the end; used as the train/test boundary.
train_split_t = log_pr.index[-87841]
#vali_split_t = log_pr.index[-44641]

# NOTE(review): both slices below are label slices on train_split_t; with a
# label-based index such slices include the boundary row on both sides, so that
# row would appear in both sets - confirm.
train_feature_dict = get_feature_train(log_pr[:train_split_t], volu[:train_split_t], x_begin_idx, x_end_idx, y_begin_idx,
                                        grp_idx=grp_idx, print_cor=False)

test_feature_dict = get_feature_train(log_pr[train_split_t:], volu[train_split_t:], x_begin_idx, x_end_idx, y_begin_idx,
                                        grp_idx=grp_idx,print_cor=False)

## Modelling

In [4]:
from sklearn.metrics import mean_squared_error
from statsmodels.regression.linear_model import OLS

### Feature Selection with BIC

In [171]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import statsmodels.api as sm
import itertools as it

def forward_regression(X, y, max_features=5):
    '''
    Greedy forward feature selection for OLS, scored by BIC.

    At every step each not-yet-selected column is tried in turn (in an OLS fit
    that includes an intercept) and the column giving the lowest BIC is added.
    After at most `max_features` steps, the feature set of the step with the
    lowest BIC is kept.

    Input
    X,y: training matrix (without intercept) and response
    max_features: maximum number of features to select; capped at the number
        of columns of X
    Return
    model: OLS model refitted on the selected features only. Note that this
        final fit has NO intercept, unlike the fits used during selection.
    feature_selected: selected features, in the order they were added
    Raises
    ValueError: if there is no candidate feature to select from
    '''
    # Never try to pick more features than X offers.
    max_features = min(max_features, len(X.columns))
    if max_features < 1:
        raise ValueError("forward_regression needs at least one candidate feature")

    included = []
    best_bics = []
    best_features = []
    for _ in range(max_features):
        # Keep the column order of X so that ties are broken deterministically.
        excluded = [col for col in X.columns if col not in included]
        new_bic = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_bic.loc[new_column] = model.bic
        best_bics.append(new_bic.min())
        included.append(new_bic.idxmin())
        best_features.append(included.copy())

    # Step whose feature set reached the lowest BIC (NaN scores are skipped).
    best_step = pd.Series(best_bics, dtype=float).idxmin()
    feature_selected = best_features[best_step]
    model = sm.OLS(y, X[feature_selected]).fit()
    return model, feature_selected

In [172]:
# Run the forward selection on a single group and show the fitted summary.
# NOTE(review): the variables are suffixed 0 but the group key used is 2.
xtrain0, ytrain0 = train_feature_dict[2]
#xtest0, ytest0 = test_feature_dict[0]
# Give the response the same index as the features before fitting.
ytrain0 = ytrain0.set_index(xtrain0.index)
#ytest0 = ytest0.set_index(xtest0.index)
reg0,feature0 = forward_regression(xtrain0,ytrain0)
reg0.summary()

0,1,2,3
Dep. Variable:,2,R-squared (uncentered):,0.004
Model:,OLS,Adj. R-squared (uncentered):,0.004
Method:,Least Squares,F-statistic:,148.3
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,1.17e-157
Time:,16:30:58,Log-Likelihood:,592190.0
No. Observations:,177049,AIC:,-1184000.0
Df Residuals:,177044,BIC:,-1184000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
rsi_30_2,-4.871e-05,2.55e-06,-19.084,0.000,-5.37e-05,-4.37e-05
log_pr_std_30_2,0.0608,0.009,6.888,0.000,0.044,0.078
volu_z_score_2,0.0001,2.09e-05,5.227,0.000,6.83e-05,0.000
pr_min_40_2,-0.0010,5.56e-05,-17.935,0.000,-0.001,-0.001
log_volu_2,0.0003,1.31e-05,19.992,0.000,0.000,0.000

0,1,2,3
Omnibus:,93540.287,Durbin-Watson:,0.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2311386.103
Skew:,2.036,Prob(JB):,0.0
Kurtosis:,20.226,Cond. No.,22400.0


In [173]:
# Display the features chosen by forward_regression in the cell above.
feature0

['rsi_30_2', 'log_pr_std_30_2', 'volu_z_score_2', 'pr_min_40_2', 'log_volu_2']

In [148]:
# Display the candidate feature columns of the training matrix.
xtrain0.columns

Index(['log_pr_20_0', 'log_pr_30_0', 'pr_min_40_0', 'pr_max_40_0',
       'pr_so_40_0', 'pr_so_40d3_0', 'log_pr_std_30_0', 'rsi_30_0',
       'log_volu_0', 'volu_z_score_0'],
      dtype='object')

### Train function OLS

In [72]:
def train_OLS(feature_dict):
    """
    Fit one OLS model per group on the raw arrays of its features and response.

    Input:
    feature_dict (dict): key is group idx, value is a (feature matrix, response) tuple

    Returns:
    dict mapping each group idx to its fitted OLS results (no intercept is added)
    """
    return {
        key: OLS(target.values, design.values).fit()
        for key, (design, target) in feature_dict.items()
    }

# Fit the per-group OLS models on the training features.
mod_dict = train_OLS(train_feature_dict)

### Train function: subset OLS (forward selection)

In [176]:
def train_subOLS(feature_dict):
    """
    Fit one forward-selected OLS model per group.

    Input:
    feature_dict (dict): key is group idx, value is a (feature matrix, response) tuple

    Returns:
    mod_dict (dict): key is group idx, value is the model returned by forward_regression
    selected_dict (dict): key is group idx, value is the list of selected feature names
    """
    mod_dict = {}
    # Separate name on purpose: reusing `feature_dict` here would overwrite the
    # input and leave nothing to iterate over.
    selected_dict = {}
    for i, (X, y) in feature_dict.items():
        # Features and response are cut from different row ranges, so give the
        # response the feature index before fitting.
        y_aligned = y.copy()
        y_aligned.index = X.index
        model, feature = forward_regression(X, y_aligned)
        mod_dict[i] = model
        selected_dict[i] = feature
    return mod_dict, selected_dict

# Fit the forward-selected models.
# NOTE(review): this rebinds mod_dict, replacing the models produced by train_OLS.
mod_dict, fea_dict = train_subOLS(train_feature_dict)

{}

### Save Model

In [None]:
import pickle
# Write every fitted model into one file, one pickle record after another.
# Reading them back therefore takes one pickle.load call per model, in the
# iteration order of mod_dict.
with open("models.pckl", "wb") as f:
    for model in mod_dict.values():
        pickle.dump(model, f)

### Evaluation

In [74]:
def wide_format_test(df):
    """
    Collapse a per-stock feature frame into a single wide row.

    The index of `df` is moved into the columns, so the result has one column
    per (feature, index value) pair named '<feature>_<index value>'. The NaNs
    created by the pivot are dropped column by column.
    """
    flat = df.reset_index()
    pivoted = flat.pivot(columns='index')
    # Each pivoted column only holds values on its own rows; drop the gaps
    # and restart the row numbering.
    compact = pivoted.apply(lambda col: col.dropna().reset_index(drop=True))
    compact.columns = [name + '_' + str(key) for name, key in compact.columns]
    return compact

def get_feature_test(log_pr, volu, grp_idx=None):
    """
    Build the one-row feature matrix for the last timestamp of a window.

    The features, their names and their order mirror the ones produced by
    get_feature_train, because the models are fitted on plain arrays and
    predict by column position: log_pr_20, log_pr_30, pr_min_40, pr_max_40,
    pr_so_40, pr_so_40d3, log_pr_std_30, rsi_30, log_volu, volu_z_score.

    Input: 
    log_pr: window of log prices, one column per stock, oldest row first;
        needs at least 42 rows
    volu: window of volumes with the same row/column layout
    grp_idx (dict): key is group idx, value is list of stock idx

    Output:
    wide one-row data frame for all stocks when grp_idx is None, otherwise a
    dict mapping each group idx to the wide one-row data frame of its stocks
    """
    k_period = 40
    d_period = 3
    features = pd.DataFrame(index=log_pr.columns)

    # backward return over exactly i steps (row -1 against row -1-i), negated
    # as in training
    last_pr = log_pr.iloc[-1]
    for i in [20, 30]:
        features['log_pr_{}'.format(i)] = -(last_pr - log_pr.iloc[-1 - i]).values

    # 40-step range features; the extra d_period - 1 rows give the three
    # oscillator values that are averaged for pr_so_40d3
    recent = log_pr.iloc[-(k_period + d_period - 1):]
    pr_min = recent.rolling(k_period).min()
    pr_max = recent.rolling(k_period).max()
    pr_so = (recent - pr_min) * 100 / (pr_max - pr_min)
    features['pr_min_40'] = pr_min.iloc[-1].values
    features['pr_max_40'] = pr_max.iloc[-1].values
    features['pr_so_40'] = pr_so.iloc[-1].values
    # skipna=False: a rolling mean is NaN as soon as one of its inputs is NaN
    features['pr_so_40d3'] = pr_so.iloc[-d_period:].mean(skipna=False).values

    # backward rolling std
    features['log_pr_std_30'] = log_pr.iloc[-30:].std().values

    # RSI over the whole window, evaluated at the last row
    features['rsi_30'] = log_pr.apply(rsi, periods=30).iloc[-1].values

    # volume features; the z-score uses the same windows as training
    # (30-step mean, 240-step std)
    features['log_volu'] = np.log(volu.iloc[-1].values + 1)
    features['volu_z_score'] = ((volu.iloc[-1] - volu.iloc[-30:].mean()) / volu.iloc[-240:].std()).values

    if grp_idx is None:
        return wide_format_test(features)

    return {key: wide_format_test(features.loc[idx_lis]) for key, idx_lis in grp_idx.items()}

In [75]:
# Models read by get_r_hat: the in-memory fits; the commented-out alternative
# loads pickled models from disk instead.
model_dict = mod_dict #{i: pickle.load(open('../model/ridge{}.sav'.format(i), 'rb')) for i in range(2)}

def get_r_hat(A, B): 
    """
        Predict the forward 30-minutes returns of the 10 assets.

        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9

        Uses the module-level `model_dict`, one model per group of assets.
    """
    # One group per asset: group key -> list of asset positions in the output.
    grp_idx = {asset: [asset] for asset in range(10)}
    x = get_feature_test(A, B, grp_idx=grp_idx)

    pred_dict = {}
    for key, model in model_dict.items():
        pred_dict[key] = model.predict(x[key])

    out = np.zeros(10)
    for key, members in grp_idx.items():
        out[members] = pred_dict.get(key)
    return out

In [76]:
def evaluate_tune(log_pr_test, volu_test):
    """
    Score get_r_hat against the realised 30-step forward returns.

    Starting at row 1440, a prediction is made every 10th row from the
    trailing one-day window of prices and volumes. The elapsed time is
    printed.

    Returns:
    (per-column correlation between realised and predicted returns,
     correlation over all columns pooled, excluding the last 3 rows)
    """
    started = time.time()
    one_day = datetime.timedelta(days=1)
    eval_times = log_pr_test.index[1440::10]

    forward_return = log_pr_test.shift(-30) - log_pr_test
    r_fwd = forward_return.iloc[1440::10]
    r_hat = pd.DataFrame(index=eval_times, columns=log_pr_test.columns, dtype=np.float64)

    # compute the predictions every 10 minutes
    for t in eval_times:
        # inputs 1 day of log price and volume
        window = slice(t - one_day, t)
        r_hat.loc[t, :] = get_r_hat(log_pr_test.loc[window], volu_test.loc[window])
    t_used = time.time() - started
    print("Time used: ", t_used)

    # the final 3 rows are NaNs.
    pooled_true = r_fwd.iloc[:-3].values.ravel()
    pooled_pred = r_hat.iloc[:-3].values.ravel()
    pooled_corr = np.corrcoef(pooled_true, pooled_pred)[0, 1]
    return r_fwd.corrwith(r_hat), pooled_corr

In [78]:
# Evaluation data for the next cell: the rows up to train_split_t.
# NOTE(review): this is the same slice the models were trained on, so the
# resulting score is in-sample.
log_pr_test = log_pr[:train_split_t]
volu_test = volu[:train_split_t]

In [None]:
# Score the models on the data selected in the previous cell.
evaluate_tune(log_pr_test, volu_test)

In [77]:
# Evaluation data for the next cell: the complete price and volume history.
log_pr_test = log_pr
volu_test = volu

In [None]:
# Score the models on the data selected in the previous cell.
evaluate_tune(log_pr_test, volu_test)