# OLS

The linear method makes use of the log backward return (log price difference) to predict foward return.

Training:
1) Ridge regression: on 30 features
2) PC regression: pca on 30 features then perform ols

Feature: 10 stocks, each with 3 backward return (say, 3min, 7min, 10min, see correlation to decide)

Response: 10 stocks' 30min forward return. 

Groups: [1,3,5,9],[2,4,7],[0],[3],[9]

## Data Preparation

In [2]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

log_pr = pd.read_pickle("../data/log_price.df")
volu = pd.read_pickle("../data/volume_usd.df")

In [3]:
def wide_format(df):
    df_= df.reset_index(level=['stock']).sort_index()
    df_ = df_.pivot(columns ='stock')
    df_.columns = df_.columns.get_level_values(0) + '_' +  [str(x) for x in df_.columns.get_level_values(1)]

    return df_


def get_feature_train(log_pr, volu, x_begin_idx, x_end_idx, y_begin_idx, 
                        grp_idx=None, rm_outlier=False, print_cor=True):
    """
    Input:
    log_pr (pdSeries): train set
    volu (pdSeries): train set
    x_begin_idx (pdIndex): to truncate the NaNs
    grp_idx (dict): key is group idx, value is list of stock idx

    Returns:
    feature_dict (dict): key is group idx, value is a tuple of feature matrix and response
    """

    log_pr_df = log_pr.reset_index().melt(id_vars=['timestamp'])
    log_pr_df.columns = ['timestamp', 'stock', 'log_pr']
    log_pr_df = log_pr_df.set_index(['timestamp', 'stock']).sort_index()

    volu_df = volu.reset_index().melt(id_vars=['timestamp'])
    volu_df.columns = ['timestamp', 'stock', 'volu']
    volu_df = volu_df.set_index(['timestamp', 'stock']).sort_index()

    features = pd.DataFrame(index=log_pr_df.index)
    # log_pr feature
    for i in [10, 20, 30]:
        features['log_pr_{}'.format(i)] = -log_pr_df.groupby(level='stock').log_pr.diff(i)

    std_10 = lambda x: x.rolling(10).std()
    features['log_pr_std_10'] = log_pr_df.groupby(level='stock').log_pr.apply(std_10)

    # volume feature
    log_fn = lambda x: np.log(x+1)
    features['log_volu'] = volu_df.groupby(level='stock').volu.apply(log_fn)

    # stdised volume in 2 hours backward rolling windows
    zscore_fn = lambda x: (x - x.rolling(window=240, min_periods=20).mean()) / x.rolling(window=240, min_periods=20).std()
    features['volu_z_score'] = volu_df.groupby(level='stock').volu.apply(zscore_fn)

    # feature_dropped = features.iloc[x_begin_idx:x_end_idx]
    response = log_pr.diff(30)

    if grp_idx is not None:
        feature_dict = {}
        for key, idx_lis in grp_idx.items():
            feature_df_dropped = wide_format(features.loc[pd.IndexSlice[:,idx_lis],:])
            # transform back to wide format
            feature_dict[key] = (feature_df_dropped.iloc[x_begin_idx:x_end_idx], 
                                            response[idx_lis].iloc[y_begin_idx:])
        return feature_dict
    else:
        # transform back to wide format
        feature_df_dropped = wide_format(features).iloc[x_begin_idx:x_end_idx]
        # feature_df_dropped = feature_df[x_begin_idx:x_end_idx]
    
        if print_cor:
            for i in range(10):
                feature_train_0 = features.xs(i, level='stock')
                print(feature_train_0.corrwith(response[i]))

        return feature_df_dropped, response.iloc[y_begin_idx:]

In [4]:
grp_idx = {i: [i] for i in range(10)}

x_begin_idx = 30
x_end_idx = -30
y_begin_idx = 60

train_split_t = log_pr.index[-87841]
vali_split_t = log_pr.index[-44641]

train_feature_dict = get_feature_train(log_pr[:train_split_t], volu[:train_split_t], x_begin_idx, x_end_idx, y_begin_idx,
                                        grp_idx=grp_idx, print_cor=False)

vali_feature_dict = get_feature_train(log_pr[train_split_t:], volu[train_split_t:], x_begin_idx, x_end_idx, y_begin_idx,
                                        grp_idx=grp_idx,print_cor=False)

### Modelling

In [5]:
from sklearn.metrics import mean_squared_error
from statsmodels.regression.linear_model import OLS

#### Train Ridge Model 1

In [6]:
xtrain0, ytrain0 = train_feature_dict[0]
xvali0, yvali0 = vali_feature_dict[0]

In [7]:
xtrain3, ytrain3 = train_feature_dict[3]
xvali3, yvali3 = vali_feature_dict[3]

### Fit OLS with current features

In [8]:
reg0 = OLS(ytrain0.values, xtrain0.values).fit()

In [9]:
reg3 = OLS(ytrain3.values, xtrain3.values).fit()

In [10]:
reg0.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.002
Model:,OLS,Adj. R-squared (uncentered):,0.002
Method:,Least Squares,F-statistic:,47.58
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,1.17e-58
Time:,09:29:15,Log-Likelihood:,665080.0
No. Observations:,177060,AIC:,-1330000.0
Df Residuals:,177054,BIC:,-1330000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0029,0.006,0.518,0.604,-0.008,0.014
x2,0.0063,0.006,1.116,0.264,-0.005,0.017
x3,0.0360,0.004,8.861,0.000,0.028,0.044
x4,0.0544,0.016,3.404,0.001,0.023,0.086
x5,-8.123e-06,1.82e-06,-4.470,0.000,-1.17e-05,-4.56e-06
x6,3.569e-06,1.27e-05,0.281,0.778,-2.13e-05,2.84e-05

0,1,2,3
Omnibus:,86762.629,Durbin-Watson:,0.077
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2607255.704
Skew:,1.769,Prob(JB):,0.0
Kurtosis:,21.463,Cond. No.,14900.0


In [11]:
reg3.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.004
Model:,OLS,Adj. R-squared (uncentered):,0.004
Method:,Least Squares,F-statistic:,124.4
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,1.3500000000000002e-157
Time:,09:29:15,Log-Likelihood:,683860.0
No. Observations:,177060,AIC:,-1368000.0
Df Residuals:,177054,BIC:,-1368000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0460,0.005,8.436,0.000,0.035,0.057
x2,0.0233,0.005,4.298,0.000,0.013,0.034
x3,0.0300,0.004,7.550,0.000,0.022,0.038
x4,0.1576,0.016,9.959,0.000,0.127,0.189
x5,-9.002e-06,1.6e-06,-5.627,0.000,-1.21e-05,-5.87e-06
x6,8.337e-06,1.14e-05,0.730,0.465,-1.4e-05,3.07e-05

0,1,2,3
Omnibus:,93860.98,Durbin-Watson:,0.076
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2989866.044
Skew:,1.965,Prob(JB):,0.0
Kurtosis:,22.744,Cond. No.,16800.0


In [12]:
mean_squared_error(ytrain0, reg0.predict())

3.198004176691234e-05

In [13]:
mean_squared_error(ytrain3, reg3.predict())

2.58681160398662e-05

In [14]:
mean_squared_error(yvali0, reg0.predict(xvali0))

4.055614327929686e-05

In [15]:
mean_squared_error(yvali3, reg3.predict(xvali3))

1.969906945327399e-05

In [19]:
np.corrcoef(yvali3.squeeze(), reg3.predict(xvali3).squeeze())

array([[1.        , 0.00225744],
       [0.00225744, 1.        ]])

In [20]:
np.corrcoef(yvali0.squeeze(), reg0.predict(xvali0).squeeze())

array([[ 1.        , -0.00256128],
       [-0.00256128,  1.        ]])

### Train function

In [None]:
def train_OLS(feature_dict):
    mod_dict = {}
    for i, (X, y) in feature_dict.items():
        mod_dict[i] = OLS(y, X).fit()

    return mod_dict

In [None]:
mod_dict = train_OLS(train_feature_dict)

#### Save Model

In [180]:
import pickle
with open("models.pckl", "wb") as f:
    for model in mod_dict.values():
        pickle.dump(model, f)

### Evaluation

In [166]:
def wide_format_test(df):
    df_= df.reset_index()
    df_ = df_.pivot(columns ='index').apply(lambda s: s.dropna().reset_index(drop=True))
    df_.columns = df_.columns.get_level_values(0) + '_' +  [str(x) for x in df_.columns.get_level_values(1)]

    return df_

def get_feature_test(log_pr, volu, grp_idx=None):
    """
    Input: 
    log_pr (pdSeries): 1 day of log pr 
    volu (pdSeries): 1 day of volume

    Output:
    test data frame
    """
    features = pd.DataFrame(index=log_pr.columns)

    # backward return
    # print(-(log_pr.iloc[-1] - log_pr.iloc[-30]).values)
    for i in [10, 20, 30]:
        features['log_pr_{}'.format(i)] = -(log_pr.iloc[-1] - log_pr.iloc[-i]).values
    # backward rolling std
    features['log_pr_std_10'] = log_pr.iloc[-10:].std(0).values
    
    # volume features
    features['log_volu'] = np.log(volu.iloc[-1].values + 1)
    features['volu_z_score'] = ((volu.iloc[-1] - volu.iloc[-240:].mean())/volu.iloc[-240:].std()).values

    if grp_idx is None:
        return wide_format_test(features)
    else:
        df_dict = {}
        for key, idx_lis in grp_idx.items():
            df_dict[key] = wide_format_test(features.loc[idx_lis])
        return df_dict


In [167]:
model_dict = {i: pickle.load(open('../model/ridge{}.sav'.format(i), 'rb')) for i in range(2)}

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    grp_idx = {0:[1,5,6,8], 1:[0,2,3,4,7,9]}
    x = get_feature_test(A, B, grp_idx=grp_idx)
    pred_dict = {i: model.predict(x[i]) for i, model in model_dict.items()}
    
    out = np.zeros(10)
    for keys, idx in grp_idx.items():
        out[idx] = pred_dict.get(keys)

    return out

In [168]:
# def get_r_hat_tune(A, B):
#     # grp_idx = {0:[1,5,6,8], 1:[0,2,3,4,7,9]}
#     x = get_feature_test(A, B)
#     return rr.predict(x)

In [169]:
def evaluate_tune(log_pr_test, volu_test):

    t0 = time.time()
    dt = datetime.timedelta(days=1)

    r_fwd = (log_pr_test.shift(-30) - log_pr_test).iloc[1440::10]
    # r_fwd = return_true.iloc[1440::10]
    # r_fwd.index = log_pr_test.index[1440::10]
    r_hat = pd.DataFrame(index=log_pr_test.index[1440::10], columns=log_pr_test.columns, dtype=np.float64)

    for t in log_pr_test.index[1440::10]: # compute the predictions every 10 minutes
        # inputs 1 day of log price and volume
        r_hat.loc[t, :] = get_r_hat(log_pr_test.loc[(t - dt):t], volu_test.loc[(t - dt):t])
    t_used = time.time() - t0
    print("Time used: ", t_used)

    r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
    r_hat_all = r_hat.iloc[:-3].values.ravel()
    return r_fwd.corrwith(r_hat), np.corrcoef(r_fwd_all, r_hat_all)[0,1]

In [171]:
log_pr_test = log_pr
volu_test = volu

In [172]:
evaluate_tune(log_pr_test, volu_test)

Time used:  222.40197610855103


(0    0.056759
 1    0.076502
 2    0.053963
 3    0.056852
 4    0.126005
 5    0.074002
 6    0.079662
 7    0.041148
 8    0.110821
 9    0.090038
 dtype: float64,
 0.07260984465426996)

In [173]:
log_pr_train_vali = log_pr[:train_split_t]
volu_train_vali = volu[:train_split_t]

In [174]:
evaluate_tune(log_pr_train_vali, volu_train_vali)

Time used:  146.94605422019958


(0    0.043323
 1    0.060267
 2    0.048021
 3    0.065409
 4    0.124495
 5    0.073912
 6    0.090818
 7    0.045182
 8    0.106579
 9    0.076561
 dtype: float64,
 0.0659273964246186)