# Linear Model with AIC feature engineering

The linear method makes use of the log backward return (log price difference) to predict foward return, thus decide log price.

Training:
1) Ridge regression: on 30 features
2) PC regression: pca on 30 features then perform ols

Feature: 10 stocks, each with 3 backward return (say, 3min, 7min, 10min, see rolling cross cor)

Response: do 10 regression, each for 1 stock. 

## Data Preparation

In [1]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

log_pr = pd.read_pickle("../data/log_price.df")
volu = pd.read_pickle("../data/volume_usd.df")

In [10]:
# Data Preparation
def logr(dta,min):
    '''
    Input
    dta: pandas dataframe nxp
    min: backward length
    Return
    log return: pandas dataframe
    '''
    return dta.diff(min,0).dropna()

In [113]:
x_begin_idx = log_pr.index[30]
x_end_idx = log_pr.index[-31]
y_begin_idx = log_pr.index[60]

x = pd.concat((logr(log_pr,2)[x_begin_idx:x_end_idx],
    logr(log_pr,3)[x_begin_idx:x_end_idx],
    logr(log_pr,7)[x_begin_idx:x_end_idx]),axis=1)
y = log_pr.diff(30)[y_begin_idx:]
y = y.set_index(x.index)

xtrain,xvali,xtest = x[:log_pr.index[-87841]],x[log_pr.index[-87840]:log_pr.index[-44641]],x[log_pr.index[-44640]:]
ytrain,yvali,ytest= y[:log_pr.index[-87841]],y[log_pr.index[-87840]:log_pr.index[-44641]],y[log_pr.index[-44640]:]

In [67]:
# preprocessing
def remove_outliers(dta):
    # Compute the mean and interquartile range
    mean = dta.mean(0)
    iqr = dta.quantile([0.25, 0.75], axis=0).diff().T.iloc[:, 1]
    # Replace entries that are more than 10 times the IQR
    # away from the mean with NaN (denotes a missing entry)
    mask = np.abs(dta) > mean + 10 * iqr
    treated = dta.copy()
    treated[mask] = np.nan
    return treated

## Ridge Regression

### Model Fitting

In [114]:
from sklearn.linear_model import Ridge

rr = Ridge(alpha=1).fit(xtrain,ytrain)

In [122]:
yvali

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-11-01 00:00:00,-0.000080,-0.018890,-0.004244,0.003424,-0.001426,-0.000447,-0.000266,0.001331,0.000045,0.000003
2021-11-01 00:01:00,-0.001046,-0.017175,-0.005742,0.003257,-0.000458,-0.000348,-0.000544,0.001474,-0.000374,-0.000803
2021-11-01 00:02:00,-0.001203,-0.013807,-0.003393,0.002812,0.000897,-0.001060,-0.000253,0.000604,-0.002556,-0.002298
2021-11-01 00:03:00,-0.000013,-0.006774,-0.003745,0.003471,0.000641,-0.001734,-0.000300,0.000494,-0.004369,-0.003059
2021-11-01 00:04:00,0.002705,-0.008016,-0.004659,0.004349,0.001408,-0.002230,0.000281,0.000079,-0.007239,-0.004448
...,...,...,...,...,...,...,...,...,...,...
2021-11-30 23:55:00,0.002072,-0.001540,0.010375,0.000360,0.005542,-0.002265,0.001956,-0.001994,0.000370,0.007541
2021-11-30 23:56:00,0.002903,-0.000311,0.009662,0.000788,0.005605,-0.002692,0.002851,-0.001979,0.000764,0.008665
2021-11-30 23:57:00,0.002459,-0.000767,0.009689,0.004306,0.006827,-0.001337,0.003033,-0.001348,0.000664,0.008521
2021-11-30 23:58:00,0.003693,-0.001451,0.012513,0.002922,0.006866,-0.000853,0.002492,-0.001700,0.001084,0.009537


In [127]:
pd.DataFrame(rr.predict(xvali),columns=[str(i) for i in range(10)],index=yvali.index)[::10].corrwith(yvali[::10])

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
dtype: float64

### Evaluation

In [None]:
def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    x_begin_idx = log_pr.index[30]
    x_end_idx = log_pr.index[-31]
    y_begin_idx = log_pr.index[60]

    x = pd.concat((logr(log_pr,2)[x_begin_idx:x_end_idx],
        logr(log_pr,3)[x_begin_idx:x_end_idx],
        logr(log_pr,7)[x_begin_idx:x_end_idx]),axis=1)
    y = log_pr.diff(30)[y_begin_idx:]
    y = y.set_index(x.index)
    return 

def evaluate(log_pr_test, volu_test):
    t0 = time.time()
    dt = datetime.timedelta(days=1)

    r_fwd = (log_pr_test.shift(-30) - log_pr_test).iloc[1440::10]
    r_hat = pd.DataFrame(index=log_pr_test.index[1440::10], columns=np.arange(10), dtype=np.float64)

    for t in log_pr_test.index[1440::10]: # compute the predictions every 10 minutes
        # inputs 1 day of log price and volume
        r_hat.loc[t, :] = get_r_hat(log_pr_test.loc[(t - dt):t], volu_test.loc[(t - dt):t])
    t_used = time.time() - t0
    print("Time used: ", t_used)

    r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
    r_hat_all = r_hat.iloc[:-3].values.ravel()
    return np.corrcoef(r_fwd_all, r_hat_all)[0,1]

In [None]:
t0 = time.time()
dt = datetime.timedelta(days=1)

r_fwd = yval
r_hat = rr.predict(xval)

for t in log_pr_test.index[1440::10]: # compute the predictions every 10 minutes
    # inputs 1 day of log price and volume
    r_hat.loc[t, :] = get_r_hat(log_pr_test.loc[(t - dt):t], volu_test.loc[(t - dt):t])
t_used = time.time() - t0
print("Time used: ", t_used)

r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
r_hat_all = r_hat.iloc[:-3].values.ravel()
return np.corrcoef(r_fwd_all, r_hat_all)[0,1]

## PC Regression

In [None]:
# pca_trainx
# pca_trainy

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression

pca = PCA()
pca_trainx = pca.fit_transform(scale(trainx))

lr = LinearRegression().fit(pca_trainx,pca_trainy)


## Evaluation