# Linear Model with AIC feature engineering

The linear method uses backward log returns (log-price differences) to predict the forward log return, and hence the future log price.

Training:
1) Ridge regression: on 30 features
2) PC regression: pca on 30 features then perform ols

Features: 10 stocks, each with 3 backward returns (say, 3 min, 7 min and 10 min; see the rolling cross-correlation analysis). The code below uses lags of 2, 3 and 7 rows.

Response: fit 10 regressions, one per stock.

## Data Preparation

In [1]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

# Load the two pickled inputs used by every later cell
# (presumably per-asset log prices and traded volumes in USD -- confirm against the data source).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
log_pr = pd.read_pickle("../data/log_price.df")
volu = pd.read_pickle("../data/volume_usd.df")

In [2]:
# Data Preparation
def logr(dta, min):
    '''
    Backward difference of every column over a fixed number of rows.

    Input
    dta: pandas dataframe nxp
    min: backward length, i.e. how many rows to look back
         (the name shadows the builtin; kept so existing callers still work)
    Return
    pandas dataframe holding dta minus dta `min` rows earlier, with every
    row that contains a NaN removed (this includes the first `min` rows)
    '''
    lagged_diff = dta.diff(periods=min, axis=0)
    return lagged_diff.dropna()

In [3]:
# Index labels bounding the feature rows, and the first row of the response.
x_begin_idx, x_end_idx = log_pr.index[30], log_pr.index[-31]
y_begin_idx = log_pr.index[60]

# Features: backward differences over 2, 3 and 7 rows, placed side by side.
x = pd.concat(
    [logr(log_pr, lag)[x_begin_idx:x_end_idx] for lag in (2, 3, 7)],
    axis=1,
)
# Response: the 30-row difference, starting 30 rows after the first feature
# row and re-labelled with the feature index, so the row labelled t holds
# log_pr 30 rows after t minus log_pr at t.
y = log_pr.diff(30)[y_begin_idx:].set_index(x.index)

# Chronological split into train / validation / test at fixed offsets from the end.
train_end, vali_begin = log_pr.index[-87841], log_pr.index[-87840]
vali_end, test_begin = log_pr.index[-44641], log_pr.index[-44640]
xtrain, xvali, xtest = x[:train_end], x[vali_begin:vali_end], x[test_begin:]
ytrain, yvali, ytest = y[:train_end], y[vali_begin:vali_end], y[test_begin:]

In [4]:
# preprocessing
def remove_outliers(dta):
    '''
    Replace extreme entries with NaN (denotes a missing entry), column by column.

    An entry is an outlier when it lies more than 10 interquartile ranges
    away from the mean of its column.

    Input
    dta: pandas dataframe nxp
    Return
    pandas dataframe of the same shape with outliers set to NaN;
    the input frame is left untouched
    '''
    mean = dta.mean(0)
    iqr = dta.quantile(0.75, axis=0) - dta.quantile(0.25, axis=0)
    # Measure the distance from the column mean. Comparing |x| against
    # mean + 10 * IQR is wrong whenever the mean is not zero: it flags every
    # entry of a column with a negative mean and misses low-side outliers of
    # a column with a positive mean.
    is_outlier = (dta - mean).abs() > 10 * iqr
    # mask() returns a new frame, so the caller's data is not modified.
    return dta.mask(is_outlier)

## Ridge Regression

### Model Fitting

In [5]:
from sklearn.linear_model import Ridge

# Ridge regression (penalty strength alpha=1) fitted on the training split;
# a single estimator handles all response columns at once.
rr = Ridge(alpha=1).fit(X=xtrain, y=ytrain)

In [20]:
# 10-minute-rolling 30-min log-return prediction for validation set:
# predict every row, then keep every 10th one.
# Column labels must be an ordered sequence. A set has no defined order and
# recent pandas versions refuse a set as `columns`, so use range(10).
yvali_hat = pd.DataFrame(rr.predict(xvali), columns=range(10), index=yvali.index)[::10]
# compute pairwise correlation

In [21]:
# Column-by-column correlation between the predictions and the realised
# responses, taken on the same every-10th-row grid.
yvali_hat.corrwith(yvali.iloc[::10])

0    0.034817
1    0.079437
2    0.002021
3    0.037741
4    0.022657
5    0.003297
6    0.034381
7    0.006009
8    0.031586
9    0.025863
dtype: float64

### Evaluation

In [7]:
def get_r_hat(A, B, model=None):
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9
           (not used by the current linear model; kept for the evaluation interface)
        model: optional fitted regressor exposing predict(); when omitted the
           module-level ridge model `rr` is used
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
        raises: ValueError if A has too few rows to form the longest lag

        The features are the backward differences of the last row of A over
        2, 3 and 7 rows, concatenated in that order -- the same column layout
        as the training matrix built from the full history.
    """
    lags = (2, 3, 7)
    if len(A) <= max(lags):
        raise ValueError(
            f"need more than {max(lags)} rows of log prices, got {len(A)}"
        )
    if model is None:
        model = rr

    # Only the most recent row is needed for a prediction, so difference that
    # single row instead of rebuilding the whole feature matrix on every call.
    latest = A.iloc[-1]
    features = pd.concat([latest - A.iloc[-1 - lag] for lag in lags])
    # One-row frame whose columns repeat A's labels once per lag.
    x_new = features.to_frame().T
    return np.asarray(model.predict(x_new), dtype=float).reshape(-1)

# Walk-forward evaluation: one prediction every 10 rows, each computed from
# the trailing one-day window of prices and volumes. The whole loop is timed.
t0 = time.time()
dt = datetime.timedelta(days=1)
r_hat = pd.DataFrame(index=log_pr.index[30::10], columns=np.arange(10), dtype=np.float64)
for t in r_hat.index: # compute the predictions every 10 minutes
    window = slice(t - dt, t)
    r_hat.loc[t, :] = get_r_hat(log_pr.loc[window], volu.loc[window])
t_used = time.time() - t0
print(t_used)
r_hat

# Compute true forward log_returns every 10 minutes
fwd_diff = log_pr.shift(-30) - log_pr
asset_labels = {f"log_pr_{i}": i for i in range(10)}
r_fwd = fwd_diff.iloc[30::10].rename(columns=asset_labels)

# Correlation for every asset

r_fwd.corrwith(r_hat)

# Overall correlation (The ranking is based on this metric on the testing dataset)

# the final 3 rows are NaNs, so drop them from both sides before flattening.
r_fwd_all = np.ravel(r_fwd.iloc[:-3].values)
r_hat_all = np.ravel(r_hat.iloc[:-3].values)
np.corrcoef(r_fwd_all, r_hat_all)[0, 1]

KeyboardInterrupt: 

## PC Regression

In [None]:
# pca_trainx: training features after standardisation and projection onto
#             the principal components
# pca_trainy: training response (PCA is applied to the features only)

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression

# The training split is named xtrain / ytrain; the names `trainx` and
# `pca_trainy` used here before were never defined.
# NOTE: PCA() with default arguments keeps every component; pass
# n_components to actually reduce the dimension.
pca = PCA()
pca_trainx = pca.fit_transform(scale(xtrain))
pca_trainy = ytrain

# Ordinary least squares on the principal-component scores.
lr = LinearRegression().fit(pca_trainx, pca_trainy)


## Evaluation