In [1]:
import sys
sys.path.insert(1, '../')
sys.path.insert(1, '../regressions/')
sys.path.insert(1, '../tools/')

import pandas as pd
import numpy as np
import datetime as dt
from importlib import reload
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

plt.style.use('ggplot')
%matplotlib inline

import TSManager as ts

In [2]:
df = pd.read_pickle('../pkl_data/train_prepared.pkl')
df.head()

Unnamed: 0_level_0,IC,VRP,BDI,SI,DP,PE,BM,CAPE,PCAPrice,BY,...,TERM,CAY,SIM,NOS,CPI,PCR,MA,OIL,SPX,RF
AsOfDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-06-08,,0.737088,-0.152808,,0.063979,7.524271,,8.978524,3.616905,1.007418,...,0.5,-0.013576,0.715385,0.016941,0.046737,-1.622923,1.0,,-0.012302,
1990-06-11,,-0.853208,-0.163823,,0.063463,7.597792,,9.051612,3.583706,1.009642,...,0.49,-0.013576,0.707692,0.016941,0.046737,-1.615366,1.0,,0.008107,
1990-06-12,,-0.057387,-0.171766,,0.062662,7.699009,,9.167251,3.53272,1.00958,...,0.49,-0.013576,0.7,0.016941,0.046737,-1.618868,1.0,,0.012695,
1990-06-13,,1.332508,-0.180577,,0.062894,7.674771,,9.13346,3.547245,1.000056,...,0.47,-0.013576,0.692308,0.016941,0.046737,-1.625316,1.0,,-0.003693,
1990-06-14,,1.20454,-0.189466,,0.063241,7.636994,,9.0834,3.569002,0.997689,...,0.46,-0.013576,0.684615,0.016941,0.046737,-1.620454,1.0,,-0.005496,


In [3]:
x_cols = [x for x in df.columns if x not in ['SPX','RF']]

In [4]:
# constructing the target label
df['spx_tp130'] = df['SPX'].rolling(130).sum().shift(-129)
df = df.dropna(subset=['spx_tp130'])

## Program Parameters

In [5]:
LOOKBACK_YEARS = 10  # when regressing, how far back to look in years
VALIDATE_DAYS = 20  # when regressing, the number of days to hold parameters constant
MIN_X_THRESH = 0.5  # when training, drop x that is too sparse, otherwise median-fill after normalization

In [6]:
# normalization pipeline
scaler = StandardScaler()
si = SimpleImputer(strategy='median')
pipe = make_pipeline(scaler, si)

In [7]:
all_x = df[x_cols]
all_y = df['spx_tp130']
all_x.shape, all_y.shape

((6146, 19), (6146,))

## Function for scoring elastic net hparams:

we will compute the total SSE on the test 20 days for all train-test pairs, and use that as our parameter selection metric

In [8]:
def test_en_hparams(h_alpha, h_l1_ratio):
    start_i = LOOKBACK_YEARS * 252
    end_i = all_x.shape[0] - VALIDATE_DAYS
    tot_sse = 0.
    
    for i in tqdm(range(start_i, end_i + 1)):
        # forming the training period of n years of look back
        fit_start = i - LOOKBACK_YEARS * 252
        fit_end = i
        
        # forming the test period of just 20 days
        test_start = i
        test_end = i + 20
        
        cur_train = all_x[fit_start:fit_end]
        cur_test = all_x[test_start:test_end]
        cur_y = all_y[fit_start:fit_end]
        cur_y_test = all_y[test_start:test_end]
        
        # drop columns in train that do note have enough obs:
        cur_train = cur_train.dropna(axis=1, thresh=int(MIN_X_THRESH * cur_train.shape[0]))
        cur_test = cur_test[cur_train.columns]
        
        # pipe to norm/impute
        cur_train_norm = pipe.fit_transform(cur_train)
        cur_test_norm = pipe.fit_transform(cur_test)
        
        en = ElasticNet(alpha=h_alpha, l1_ratio=h_l1_ratio)
        en.fit(cur_train_norm, cur_y)
        
        # ** we will compute the total SSE on the test 20 days and use that as our parameter selection
        test_preds = en.predict(cur_test_norm)
        cur_sse = ((test_preds - cur_y_test.values) ** 2).sum()
        tot_sse += cur_sse
        
    return tot_sse

## Grid Search

In [9]:
try_alphas = [1.0, 0.1, 0.01, 0.001]
try_l1_ratios = [0.25, 0.5, 0.75]
res_dict = {}

for h_alpha in try_alphas:
    for h_l1_ratio in try_l1_ratios:
        res_dict[(h_alpha, h_l1_ratio)] = test_en_hparams(h_alpha, h_l1_ratio)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3607.0), HTML(value='')))

  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count


ValueError: shapes (20,17) and (18,) not aligned: 17 (dim 1) != 18 (dim 0)

In [None]:
res_dict

In [None]:
used_alpha, used_l1_ratio = 0.001, 0.25

## Loop to predict the forward returns:

In [None]:
# actual returns (log returns)
all_spx = df['SPX']

In [None]:
start_i = LOOKBACK_YEARS * 252
i = start_i

all_preds = {}

while i < all_x.shape[0]:
    fit_start = i - LOOKBACK_YEARS * 252
    fit_end = i
    cur_train = all_x[fit_start:fit_end]
    cur_idx = cur_train.index[-1]
    cur_y = all_y[fit_start:fit_end]
    
    # drop cols that do not have sufficient data
    cur_train = cur_train.dropna(axis=1, thresh=int(MIN_X_THRESH * cur_train.shape[0]))
    
    # normalization
    cur_train_norm = pipe.fit_transform(cur_train)
    cur_target = cur_train_norm[-1:]
    
    en = ElasticNet(alpha=used_alpha, l1_ratio=used_l1_ratio, max_iter=10000)
    en.fit(cur_train_norm, cur_y)
    pred = en.predict(cur_target)
    all_preds[cur_idx] = pred
    
    i += 20

In [None]:
df_preds = pd.DataFrame(all_preds, index=['predicted_spx130']).T

In [None]:
pmean, pstd = df_preds['predicted_spx130'].mean(), df_preds['predicted_spx130'].std()
pmean, pstd

In [None]:
df_preds['pred_z'] = (df_preds['predicted_spx130'] - pmean) / pstd
df_preds['position_unclipped'] = df_preds['pred_z'] / 2
df_preds['position'] = df_preds['position_unclipped'].clip(-0.5, 1.5)
df_preds.head()

In [None]:
df_preds_re = df_preds[['position']].resample('D').ffill()

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(df_preds_re['position'])
plt.title('SPX position as suggested by Elastic Net Model')
plt.axhline(0, linestyle='--', color='k')
plt.show()

## Evaluating Portfolio

In [None]:
df_eval = df_preds_re.join(all_spx).dropna()

df_eval['buy_hold'] = np.exp(df_eval['SPX'].cumsum())
df_eval['strat_rt_daily'] = 1 + (df_eval['position'] * (np.exp(df_eval['SPX']) - 1))
df_eval['strat_rti'] = df_eval['strat_rt_daily'].cumprod()

df_eval.head()

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(df_eval['buy_hold'], label='Buy-and-Hold')
plt.plot(df_eval['strat_rti'], label='Elastic Net Strategy')
plt.title('Value of $1 in Elastic Net Strategy vs Buy-and-Hold')
plt.legend()
plt.show()