# 5: Generating out-of-sample predictions

## Imports & Settings

In [1]:
import warnings
# Suppress every warning raised through the `warnings` module for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline

from time import time
import sys, os
from pathlib import Path

import pandas as pd
from scipy.stats import spearmanr

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Add the parent of the first `sys.path` entry to the import search path so that
# `utils` can be resolved from there (presumably where the shared helpers live).
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV

In [4]:
# Use seaborn's 'whitegrid' style for plots drawn after this point.
sns.set_style('whitegrid')

In [5]:
# Days per year used to size the out-of-sample window (presumably trading days).
YEAR = 252
# Shorthand for building MultiIndex slices inside `.loc[...]` selections.
idx = pd.IndexSlice

In [6]:
# Column names that describe the cross-validation scope of a tuning run.
scope_params = [
    'lookahead',
    'train_length',
    'test_length',
]

# Column names of the daily information-coefficient summary metrics.
daily_ic_metrics = [
    'daily_ic_mean',
    'daily_ic_mean_n',
    'daily_ic_median',
    'daily_ic_median_n',
]

# Hyperparameter column names, per model family.
lgb_train_params = [
    'learning_rate',
    'num_leaves',
    'feature_fraction',
    'min_data_in_leaf',
]
catboost_train_params = [
    'max_depth',
    'min_child_samples',
]

## Generate LightGBM predictions

### Model Configuration

In [7]:
# Settings shared by every LightGBM model; they are merged into the tuned
# hyperparameters right before training.
base_params = {
    'boosting': 'gbdt',
    'objective': 'regression',
    'verbose': -1,
}

# Columns handed to LightGBM as categorical features.
categoricals = [
    'year',
    'month',
    'sector',
    'weekday',
]

In [17]:
# Forecast horizon: selects the `r{lookahead:02}_fwd` label column and the
# matching rows of the tuning results.
lookahead = 1
# HDF5 file that receives the out-of-sample predictions.
store = Path('../data/predictions.h5')

### Get Data

In [18]:
# Read the 'model_data' table from the HDF5 store and sort it by its index.
data = pd.read_hdf('../data.h5', 'model_data').sort_index()

In [20]:
# Display the available columns (the `*_fwd` columns are the candidate labels).
data.columns

Index(['dollar_vol', 'dollar_vol_rank', 'rsi', 'bb_high', 'bb_low', 'NATR',
       'ATR', 'ppo', 'MACD', 'sector', 'r01', 'r05', 'r10', 'r21', 'r42',
       'r63', 'r01_fwd', 'r05_fwd', 'r21_fwd', 'year', 'month', 'weekday'],
      dtype='object')

In [10]:
# Target column for the configured forecast horizon, e.g. 'r01_fwd'.
label = f'r{lookahead:02}_fwd'
# Every column whose name contains '_fwd' is a candidate label ...
labels = sorted(column for column in data.columns if '_fwd' in str(column))
# ... and every remaining column is used as a model input.
features = list(data.columns.difference(labels))

In [11]:
# Slice the second index level from '2010' onwards, keep only the features plus
# the selected label, and drop every row that has a missing value.
data = data.loc[idx[:, '2010':], features + [label]].dropna()

In [12]:
# Replace each categorical column by integer codes; `sort=True` assigns the
# codes in the sorted order of the column's unique values.
for column in categoricals:
    codes = pd.factorize(data[column], sort=True)[0]
    data[column] = codes

In [13]:
# Wrap the full sample in a LightGBM Dataset once; the training loop below
# takes per-fold subsets of it instead of rebuilding a Dataset for each fold.
# NOTE(review): free_raw_data=False keeps the raw data attached - presumably
# required for the repeated `subset(...)` calls; confirm against LightGBM docs.
lgb_data = lgb.Dataset(data=data[features],
                       label=data[label],
                       categorical_feature=categoricals,
                       free_raw_data=False)

### Generate predictions

In [21]:
# Tuning results stored under 'lgb/daily_ic'; `get_lgb_params` ranks its rows
# by the 'ic' column to pick hyperparameter configurations.
lgb_daily_ic = pd.read_hdf('../data/lgbm_tuning.h5', 'lgb/daily_ic')
def get_lgb_params(data, t=5, best=0):
    """Return the hyperparameters of one ranked tuning result.

    Parameters
    ----------
    data : DataFrame
        Tuning results; must provide the columns 'lookahead', 'ic',
        'boost_rounds', the train/test length columns named in
        ``scope_params[1:]`` and the columns named in ``lgb_train_params``.
    t : int, default 5
        Lookahead whose results are considered.
    best : int, default 0
        Zero-based rank by descending 'ic' (0 is the highest value).

    Returns
    -------
    Series
        The selected row, restricted to the train/test lengths, the LightGBM
        training parameters and 'boost_rounds'.

    Raises
    ------
    IndexError
        If fewer than ``best + 1`` rows match the lookahead.
    """
    ranked = data.loc[data.lookahead == t].sort_values('ic', ascending=False)
    wanted = scope_params[1:] + lgb_train_params + ['boost_rounds']
    return ranked.iloc[best].loc[wanted]

In [15]:
# Number of top-ranked hyperparameter configurations to retrain. The same
# count drives the training loop AND the IC summary below, so every model that
# is trained and stored is also evaluated.
n_models = 7
# Length of the out-of-sample window, in years of `YEAR` days each.
oos_years = 12

for position in range(n_models):
    # Hyperparameters of the configuration ranked `position` for this lookahead.
    params = get_lgb_params(lgb_daily_ic,
                            t=lookahead,
                            best=position).to_dict()

    # The values originate from a pandas row, so cast the integer-valued
    # settings explicitly before they are used as counts/lengths.
    for p in ['min_data_in_leaf', 'num_leaves']:
        params[p] = int(params[p])
    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params.update(base_params)

    print(f'\nPosition: {position:02}')

    # Number of consecutive test windows needed to cover `oos_years` years.
    n_splits = int(oos_years * YEAR / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)
        # Train on this fold's rows only, reusing the shared Dataset.
        lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                    params=params).construct()
        model = lgb.train(params=params,
                          train_set=lgb_train,
                          num_boost_round=num_boost_round)

        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')

        # Select the columns in the order the model was trained on.
        y_pred = model.predict(test_set.loc[:, model.feature_name()])
        predictions.append(y_test.assign(prediction=y_pred))

    fold_predictions = pd.concat(predictions)
    if position == 0:
        # The first model defines the frame, including the `y_test` column.
        test_predictions = fold_predictions.rename(
            columns={'prediction': position})
    else:
        # Column assignment aligns on the index of `test_predictions`.
        test_predictions[position] = fold_predictions.prediction

# Daily rank correlation between realised and predicted values, one column per
# trained model.
by_day = test_predictions.groupby(level='date')
ic_by_day = pd.concat(
    {position: by_day.apply(
        lambda x, position=position: spearmanr(x.y_test, x[position])[0])
     for position in range(n_models)},
    axis=1)
print(ic_by_day.describe())
test_predictions.to_hdf(store, f'lgb/test/{lookahead:02}')


Position: 00
---------{dates}
1 ---------{dates}
2 ---------{dates}
3 ---------{dates}
4 ---------{dates}
5 ---------{dates}
6 ---------{dates}
7 ---------{dates}
8 ---------{dates}
9 ---------{dates}
10 ---------{dates}
11 ---------{dates}
12 ---------{dates}
13 ---------{dates}
14 ---------{dates}
15 ---------{dates}
16 ---------{dates}
17 ---------{dates}
18 ---------{dates}
19 ---------{dates}
20 ---------{dates}
21 ---------{dates}
22 ---------{dates}
23 ---------{dates}
24 ---------{dates}
25 ---------{dates}
26 ---------{dates}
27 ---------{dates}
28 ---------{dates}
29 ---------{dates}
30 ---------{dates}
31 ---------{dates}
32 ---------{dates}
33 ---------{dates}
34 ---------{dates}
35 ---------{dates}
36 ---------{dates}
37 ---------{dates}
38 ---------{dates}
39 ---------{dates}
40 ---------{dates}
41 ---------{dates}
42 ---------{dates}
43 ---------{dates}
44 ---------{dates}
45 ---------{dates}
46 ---------{dates}
47 ---------{dates}
48 
Position: 01
---------{dates}
1 --

In [16]:
# Number of distinct values in the first index level of the predictions.
len(test_predictions.index.unique(level=0))

58