# Ensemble

In [None]:
#hide 
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from IPython.display import display, Image

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from lightgbm.sklearn import LGBMRegressor

from skfin.plot import line, bar
from skfin.datasets import load_kf_returns
from skfin.mv_estimators import MeanVariance 
from skfin.backtesting import Backtester
from skfin.metrics import sharpe_ratio
from skfin.estimators import RidgeCV, MultiOutputRegressor, MLPRegressor

# Load the returns dataset (cached under `data`) and keep the monthly
# value-weighted average returns, restricted by the `[:'1999']` slice.
returns_data = load_kf_returns(cache_dir='data')
monthly_returns = returns_data['Monthly']
ret = monthly_returns['Average_Value_Weighted_Returns'][:'1999']

def transform_X(x):
    """Return the 12-row rolling mean of `x` as an array, with missing values set to 0."""
    rolling_mean = x.rolling(12).mean()
    return rolling_mean.fillna(0).values


def transform_y(x):
    """Return `x` shifted by -1 (each row holds the following row's values) as an array."""
    return x.shift(-1).values
# Build the predictors and the targets from the same returns frame.
features, target = transform_X(ret), transform_y(ret)

## Ensemble 

Rather than choosing a single estimator (or set of parameters) among many, another strategy is to combine all the candidate estimators/parameters. `scikit-learn` makes this possible with classes such as `VotingRegressor`.

In [None]:
from skfin.estimators import Ridge
from sklearn.ensemble import VotingRegressor

class VotingRegressor(VotingRegressor):
    """`VotingRegressor` whose `transform` returns the combined prediction.

    The subclass deliberately reuses the name of the imported base class, so
    later references to `VotingRegressor` resolve to this version.
    """

    def transform(self, X):
        """Return `self.predict(X)` so the regressor can feed a downstream pipeline step."""
        predictions = self.predict(X)
        return predictions

In [None]:
# Named base regressors for the voting ensemble: two ridges with alpha=1 and alpha=100.
estimators_ = [
    ('ridge1', Ridge(alpha=1)),
    ('ridge2', Ridge(alpha=100)),
]

By default, `VotingRegressor` applies equal weights across regressors. 

In [None]:
# Scale the features (without centering), predict each output with the voting
# regressor, then pass the predictions to the mean-variance step.
voting = MultiOutputRegressor(VotingRegressor(estimators=estimators_))
estimator = make_pipeline(
    StandardScaler(with_mean=False),
    voting,
    MeanVariance(),
)

In [None]:
# Backtest the voting pipeline and plot its pnl (with cumsum=True).
backtester = Backtester(estimator, ret)
m = backtester.train(features, target)
line(m.pnl_, cumsum=True, title='Voting regressor')

In `scikit-learn`, there is also a `StackingRegressor` but it requires a bit more work to make it work with `MultiOutputRegressor` (and constraints on transform/regressors). 

## Rolling ensemble backtest

In this section, we build a custom ensemble method to learn weights on different estimators from pnls. 

### StackingBacktester

In this section, we consider three estimators: 
    
- the simple Industry momentum. 

- a strategy that learns cross-industry effect with `Ridge`. 

- a strategy that learns cross-industry effect with `Lightgbm`. 

In [None]:
# Three competing strategies, keyed by name (insertion order matters for the
# weight columns produced later).
ridge_pipeline = make_pipeline(StandardScaler(with_mean=False), Ridge(), MeanVariance())
lightgbm_regressor = LGBMRegressor(min_child_samples=5, n_estimators=25, n_jobs=1)
lightgbm_pipeline = make_pipeline(MultiOutputRegressor(lightgbm_regressor), MeanVariance())

estimators = {
    'momentum': MeanVariance(),
    'ridge': ridge_pipeline,
    'lightgbm': lightgbm_pipeline,
}

In [None]:
# Backtest every estimator separately and collect the pnl series side by side,
# one column per estimator name.
pnl_by_estimator = {}
for name, model in estimators.items():
    pnl_by_estimator[name] = Backtester(model, ret).train(features, target).pnl_
pnls = pd.concat(pnl_by_estimator, axis=1)

In [None]:
# Equal-weight combination: sum of the estimator pnls divided by sqrt(number of estimators).
equal_weight_pnl = pnls.sum(axis=1).div(np.sqrt(pnls.shape[1]))
pnls_ = pnls.assign(equal_weight=equal_weight_pnl)
line(pnls_, cumsum=True)

The average correlation is not particularly high, which explains why a simple ensemble seems to help. 

In [None]:
# Average the pairwise correlations between estimators. The diagonal is masked
# by position rather than by value (`x != 1`): a value test would keep a
# diagonal entry that rounds slightly off 1 and would drop a genuinely
# perfectly-correlated pair.
pnl_corr = pnls.corr()
off_diagonal = ~np.eye(len(pnl_corr), dtype=bool)
print(f'The average pnl correlation between estimators is {pnl_corr.where(off_diagonal).stack().mean():.2f}')

We introduce a `StackingBacktester` with the `sklearn` api. 

In [None]:
%%writefile ../skfin/ensemble.py
import numpy as np 
import pandas as pd 
from skfin.mv_estimators import Mbj 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
    

class StackingBacktester:
    """Walk-forward backtester that combines several estimators with learned weights.

    At every split of a `TimeSeriesSplit`, each estimator is refit on the
    training rows and predicts holdings for the test row. Once more than
    `min_periods` splits have elapsed, `final_estimator` is fit on the trailing
    `window` pnls of the estimators and its `coef_` is used to mix their
    holdings into an additional 'ensemble' position.

    Parameters
    ----------
    estimators : dict
        Maps a name to an object exposing `fit(X, y)` and `predict(X)`. The
        last estimator must expose a fitted `V_`, either directly or through
        a pipeline step named 'meanvariance'; it is used to rescale the
        ensemble holdings.
    ret : pandas.DataFrame
        Returns; sliced with date labels along the index, one column per
        holding (presumably one per asset).
    max_train_size, test_size : int
        Forwarded to `TimeSeriesSplit`.
    start_date, end_date : str or None
        Bounds used to size the number of splits and to trim `ret` / `pnls_`.
    window : int
        Maximum number of trailing pnl observations given to `final_estimator`.
    min_periods : int
        The ensemble is only formed for split numbers strictly above this.
    final_estimator : object, optional
        Exposes `fit(pnls)` returning an object with `coef_`. A fresh `Mbj()`
        is created when omitted.

    Attributes
    ----------
    h_ : dict of pandas.DataFrame
        Holdings per estimator name, plus 'ensemble' (set by `train`).
    pnls_ : pandas.DataFrame
        Pnl per estimator name, plus 'ensemble' (set by `train`).
    coef_ : pandas.DataFrame
        Ensemble weights per split, one column per estimator (set by `train`).
    """

    def __init__(
        self,
        estimators,
        ret,
        max_train_size=36,
        test_size=1,
        start_date="1945-01-01",
        end_date=None,
        window=60,
        min_periods=60,
        final_estimator=None,
    ):
        self.start_date = start_date
        self.end_date = end_date
        self.estimators = estimators
        self.ret = ret[: self.end_date]
        self.cv = TimeSeriesSplit(
            max_train_size=max_train_size,
            test_size=test_size,
            n_splits=1 + len(ret.loc[start_date:end_date]) // test_size,
        )
        self.window = window
        self.min_periods = min_periods
        # Build the default lazily: an instance created in the signature would
        # be shared (and refit) by every backtester relying on the default.
        self.final_estimator = Mbj() if final_estimator is None else final_estimator

    @staticmethod
    def _risk_matrix(estimator):
        """Return the fitted `V_` of `estimator`, or of its 'meanvariance' pipeline step."""
        if hasattr(estimator, "named_steps"):
            return estimator.named_steps["meanvariance"].V_
        return estimator.V_

    def train(self, features, target):
        """Run the walk-forward loop and populate `h_`, `pnls_` and `coef_`.

        Parameters
        ----------
        features, target : array-like
            Indexed positionally with the train/test indices of each split.

        Returns
        -------
        StackingBacktester
            `self`, to allow chaining.
        """
        cols = self.ret.columns
        names = list(self.estimators)
        # The splits are needed twice (index construction and the main loop).
        splits = list(self.cv.split(self.ret))
        idx = self.ret.index[np.concatenate([test for _, test in splits])]

        _h = {k: [] for k in names + ["ensemble"]}
        _pnls = {k: [] for k in names}
        _coef = []
        for i, (train, test) in enumerate(splits):
            use_ensemble = i > self.min_periods
            if use_ensemble:
                # Learn the mixing weights from the trailing pnls only.
                pnl_window = np.stack(
                    [np.array(_pnls[k][-self.window:]) for k in names], axis=1
                )
                coef_ = self.final_estimator.fit(pnl_window).coef_
            else:
                # Warm-up: zero weight on every estimator, whatever their number.
                coef_ = np.zeros(len(names))
            _coef.append(coef_)

            h_ = {}
            for k, m in self.estimators.items():
                m.fit(features[train], target[train])
                h_[k] = m.predict(features[test])
                _h[k].append(h_[k])
                # NOTE(review): `idx` is indexed by split number here, which
                # matches the test rows only when each split has a single
                # test row -- confirm before using test_size > 1.
                if i + 1 < len(idx):
                    _pnls[k].append(self.ret.loc[idx[i + 1]].dot(np.squeeze(h_[k])))

            if use_ensemble:
                h_ensemble = (
                    np.stack([np.squeeze(h_[k]) for k in names], axis=1)
                    .dot(coef_)
                    .reshape(-1, 1)
                )
                # Rescale with the `V_` of the last fitted estimator.
                V_ = self._risk_matrix(m)
                h_ensemble = h_ensemble / np.sqrt(
                    np.diag(h_ensemble.T.dot(V_.dot(h_ensemble)))
                )
            else:
                h_ensemble = np.zeros([len(cols), 1])
            _h["ensemble"].append(h_ensemble.T)

        self.h_ = {
            k: pd.DataFrame(np.concatenate(v), index=idx, columns=cols)
            for k, v in _h.items()
        }
        # Holdings are shifted by one row before being multiplied by returns.
        self.pnls_ = pd.concat(
            {
                k: v.shift(1).mul(self.ret).sum(axis=1)[self.start_date:]
                for k, v in self.h_.items()
            },
            axis=1,
        )
        self.coef_ = pd.DataFrame(np.stack(_coef), index=idx, columns=names)
        return self

In [None]:
from skfin.mv_estimators import Mbj

In [None]:
# Fit Mbj on the full pnl history and show the weight given to each estimator.
m = Mbj()
m.fit(pnls)
in_sample_weights = pd.Series(m.coef_, index=pnls.columns)
bar(in_sample_weights)

The in-sample optimal weights improve the Sharpe ratio even further -- but this is `in-sample`! 

In [None]:
# Add the pnl produced by a freshly fitted Mbj on the same data to the comparison plot.
in_sample_pnl = Mbj().fit_transform(pnls)
line(pnls_.assign(in_sample_optimal=in_sample_pnl), cumsum=True)

The `StackingBacktester` computes the performance with the `MBJ` learned weights. 

In [None]:
from skfin.ensemble import StackingBacktester

In [None]:
# Rolling ensemble with the default final estimator; keep its pnl next to the others.
stacking = StackingBacktester(
    estimators=estimators,
    ret=ret,
    window=60,
    min_periods=60,
)
m = stacking.train(features, target)
pnls = pnls.assign(ensemble_mbj=m.pnls_['ensemble'])

In [None]:
# Plot (with cumsum=True) the pnls of the individual estimators and of the
# ensemble from 1950-02-01 onwards; the ensemble holds nothing during the
# first `min_periods` splits.
line(m.pnls_['1950-02-01':], cumsum=True)

To understand why the performance is lower, it is useful to look at the weights -- in this case, the weights are often negative. 

In [None]:
# Plot the ensemble weights over time, one series per estimator.
line(m.coef_)

We redo the exercise with a positive-weight constraint. 

In [None]:
# Same rolling ensemble, with the final estimator built as Mbj(positive=True).
positive_stacking = StackingBacktester(
    estimators=estimators,
    ret=ret,
    window=60,
    min_periods=60,
    final_estimator=Mbj(positive=True),
)
m = positive_stacking
m.train(features, target)
pnls['ensemble_mbj_positive'] = m.pnls_['ensemble']

In [None]:
# Left panel: pnls from 1950-02-01 onwards; right panel: ensemble weights.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
pnl_ax, coef_ax = ax[0], ax[1]
line(m.pnls_['1950-02-01':], cumsum=True, ax=pnl_ax, loc='best')
line(m.coef_, ax=coef_ax, loc='best')

Over longer periods with positive constraints, the performance is closer to the industry momentum.

In [None]:
# Positive-weight ensemble again, with the weights learned on a 180-row window.
long_window_stacking = StackingBacktester(
    estimators=estimators,
    ret=ret,
    window=180,
    min_periods=60,
    final_estimator=Mbj(positive=True),
)
m = long_window_stacking
m.train(features, target)
pnls['ensemble_mbj_positive_long_window'] = m.pnls_['ensemble']

In [None]:
# Left panel: pnls; right panel: ensemble weights. The pnls are sliced from
# 1950-02-01 like the other ensemble plots in this notebook, so that the
# warm-up period (where the ensemble holds nothing) does not distort the
# comparison with the individual estimators.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
line(m.pnls_['1950-02-01':], cumsum=True, ax=ax[0], loc='best')
line(m.coef_, ax=ax[1], loc='best')

Putting the different ensembles together, we compare their pnls in the graph below. 

In [None]:
# Compare the individual estimators and every ensemble variant added to `pnls`
# above, from 1950-02-01 onwards.
line(pnls['1950-02-01':], cumsum=True)