# Training

In [None]:
import qlib
# Point qlib at the local data directory before any qlib API is used.
qlib.init(provider_uri='../data/qlib_day')

## Load the Data

In [None]:
# do not run this unless the first time, use the next cell to load data
import numpy as np
import pandas as pd
# One-off build step: align the pickled features with the forward-return
# label on a common index and persist the joined frame as parquet.
feature = pd.read_pickle('../data/intermediate/feature_info/feature_info.pkl')
# label = pd.read_parquet('../data/intermediate/forward_return_1d_close_close_normalized.parquet')
label = pd.read_parquet('../data/intermediate/forward_return_1d_close_close.parquet')


def _to_suffixed_code(code):
    """Keep the first six characters and append '.SZ' if *code* ends with 'XSHE', else '.SH'."""
    suffix = '.SZ' if code.endswith('XSHE') else '.SH'
    return code[:6] + suffix


# Rebuild the label index so its second level uses the suffixed codes
# (presumably to match the instrument codes of `feature` -- confirm).
label.index = pd.MultiIndex.from_arrays([
    label.index.get_level_values(0),
    label.index.get_level_values(1).map(_to_suffixed_code),
])

# Inner join keeps only rows present in both frames; the two column groups
# are exposed under the top-level keys 'feature' and 'label'.
data = pd.concat(
    [feature, label],
    axis=1,
    keys=['feature', 'label'],
    join='inner',
    sort=True,
)
# data.to_parquet('../data/intermediate/feature_info/normalized_dataset.parquet')
data.to_parquet('../data/intermediate/feature_info/dataset.parquet')

In [None]:
import pandas as pd
# Stand-alone label frame, renamed to a single 'label' column; it is read
# later as a notebook-level variable when scoring predictions.
label = pd.read_parquet('../data/intermediate/forward_return/1d_vwap_vwap.parquet')
label.columns = ['label']

# Pre-built feature/label dataset that feeds the data handler.
data = pd.read_parquet('../data/intermediate/feature_info/normalized_dataset.parquet')
# data = pd.read_parquet('../data/intermediate/feature_info/dataset.parquet')

## Descriptive Analysis

In [None]:
# feature.loc[:, feature.columns[0]].hist(bins=100)
# corr_ts = feature.groupby(level=0).corr()
# corr_ts_mean = corr_ts.groupby(level=1).mean()
# corr_ts_std = corr_ts.groupby(level=1).std()
# corr_t = corr_ts_mean / corr_ts_std
# corr_t.replace({np.inf: np.nan}, inplace=True)
# corr_t.loc[corr_t.columns].style.background_gradient(cmap='RdYlGn')
# from pandas.plotting import scatter_matrix

# scatter_matrix(feature.loc['2022-01-04', ['feature_56', 'feature_25']], hist_kwds={"bins": 100})

## Construct a DataHandler from Static DataLoader

In [None]:
import numpy as np
from sklearn.decomposition import PCA

from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.processor import Processor

class StaticHandler(DataHandlerLP):
    """``DataHandlerLP`` whose data comes from a ``StaticDataLoader``."""

    def __init__(
        self,
        data,
        instruments=None,
        start_time=None,
        end_time=None,
        drop_raw=False,
        **kwargs
    ):
        """Wrap *data* in a ``StaticDataLoader`` and initialise the handler.

        data: handed unchanged to ``StaticDataLoader`` as its ``config``
        instruments, start_time, end_time, drop_raw: forwarded as-is to
            ``DataHandlerLP.__init__``
        kwargs: any further keyword arguments for ``DataHandlerLP.__init__``
            (for example processor lists)
        """
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=StaticDataLoader(config=data),
            drop_raw=drop_raw,
            **kwargs
        )

class PCADecomp(Processor):
    """Processor that replaces the 'feature' columns by principal components.

    The projection is learned once in ``fit`` on the rows between
    ``fit_start_time`` and ``fit_end_time`` and is only *applied* in
    ``__call__``, so every frame passed through the processor is projected
    onto the same basis and no statistics of the transformed frame itself are
    used.
    """

    def __init__(self, fit_start_time, fit_end_time, explained_variance=0.95):
        """
        fit_start_time, fit_end_time: bounds of the ``df.loc[start:end]``
            slice used to fit the PCA
        explained_variance: float, cumulative explained-variance ratio the
            retained components must reach (default 0.95)
        """
        self.fit_start_time = fit_start_time
        self.fit_end_time = fit_end_time
        self.explained_variance = explained_variance
        self.n_comp = None
        self.pca = None

    def fit(self, df):
        """Fit the PCA on the fit window and choose the number of components."""
        df = df.loc[self.fit_start_time:self.fit_end_time]
        pca = PCA().fit(df['feature'])
        cum_ratio = np.cumsum(pca.explained_variance_ratio_)
        # Smallest number of components whose cumulative ratio reaches the
        # threshold; clipped because rounding can leave the last cumulative
        # value marginally below a threshold of 1.0.
        n_comp = int(np.searchsorted(cum_ratio, self.explained_variance)) + 1
        self.n_comp = min(n_comp, len(cum_ratio))
        self.pca = pca

    def __call__(self, df: pd.DataFrame):
        """Project ``df['feature']`` onto the fitted components.

        Returns a frame with the column groups 'feature' (columns
        ``new_feature_0 .. new_feature_{n_comp-1}``) and 'label'
        (``df['label']`` unchanged).

        Raises RuntimeError if ``fit`` has not been called.
        """
        if self.pca is None:
            raise RuntimeError("PCADecomp.fit must be called before the processor is applied")
        # Components are ordered by explained variance, so the leading
        # `n_comp` columns of the full projection are the ones to keep.
        projected = self.pca.transform(df['feature'])[:, :self.n_comp]
        result = pd.DataFrame(
            projected,
            index=df.index,
            columns=[f'new_feature_{i}' for i in range(self.n_comp)],
        )
        result = pd.concat(
            [result, df['label']],
            axis=1,
            keys=['feature', 'label'],
        )
        return result

In [None]:
# Build the handler directly from the loaded frame; the commented lines are
# alternative configurations (feature subset / PCA processor).
handler = StaticHandler(
    data=data,
    # data.loc[:, (slice(None), selected_features.to_list() + ['label'])],
    # infer_processors = [PCADecomp("2020-01-01", "2020-12-31")],
)

## Model Construction

### Construct Dataset on StaticHandler

In [None]:
from qlib.data.dataset import DatasetH

# Only the 'train' segment is active here; code that asks this dataset for a
# 'valid' or 'test' segment needs the corresponding entry re-enabled.
dataset = DatasetH(
    handler=handler,
    segments=dict(
        train=("2018-01-01", "2018-07-31"),
        # valid=("2018-08-01", "2018-09-30"),
        # test=("2018-10-01", "2018-12-31"),
    ),
)

In [None]:
# new_data = handler.fetch(data_key=handler.DK_L, col_set='feature')
# new_data_corr = new_data.groupby(level=0).corr()
# new_data_corr_mean = new_data_corr.groupby(level=1).mean()
# new_data_corr_std = new_data_corr.groupby(level=1).std()
# new_data_t = (new_data_corr_mean / new_data_corr_std).replace({np.inf: np.nan})
# new_data_t.loc[new_data_t.columns].style.background_gradient(cmap='RdYlGn')

### LightGBM Model

In [None]:
from qlib.contrib.model.gbdt import LGBModel

# Container handed to `fit` as `evals_result`.
evals_result = dict()

model = LGBModel(
    loss="mse",
    learning_rate=0.01,
    lambda_l1=0.01,
    lambda_l2=0.001,
    max_depth=5,
    num_leaves=100,
    feature_fraction=1,
    bagging_fraction=0.72,
    bagging_freq=10,
    min_data_in_leaf=100,
)
model.fit(dataset, num_boost_round=1000, evals_result=evals_result)

### Double Ensemble Model

In [None]:
import numpy as np
from qlib.contrib.model.double_ensemble import DEnsembleModel

# Double-ensemble model on the 'gbm' base learner with both sample
# reweighting (`enable_sr`) and feature selection (`enable_fs`) switched on.
model = DEnsembleModel(
    base_model='gbm',
    loss='mse',
    num_models=12,
    enable_sr=True,
    enable_fs=True,
    alpha1=1,
    alpha2=1,
    bins_sr=10,
    bins_fs=10,
    decay=0.5,
    # Ten ratios stepping down from 0.9 in steps of 0.1.
    sample_ratios=list(np.arange(0.9, -0.1, -0.1)),
    sub_weights=None,
    epochs=100,
)
model.fit(dataset)

### Neural Network Model

In [None]:
from qlib.contrib.model.pytorch_nn import DNNModelPytorch

# Fully connected network. The class imported in this cell is
# `DNNModelPytorch`; there is no `DNN` name in scope, so that is the class
# that has to be instantiated here.
model = DNNModelPytorch(
    lr=8e-2,
    lr_decay=0.3,
    lr_decay_steps=100,
    optimizer='adam',
    max_steps=4000,
    batch_size=500,
    GPU=0,
    weight_decay=4e-4,
    pt_model_kwargs={
        # Hard-coded input width; must equal the number of feature columns
        # in `dataset`.
        'input_dim': 85,
        'layers': (512, 512),
    },
)
model.fit(dataset)

### LSTM Model

In [None]:
from qlib.contrib.model.pytorch_lstm import LSTM

# LSTM baseline; `d_feat` is the hard-coded number of input features.
model = LSTM(
    d_feat=85,
    hidden_size=512,
    num_layers=2,
    dropout=0.95,
    n_epochs=20,
    lr=0.001,
    early_stop=3,
    batch_size=2000,
    loss='mse',
)
model.fit(dataset)

### ALSTM Model

In [None]:
from qlib.contrib.model.pytorch_alstm import ALSTM

# ALSTM baseline, keyword arguments grouped by purpose.
model = ALSTM(
    # network shape
    d_feat=85,
    hidden_size=512,
    num_layers=2,
    dropout=0.95,
    # optimisation
    optimizer='adam',
    loss='mse',
    lr=0.001,
    batch_size=2000,
    n_epochs=20,
    early_stop=20,
)
model.fit(dataset)

## Rolling Training

In [None]:
import qlib
from pathlib import Path
from qlib.data.dataset import DatasetH
from qlib.contrib.model.gbdt import LGBModel


class RollingTrain:
    """Rolling train / validate / predict driver on top of a data handler.

    NOTE: ``rolling`` reads two notebook-level names that are not passed in:
    ``pd`` (pandas) and ``label`` (a frame indexed like the predictions).
    Both must exist before ``rolling`` is called.
    """

    def __init__(
        self,
        handler, 
        min_days: int = 20,
        max_days: int = 40,
        valid_days: int = 10,
        pred_days: int = 5,
        exp_name: str = 'lgbm'
    ) -> None:
        """A class used for rolling training
        ------------------------------------

        handler: qlib.data.dataset.handler.DataHandlerLP,
            a data handler constructed by loader
        min_days: int, minimum days in training dataset
        max_days: int, maximum days in training dataset
        valid_days: int, the days contained in valid set
        pred_days: int, the days contained in predict set
        exp_name: str, sub-directory of ``../data/intermediate/results``
            that receives the output files (created if missing)
        """
        self.handler = handler
        self.min_days = min_days
        self.max_days = max_days
        self.valid_days = valid_days
        self.pred_days = pred_days
        self.exp_name = Path(f'../data/intermediate/results/{exp_name}')
        self.exp_name.mkdir(parents=True, exist_ok=True)

    def rolling(self, model, **kwargs):
        """This method rolls on the datahandler
        ---------------------------------------

        For every ``pred_days``-th position of the handler's datetime index
        the same ``model`` instance is re-fitted on the preceding train/valid
        windows, used to predict the test window, and the predictions joined
        with ``label`` are pickled into the experiment directory.

        model: a pre-initialized model instance, 
            and the fit, predict method should be implemented
        kwargs: other keyword arguments applies to model.fit method
        """
        datetime_index = self.handler.fetch(data_key='infer', col_set='label').index.levels[0]

        # Earliest position that can end a prediction window while leaving
        # `valid_days` for validation and `min_days` for training before it.
        first_pred_end = self.min_days + self.valid_days + self.pred_days - 1
        # The larger of `min_days` and `max_days` bounds the window length.
        train_window = max(self.min_days, self.max_days)

        for i in range(0, len(datetime_index), self.pred_days):
            if i < first_pred_end:
                continue
            # All windows are inclusive on both ends, hence the `+ 1` / `- 1`.
            pred_end_idx = i
            pred_start_idx = i - self.pred_days + 1
            valid_end_idx = pred_start_idx - 1
            valid_start_idx = valid_end_idx - self.valid_days + 1
            train_end_idx = valid_start_idx - 1
            # `+ 1` keeps the window at `train_window` days rather than
            # `train_window + 1`; clamped at the start of the index.
            train_start_idx = max(train_end_idx - train_window + 1, 0)

            dataset = DatasetH(handler=self.handler, segments={
                "train": (datetime_index[train_start_idx], datetime_index[train_end_idx]),
                "valid": (datetime_index[valid_start_idx], datetime_index[valid_end_idx]),
                "test": (datetime_index[pred_start_idx], datetime_index[pred_end_idx]),
            })

            model.fit(dataset, **kwargs)
            pred = model.predict(dataset, segment='test')
            # `label` is the notebook-level label frame (see class docstring).
            label_ = label.loc[pred.index]
            pred_label = pd.concat([pred, label_], axis=1)
            pred_label.columns = ['score', 'label']
            filename = "pred_label_{}_{}".format(
                datetime_index[pred_start_idx].strftime('%Y%m%d'), 
                datetime_index[pred_end_idx].strftime('%Y%m%d')
            )
            pred_label.to_pickle(self.exp_name.joinpath(filename))

### Rolling Training LGBM Model

In [None]:
# Rolling LightGBM run; `num_boost_round` is forwarded to `model.fit`.
RollingTrain(
    handler,
    min_days=100,
    max_days=120,
    valid_days=20,
    pred_days=60,
    exp_name='lgbm',
).rolling(
    model=LGBModel(
        loss="mse",
        learning_rate=0.01,
        lambda_l1=0.01,
        lambda_l2=0.01,
        max_depth=20,
        num_leaves=1024,
        feature_fraction=1,
        bagging_fraction=0.72,
        bagging_freq=10,
        min_data_in_leaf=100,
    ),
    num_boost_round=10000,
)

### Rolling Training Double Ensemble Model

In [None]:
from qlib.contrib.model.double_ensemble import DEnsembleModel

# Rolling double-ensemble run. An explicit `exp_name` keeps its output out
# of the default 'lgbm' results directory used by the LightGBM run.
RollingTrain(
    handler,
    min_days=100,
    max_days=120,
    valid_days=20,
    pred_days=10,
    exp_name='densemble',
).rolling(
    DEnsembleModel(
        base_model='gbm',
        loss='mse',
        num_models=12,
        enable_sr=True,
        enable_fs=True,
        alpha1=1,
        alpha2=1,
        bins_sr=10,
        bins_fs=10,
        decay=0.5,
        sample_ratios=list(np.arange(0.9, -0.1, -0.1)),
        sub_weights=None,
        epochs=1000,
        # model params
        early_stopping_round=50,
    )
)

### Rolling Training DNN Model

In [None]:
from qlib.contrib.model.pytorch_nn import DNNModelPytorch

# Rolling DNN run; results are written under the 'dnn' experiment directory.
RollingTrain(
    handler,
    min_days=100,
    max_days=120,
    valid_days=20,
    pred_days=60,
    exp_name='dnn',
).rolling(
    model=DNNModelPytorch(
        lr=8e-2,
        lr_decay=0.3,
        lr_decay_steps=100,
        optimizer='adam',
        max_steps=4000,
        batch_size=500,
        GPU=0,
        weight_decay=4e-4,
        pt_model_kwargs={
            'input_dim': 85,
            'layers': (512, 512),
        },
    )
)

### Rolling Training LSTM Model

In [None]:
from qlib.contrib.model.pytorch_lstm import LSTM

# Rolling LSTM run; results are written under the 'lstm' experiment directory.
RollingTrain(
    handler,
    min_days=100,
    max_days=120,
    valid_days=20,
    pred_days=60,
    exp_name='lstm',
).rolling(
    LSTM(
        d_feat=85,
        hidden_size=512,
        num_layers=2,
        dropout=0.95,
        n_epochs=20,
        lr=0.001,
        early_stop=3,
        batch_size=2000,
        loss='mse',
    )
)

### Rolling Training ALSTM Model

In [None]:
from qlib.contrib.model.pytorch_alstm import ALSTM

# Rolling ALSTM run; results are written under the 'alstm' experiment directory.
RollingTrain(
    handler,
    min_days=100,
    max_days=120,
    valid_days=20,
    pred_days=60,
    exp_name='alstm',
).rolling(
    ALSTM(
        d_feat=85,
        hidden_size=512,
        num_layers=2,
        dropout=0.95,
        n_epochs=20,
        lr=0.001,
        batch_size=2000,
        early_stop=20,
        loss='mse',
        optimizer='adam',
    )
)

## Parameters Tuning

### Defining an Objective Function

In [None]:
import optuna
from qlib.utils import init_instance_by_config

def objective_lgbm(trial):
    """Optuna objective for ``LGBModel``.

    Samples a hyper-parameter set from *trial*, fits the model on the
    notebook-level ``dataset``, predicts its 'test' segment and returns the
    mean of the prediction/label correlations computed per group of the
    first index level (presumably one group per date -- confirm).
    """
    lgbm_kwargs = {
        "loss": "mse",
        "max_depth": trial.suggest_int("max_depth", 1, 36, step=5),
        "num_leaves": trial.suggest_int("num_leaves", 128, 2048, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-3, 1e-2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-3, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 0.8),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 0.8),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 90, step=10),
    }
    model_config = {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": lgbm_kwargs,
    }

    history = {}
    booster = init_instance_by_config(model_config)
    booster.fit(dataset, evals_result=history)

    pred = booster.predict(dataset, segment='test')
    test_label = dataset.prepare(segments='test', col_set='label', data_key='infer').squeeze()

    # Per-group correlation matrices; the (first row, last column) entry of
    # each one is the prediction-vs-label correlation.
    daily_corr = pd.concat([pred, test_label], axis=1).groupby(level=0).corr()
    score_col = daily_corr.columns[0]
    label_col = daily_corr.columns[-1]
    return daily_corr.loc[(slice(None), score_col), label_col].mean()

def objective_mlp(trial):
    """Optuna objective for ``DNNModelPytorch``.

    Samples the network depth/widths and optimiser settings from *trial*,
    fits the model on the notebook-level ``dataset``, predicts its 'test'
    segment and returns the mean of the prediction/label correlations
    computed per group of the first index level.

    NOTE: the correlation is taken against the notebook-level ``label``
    frame, not against the label stored in ``dataset``.
    """
    nlayers = trial.suggest_int("nlayers", 1, 5)
    layers = [trial.suggest_int(f"layer_{i}", 256, 2048, log=True) for i in range(nlayers)]
    task = {
        "model": {
            "class": "qlib.contrib.model.pytorch_nn.DNNModelPytorch",
            "kwargs": {
                "lr": trial.suggest_float('lr', 1e-3, 1e-1),
                "lr_decay": trial.suggest_float('lr_decay', 1e-2, 9.99e-1),
                "lr_decay_steps": trial.suggest_int("lr_decay_steps", 100, 1000, step=100),
                "optimizer": 'adam',
                "max_steps": trial.suggest_int("max_steps", 100, 10000, log=True),
                "batch_size": trial.suggest_int("batch_size", 100, 10000, log=True),
                "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True),
                "pt_model_kwargs": {
                    # Hard-coded input width; must equal the number of
                    # feature columns in `dataset`.
                    'input_dim': 85,
                    'layers': layers,
                },
            },
        },
    }
    evals_result = dict()
    model = init_instance_by_config(task["model"])
    model.fit(dataset, evals_result=evals_result)
    pred = model.predict(dataset, segment='test')
    # Inner join: only rows that have a prediction can contribute to the
    # correlation, so the rest of `label` is dropped up front instead of
    # producing all-NaN correlation groups that the mean would skip anyway.
    pred_label = pd.concat([pred, label], axis=1, join='inner')
    ic = pred_label.groupby(level=0).corr()
    ic = ic.loc[(slice(None), ic.columns[0]), ic.columns[-1]].mean()
    return ic

# Study name; the tuning cells also derive the SQLite storage URL from it
# (f'sqlite:///{name}.db').
name = 'mlp'

In [None]:
# Create the study on the first run, or reattach to the one already stored
# in the SQLite file (`load_if_exists=True`) so tuning can be resumed.
# `create_study` already returns the study bound to that storage, so no
# second `optuna.Study(...)` construction is needed.
study = optuna.create_study(
    study_name=name,
    direction='maximize',
    storage=f'sqlite:///{name}.db',
    load_if_exists=True,
)
study.optimize(objective_mlp, n_trials=100)

### Result Analysis

After parameter tuning, the trial results and their parameters are stored in the SQLite database file.

In [None]:
import optuna
# Reattach to the stored study through the public loader rather than the
# `Study` constructor.
study = optuna.load_study(study_name=name, storage=f'sqlite:///{name}.db')

In [None]:
import optuna.visualization as optviz

# One diagnostic plot per run of this cell: uncomment the plot to show.
# The `params` names must be parameters that the loaded study sampled.
# optviz.plot_optimization_history(study)
# optviz.plot_parallel_coordinate(study)
# optviz.plot_contour(study, params=['max_depth', 'num_leaves'])
optviz.plot_slice(study, params=['batch_size'])
# optviz.plot_param_importances(study)

In [None]:
# Display the parameter set of the best trial in the loaded study.
study.best_params

## Model Fusion

In [None]:
# add an end-of-epoch evaluation, using the layered return as an indicator
# use the AdamW optimizer
# reduce data loss: use k-fold and merge the valid and test datasets into one
# using the open price might perform better
# CNN without pooling
# TabNet has better performance