A series of Qlib-based kernels. You are welcome to explore more details in [Qlib](http://github.com/microsoft/qlib)
- [A Different EDA based on Qlib\[EN/中文\]](https://www.kaggle.com/youngyang/a-different-eda-based-on-qlib-en/)
- [A Naive Qlib Example\[EN/中文\]](https://www.kaggle.com/youngyang/qlibnaiveexample-en/)

This is a very naive example of Qlib. The score is not high; the aim is to demonstrate a short and easy way to use Qlib.
You are welcome to explore more Quant ML models implemented in Qlib.

这是一个非常初级的Qlib样例。 分数不高， 它的主要目的是为了展示一个简单的基于Qlib做预测的样例。
欢迎基于这个样例来探索在Qlib中的更多基于 ML 的Quant模型

# Set Qlib Env

In [None]:
import sys
# Put the packages directory of the attached input dataset at the front of the
# import path so it is searched before any other location
# (presumably this is where the Qlib build used by this notebook lives).
sys.path.insert(0, '../input/qlib-dev-w/packages')

In [None]:
import gc
from typing import Union
import qlib
from qlib.workflow import R
import numpy as np
import pandas as pd
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset import DatasetH
from qlib.contrib.model.pytorch_nn import DNNModelPytorch
from sklearn.model_selection import GroupKFold
# Initialise Qlib; no arguments are passed, so whatever defaults Qlib ships with apply.
qlib.init()

# Read Data

In [None]:
def read_data(path: Union[str, pd.DataFrame]="../input/train.pkl", proc_type='train'):
    """Read data and turn it into Qlib's format.

    Parameters
    ----------
    path : str or pd.DataFrame
        Either the path of a pickled DataFrame or an already loaded
        DataFrame. A DataFrame passed in is NOT modified.
    proc_type : str
        ``"train"``: the frame already carries a ``time_id`` column.
        ``"test"``: the frame carries a ``row_id`` column instead; the text
        before the first ``"_"`` of each ``row_id`` is parsed as the integer
        ``time_id`` and ``row_id`` itself is dropped.

    Returns
    -------
    pd.DataFrame
        A float32 frame indexed by ``("datetime", "instrument")`` (taken from
        ``time_id`` / ``investment_id``) whose columns are a two-level
        MultiIndex: ``("label", "target")`` for the ``target`` column and
        ``("feature", <name>)`` for every other column.

    Raises
    ------
    ValueError
        If ``proc_type`` is neither ``"train"`` nor ``"test"``.
    """
    if proc_type not in ("train", "test"):
        raise ValueError(f"proc_type must be 'train' or 'test', got {proc_type!r}")

    # NOTE: unpickling executes arbitrary code; only load pickle files from a trusted source.
    df = pd.read_pickle(path) if isinstance(path, str) else path

    if proc_type == "test":  # The format of test data and training data is different
        time_id = df["row_id"].str.split("_", n=1).str[0].astype("int64")
        # Work on the local frame only (the previous version deleted the column from a
        # global named `test_df`), and build a new frame so the caller's data is untouched.
        df = df.drop(columns="row_id").assign(time_id=time_id)

    df = df.set_index(["time_id", "investment_id"])
    df.columns = pd.MultiIndex.from_tuples(
        [("label" if col == "target" else "feature", col) for col in df.columns]
    )
    df.index.names = ["datetime", "instrument"]  # Qlib's processors requires datetime
    df = df.astype(np.float32)  # for supporting unstack operation
    return df
# Load the training pickle and convert it with read_data (see its docstring for the resulting layout).
data_df = read_data("../input/ubiquant-market-prediction-half-precision-pickle/train.pkl")

# Datahandler
dh = DataHandlerLP(data_loader=StaticDataLoader(data_df), drop_raw=True)   # data processing is normally implemented in data handler
# Drop this cell's reference to the raw frame and the handler's reference to its loader.
# Presumably done to reduce memory usage; assumes the handler keeps its own processed
# data after construction — TODO confirm. A new loader is attached again at inference time.
del data_df
del dh.data_loader

# Train model

In [None]:
def split_kfold(idx, fold=5):
    """Split a (datetime, instrument) index into grouped cross-validation folds.

    The values of the ``"datetime"`` index level are passed to ``GroupKFold``
    as the groups.

    Parameters
    ----------
    idx : pd.MultiIndex
        Index with a level named ``"datetime"``.
    fold : int
        Number of splits handed to ``GroupKFold``.

    Returns
    -------
    list of dict
        One ``{"train": <index>, "valid": <index>}`` entry per fold, each
        value being the subset of ``idx`` selected for that role.
    """
    splitter = GroupKFold(n_splits=fold)
    groups = idx.get_level_values("datetime")
    return [
        {"train": idx[train_pos], "valid": idx[valid_pos]}
        for train_pos, valid_pos in splitter.split(idx, groups=groups)
    ]

# Only the index of what the handler returns is kept; the fetched object itself is not stored.
idx = dh.fetch().index
cv_index = split_kfold(idx)  # create cross validation based on cv_index
# The full index is not needed once the per-fold indices exist; release it explicitly.
del idx
gc.collect()

In [None]:
# Open an experiment on Qlib's recorder `R`; it is closed by R.end_exp() at the end of this cell.
# NOTE(review): there is no try/finally, so R.end_exp() is not reached if training raises.
R.start_exp()

# Keyword arguments passed unchanged to every DNNModelPytorch created below.
kwargs = {
    "lr": 0.002,
    "optimizer": "adam",
    "max_steps": 8000,
    "batch_size": 8192,
    "pt_model_kwargs": {
        'input_dim': 300,  # NOTE(review): presumably the number of feature columns — confirm against the data
        'layers': (256, )
    },
    "scheduler": None,
}
R.log_params(**kwargs)  # save params in experiment manager

# Train one model per cross-validation fold; every fold reuses the same handler `dh`
# and differs only in the {"train", "valid"} segments taken from cv_index.
cv_models = []
for seg in cv_index:
    ds = DatasetH(handler=dh, segments=seg)
    m = DNNModelPytorch(**kwargs)
    m.fit(ds)
    cv_models.append(m)

R.save_objects(**{"cv_models.pkl": cv_models, "handler.pkl": dh})   # save the models and data processors
R.end_exp()

# Inference

In [None]:
# Test
import ubiquant

# Create the competition environment and its test iterator; the loop below consumes
# iter_test and hands each filled-in prediction frame back through env.predict.
env = ubiquant.make_env()
iter_test = env.iter_test()

def get_avg(preds):
    """Return the arithmetic mean of the items in ``preds``.

    ``preds`` is any sized collection whose items support ``+`` (starting
    from 0) and division by an int, e.g. numbers or NumPy arrays of equal
    shape. An empty collection raises ``ZeroDivisionError``.
    """
    total = sum(preds)
    count = len(preds)
    return total / count

for test_df, sample_prediction_df in iter_test:
    # Reuse the training data handler for inference: attach a loader holding the
    # converted test batch (the original loader was deleted after training) ...
    dh.data_loader = StaticDataLoader(read_data(test_df, proc_type="test"))
    # ... run the handler's processing on it ...
    dh.setup_data(init_type=dh.IT_LS)
    # ... and expose the whole batch as a single "test" segment.
    ds = DatasetH(handler=dh, segments={"test": slice(None)})

    # NN: average the predictions of all cross-validation models.
    # NOTE(review): `.values` discards whatever index predict() returns, so the
    # assignment to "target" below is positional — confirm the row order matches
    # sample_prediction_df.
    preds_nn = [m.predict(ds).values for m in cv_models]
    preds = [get_avg(preds_nn)]
    # TODO: You may try more models

    # Ensemble across model families (only the NN family for now) and submit.
    sample_prediction_df.loc[:, "target"] = get_avg(preds)
    env.predict(sample_prediction_df)