In [None]:
import warnings
warnings.filterwarnings("ignore") # suppress h5py deprecation warning

import os
import backtrader as bt
import numpy as np

from btgym import BTgymEnv, BTgymDataset
from btgym.strategy.observers import Reward, Position, NormPnL
from btgym.algorithms import Launcher, Unreal, AacStackedRL2Policy
from btgym.research.strategy_gen_4 import DevStrat_4_11

### Stacked LSTM Agent usage example.

Based on NAV_A3C+D from the paper ["LEARNING TO NAVIGATE IN COMPLEX ENVIRONMENTS"](https://arxiv.org/pdf/1611.03673.pdf) by Mirowski et al.

Modifications to the original paper architecture:
- split Policy/Value outputs: Policy is taken off the first LSTM layer, Value off the second;
- LSTM state initialisation: the first RNN layer context (Policy) is initialised at every episode start, while the second (Value) is reset either at the beginning of every Trial (future work) or every N episodes, with N constant (60 for this example); motivated by the RL^2 approach of Duan et al.,
  ["FAST REINFORCEMENT LEARNING VIA SLOW REINFORCEMENT LEARNING"](https://arxiv.org/pdf/1611.02779.pdf);
- inner/external observation state split: the external (market) state is encoded via convolution layers and fed to the first LSTM layer; the inner (broker) state is fed into the second LSTM layer and can optionally be encoded via a separate convolution block (doesn't seem to improve much, though);
- optional Value Replay loss (`Unreal` feature) improves sample efficiency, but is computationally expensive;

Other details:
- All convolution and LSTM layers are layer-normalized, see 
  ["Layer Normalisation"](https://arxiv.org/abs/1607.06450) paper by Jimmy Ba et al.;
  
- Upd 2.02.18: linear layers are Noisy-Net ones, see the paper [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295) by Fortunato et al.; policy output is centered using layer normalisation;
  added linearly decayed state scaling;

- A3C option `time_flat` is ON by default, improves training stability, reduces computation costs, see 
  [Base_AAC class Note](https://kismuz.github.io/btgym/btgym.algorithms.html#module-btgym.algorithms.aac) for       details;
  
Diagram: https://kismuz.github.io/btgym/_images/a3c_stacked_lstm_agent.png

**NOTE:**
Currently it takes ~20M environment steps to fit the 6-month data set of 1-minute bars; training is much faster on smaller data sets.

In [None]:
# Backtesting engine: a backtrader Cerebro instance that hosts the strategy and its observers.
MyCerebro = bt.Cerebro()

# Strategy and broker-account parameters, gathered in one mapping and unpacked into `addstrategy`.
strategy_params = dict(
    start_cash=2000,  # initial broker cash
    commission=0.0001,  # commission to imitate spread
    leverage=10.0,
    order_size=2000,  # fixed stake, mind leverage
    drawdown_call=10,  # max % to lose, in percent of initial cash
    target_call=10,  # max % to win, same units
    skip_frame=10,
    gamma=0.99,
    reward_scale=7,  # gradient's nitrox, touch with care!
    state_ext_scale=np.linspace(3e3, 1e3, num=5),
)
MyCerebro.addstrategy(DevStrat_4_11, **strategy_params)

# Observers providing visualisations of reward, position and PnL dynamics:
for observer_cls in (Reward, Position, NormPnL):
    MyCerebro.addobserver(observer_cls)

# Data: monthly files of 1-minute EURUSD bars, January through June 2017.
# To train on them, pass this list as `filename` to BTgymDataset instead of the sine-wave file.
data_m1_6_month = [
    './data/DAT_ASCII_EURUSD_M1_2017{:02d}.csv'.format(month)
    for month in range(1, 7)
]

# Data source: keep exactly one of the two assignments below active.
dataset_file = './data/test_sine_1min_period256_delta0002.csv'  # simple sine
# dataset_file = data_m1_6_month  # list of monthly 1-minute bar files

MyDataset = BTgymDataset(
    filename=dataset_file,
    start_weekdays=set(range(7)),  # all seven weekday indices, 0 through 6
    episode_duration=dict(days=1, hours=23, minutes=40),  # note: 2day-long episode
    start_00=False,
    time_gap=dict(hours=10),
)

# Environment specification handed to the Launcher: the environment class
# together with the keyword arguments supplied for it.
env_config = {
    'class_ref': BTgymEnv,
    'kwargs': {
        # Data and backtesting engine defined earlier in this cell:
        'dataset': MyDataset,
        'engine': MyCerebro,
        # Rendering options:
        'render_modes': ['episode', 'human', 'internal'],  # 'external' is left disabled
        'render_state_as_image': True,
        'render_ylabel': 'OHL_diff. / Internals',
        'render_size_episode': (12, 8),
        'render_size_human': (9, 4),
        'render_size_state': (11, 3),
        'render_dpi': 75,
        # Network ports and connection timeout:
        'port': 5000,
        'data_port': 4999,
        'connect_timeout': 90,
        'verbose': 0,
    },
}

# Cluster layout handed to the Launcher: address, process counts and checkpoint locations.
cluster_config = {
    'host': '127.0.0.1',
    'port': 12230,
    'num_workers': 4,  # set according to the number of CPUs available, or so
    'num_ps': 1,
    'num_envs': 1,
    # Current checkpoints and summaries are written here:
    'log_dir': os.path.expanduser('~/tmp/test_4_11'),
    # A pre-trained model is loaded from here, if a checkpoint is found:
    'initial_ckpt_dir': os.path.expanduser('~/tmp/pre_trained_model/test_4_11'),
}

# Policy specification handed to the Launcher: the policy class and its keyword arguments.
policy_config = {
    'class_ref': AacStackedRL2Policy,
    'kwargs': dict(
        lstm_layers=(256, 256),  # sizes of the two stacked LSTM layers
        lstm_2_init_period=60,
    ),
}

# Trainer specification handed to the Launcher: the trainer class and its keyword arguments.
trainer_config = {
    'class_ref': Unreal,
    'kwargs': {
        # Optimiser learning-rate schedule:
        'opt_learn_rate': [1e-4, 1e-4],  # random log-uniform
        'opt_end_learn_rate': 1e-5,
        'opt_decay_steps': 50 * 10 ** 6,
        # Model hyperparameters:
        'model_gamma': 0.99,
        'model_gae_lambda': 1.0,
        'model_beta': 0.05,  # entropy reg
        'rollout_length': 20,
        'time_flat': True,
        'use_value_replay': False,
        # Summary and rendering frequencies:
        'model_summary_freq': 10,
        'episode_summary_freq': 1,
        'env_render_freq': 2,
    },
}

In [None]:
# Launcher settings: the four configs defined in the previous cell plus run-level options.
launcher_settings = dict(
    cluster_config=cluster_config,
    env_config=env_config,
    trainer_config=trainer_config,
    policy_config=policy_config,
    test_mode=False,
    max_env_steps=10 ** 8,  # 100M environment steps
    save_secs=300,  # save checkpoint every N seconds (default is 600)
    root_random_seed=0,
    purge_previous=1,  # ask to override previously saved model and logs
    verbose=0,
)
launcher = Launcher(**launcher_settings)

# Train it:
launcher.run()

In [None]:
# Save, restore or resume:
#
# Use the launcher.export_checkpoint() method to save the most recent trained model parameters
# to an external directory; it can then be loaded as a pre-trained model for the next run via
# the cluster_config -> initial_ckpt_dir arg (see above).
#
# Notes:
# 1. when loading a pre-trained model, training starts at global_step=0, unlike restoring
#    from the current checkpoint, where training resumes from the last saved global_step value;
# 2. answering Yes to the Launcher's `Override[y/n]?` prompt affects log_dir content only;
# 3. the launcher takes a 'save_secs' arg specifying how often checkpoints are written;
#    the default value is 600;
# 4. exporting a checkpoint overrides the content of the destination folder.
#
# Launcher starting routine:
# 1. if initial_ckpt_dir is given, try to load the pre-trained model and start at step=0 on success;
# 2. if that fails, look for a routinely saved checkpoint and, if one is found, resume training
#    at the step stored in it;
# 3. if that fails too, start training from scratch.

pretrained_model_dir = os.path.expanduser('~/tmp/pre_trained_model/test_4_11')
launcher.export_checkpoint(pretrained_model_dir)