# AE4350 - Bio-inspired Intelligence and Learning Assignment
This is the Main notebook for training and testing the DRL portfolio management system
<br>Created on Thu May 12 13:22:50 2022
<br>@author: Reinier Vos, 4663160-TUD

In [None]:
SCRIPT_VERSION = 17

# The helper module is imported either as a top-level module or from the
# AE4350_Assignment package. Only ImportError is treated as "try the other
# location"; any other exception raised while importing the helpers is a real
# defect and is allowed to surface instead of being silently discarded.
try:
    from utility_v17 import Agent, UtilFuncs, Statistics
except ImportError:
    try:
        from AE4350_Assignment.utility_v17 import Agent, UtilFuncs, Statistics
    except ImportError as import_err:
        # Keep the cell non-fatal (as before), but make the failure visible:
        # without these names the later cells fail with a NameError.
        print("WARNING: could not import utility_v17 from either location ({}); "
              "Agent, UtilFuncs and Statistics are not defined".format(import_err))
import sys
from tqdm import tqdm
from tqdm.notebook import trange
import os
import numpy as np
import pandas as pd
import json
import plotly.graph_objects as go
import tensorflow as tf
import plotly.graph_objects as pgo
import time

import logging

import matplotlib.pyplot as plt
import re

# One seed shared by NumPy and TensorFlow; it is also stored in trainer_dct.
SEED = 10
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Environment report: TensorFlow version, visible GPUs and the script version in use.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(f"\nATTENTION RUNNNING SCRIPT VERSION = {SCRIPT_VERSION}\n")

#### Attention:
<br> Please run all cells in order unless advised otherwise.
<br> This is the Google Colab-compatible version of the notebook; adjust the next cell to match your environment.

##### Step 1) adjust the booleans in the following cell according to the environment you are working in.

In [None]:
using_colab = False # whether you are running the notebook in Google Colab
pay = False # whether you are using a paid Colab version (i.e. own server)

# Colab-only setup; nothing below runs for a local environment.
if using_colab:
    from google.colab import files
    if not pay:
        # Google Drive is mounted only in this configuration; the training cell
        # uses shutil to copy the zipped results to /content/drive/MyDrive.
        from google.colab import drive
        import shutil
        drive.mount('/content/drive', force_remount=True)
    # IPython shell escape: clone the project repository.
    !git clone https://github.com/rwvosTUD/proj1.git

#### System configuration definition
##### Step 2) adjust the following configuration details according to your own experiment objective.
<br> Notice that certain settings are experimental and should not be changed unless you have a specific reason to do so.

In [None]:
checkpoint_dir = "TEST" # folder this run's results are saved to

# Network architecture settings; embedded in agent_dct under "model_hyper".
modelsHyper_dct = {"actor_ts_dLayers":[256, 256, 256, 128, 64], # layers and hidden units for the actor's time series input
                   "actor_util_dLayers":[], # layers and hidden units for the actor's portfolio input
                   "actor_comb_dLayers":[128, 128, 64, 64, 32], # layers and hidden units for the actor's combined layers after concatenation
                   "actor_regularizer":1e-14, # Actor L2 regularization magnitude
                   "critic_ts_dLayers":[256, 256, 256, 128, 64], # layers and hidden units for the critic's time series input
                   "critic_util_dLayers":[], # layers and hidden units for the critic's portfolio input
                   "critic_comb_dLayers":[128, 128, 64, 64, 32], # layers and hidden units for the critic's combined layers after concatenation
                   "critic_action_dLayers":[], # layers and hidden units for the critic's action probability inputs
                   "critic_final_dLayers":[128, 128, 64, 64, 32], # layers and hidden units for the critic's final concatenated inputs
                   "critic_regularizer":1e-14, # Critic L2 regularization magnitude
                   "use_batchNorm_tsdense":True, # Whether to use batch normalization for every layer in the actor and critic TS layers
                   "use_dropout_tsdense":True, # Whether to use dropout for every layer in the actor and critic TS layers
                   "ts_dropoutProb":0.2, # Dropout probability, default 0.2
}

# Agent settings; first argument of the Agent constructor.
agent_dct = {"stateTS_size":64, # lookback window size for the time series state
             "stateUT_size":6, # portfolio state size, do not adjust
             "batch_size":128,
             "buffer_size": 1000000, # replay buffer size
             "data_extraWindow":0, # EXPERIMENTAL, do not change
             "n_budget":1, # EXPERIMENTAL, do not change
             "is_terminal_threshold":1000, # EXPERIMENTAL, do not change
             "model_hyper": modelsHyper_dct, # network architecture defined above
             "train_tanh":80, # training input scaling factor
             "vali_tanh":104.26426426426426,# validation input scaling factor
             "test_tanh":130.1301301301301,# test input scaling factor
             "gamma":0.99, # discount factor for rewards
             "tau":0.001, # soft update parameter
             "mask_input":False, # EXPERIMENTAL, do not change
             "subset_training":True, # whether to enable the trainer subsystem
             "subset_window": 300, # trial size used when subset training
}

# Reward function settings; passed to the Agent constructor.
reward_dct = {"rewardType":7, # reward function to use; advisable to keep this at 7, which is the report's preferred function
              "penalty":0, # EXPERIMENTAL, do not change
              "hold_scale": 17, # exp scaling factor for hold penalty
              "trade_scale":14, # EXPERIMENTAL, do not change
              "trade_cost":0, # transaction cost for a trade
              "max_holds":100, # maximum holds after penalty is introduced, minimum hardcoded to 100!
              "prob_power":1, # EXPERIMENTAL, do not change
}
# Trainer (subset training) settings; passed to the Agent constructor.
trainer_dct = {"EXTRACASH" : reward_dct["trade_cost"], # takes its value from the reward function's trade cost
            "EXPAND" : 10, # expansion size
            "LAST" : 5, # amount of previous profits to consider for expansion timer
            "PROFITDIFF" : 200, # minimum profit to be observed before expansion
            "EXPAND_TIMER":  10, # how many runs have to exceed profitdiff threshold before expansion
            "TRADECOST_ACTUAL" : 3, # actual trade cost
             "START_OFFSET" : 300, # first subset window used by trainer
             "VALI_EC":0, # extracash amount for validation
             "SCRIPT_VERSION":SCRIPT_VERSION,
             "SEED":SEED,
}

##### Step 3) Initialize model and obtain data

In [None]:
# Pull the two sizes used throughout the notebook out of the agent configuration.
window_size = agent_dct["stateTS_size"]
data_extraWindow = agent_dct["data_extraWindow"]

# Training series plus the companion array returned by get_data.
data, data_extra = UtilFuncs.get_data(
    "traindata", data_extraWindow, SCRIPT_VERSION, colab=using_colab)
l = len(data) - 1  # last valid index of the training series

# The agent is initialised with the sample at index `window_size`.
agent = Agent(agent_dct, data[window_size], checkpoint_dir, reward_dct, trainer_dct)

# Separate statistics collectors for training, validation and reruns.
stats = Statistics(checkpoint_dir, training=True)
stats_val = Statistics(checkpoint_dir, training=False)
stats_rerun = Statistics(checkpoint_dir, training=False)

print(f"=== ATTENTION: running model for {agent_dct['n_budget']} stocks ===")
print(l)

if agent.subset_training:
    print("== Overview of first subset for training ==")
    first_subset_start = window_size + agent.START_OFFSET
    first_subset_end = agent.START_OFFSET + window_size + agent.subset_window
    plt.plot(data[first_subset_start:first_subset_end])
    plt.grid(True)

### Loading of models
##### Step 4) Load previous model's weights, replay buffer and other relevant histories
Skip this step if no model is to be loaded.

In [None]:
# Checkpoint to resume from: the run folder and the episode number to load.
load_episode = 0
load_dir = "runs/v17/content/v17_w80_p2"

# buffer=True asks load_models to load the replay buffer as well.
agent.load_models(load_dir, load_episode, buffer=True, using_colab=using_colab)

### Validation data loading & plotting
##### Step 5) Load in validation data and evaluate it visually


In [None]:
# Plot the training data first; plot_data's return value is kept for stats.reset_all.
growth_buyhold = UtilFuncs.plot_data(
    agent, data, data_extra, data_extraWindow, window_size, training=True)

# Load and plot the validation data the same way.
data_val, data_extra_val = UtilFuncs.get_data(
    "validationdata", data_extraWindow, SCRIPT_VERSION, colab=using_colab)
growth_buyhold_val = UtilFuncs.plot_data(
    agent, data_val, data_extra_val, data_extraWindow, window_size, training=False)

# After plotting, prefix the validation series with the last `window_size`
# training samples (presumably so the first validation step has a full
# lookback window - confirm against UtilFuncs.get_state).
training_tail = data[-window_size:]
data_val = np.append(training_tail, data_val)
l_val = len(data_val)-1

# Main training & validation loop
##### Step 6) Set last values before starting training

In [None]:
saveIter = 30 # every episode whose index is a non-zero multiple of this is an evaluation episode: results are saved and validation is run
start = 0 # episode index at which the training loop starts
episode_count = 10000 # episode index at which the training loop stops (exclusive upper bound of the loop)

##### Step 7) Train system and track results at bottom of cell
Please be advised that training is time-consuming: it can take more than 600 epochs before
out-of-sample performance becomes decent.

In [None]:
# Run-mode switches read by the training loop.
show_figs = False
debug = False
testing = False

# Per-run histories. The paired #''' markers are a toggle: turning both into
# ''' wraps the section in a string so the histories are not reset.
#''' # uncomment this when resetting a trial 
reward_lst, profitdiff_lst, expansions_lst = [], [], []
expand_i = 1

history = {
    metric: []
    for metric in (
        "training_profit",
        "training_pratio",
        "validation_profit",
        "validation_pratio",
        "validation_extraCash",
    )
}
#'''

# Trainer / termination state.
deadlock_on = False
timer = 0
expansion_size = 1  # starts at 1; the loop switches it to agent.EXPAND at the first expansion
use_terminateFunc = True
terminateFunc_on = False

# Exploration ("deadlock") probability schedule; same #''' toggle as above.
#'''
deadlock_probStart = 1/10  # exploratory probability hack for actions 1 & 2
decay = 0.90
decay_op = 2-decay
# Apply the decay `start` times so a run resumed at a later episode begins at the decayed value.
deadlock_prob = deadlock_probStart*(decay**(start))
switch_episode = 50
#'''

# Colab paths: the folder that gets zipped and the resulting archive.
tbzip_folder = f"/content/{checkpoint_dir}_z.zip"
zip_folder = f"/content/{checkpoint_dir}"

min_t = window_size
stats.reset_all(agent.n_budget*data[window_size], growth_buyhold)
stats_val.reset_all(agent.n_budget*data_val[window_size], growth_buyhold_val)

print("\n ATTENTION:")
print("SUBSET TRAINING = {0}\n".format(agent.subset_training))
prev_terminate = True


for e in range(start,episode_count):
    agent.is_eval = False # training!
    
    if e % saveIter == 0 and e != 0:
        episode_start = window_size
        episode_end = l
        agent.is_eval = True # over entire training set!
    else:
        ''' 
        SHUFFLING START 
        ''' 
        
        
        #deadlock_go = False
        if (e-start) > agent.LAST and agent.subset_training:
            profitdiff_mean = np.mean(profitdiff_lst[-agent.LAST:])

            if profitdiff_mean <= 0:
                # EXPONENTIALLY GROW
                deadlock_prob = min(deadlock_prob*decay_op,deadlock_probStart) 
                timer = 0 # reset
            elif profitdiff_mean > agent.PROFITDIFF:
                # DECAY
                deadlock_prob = max(deadlock_prob*decay,0.0001) 
                timer += 1
            else: 
                timer = 0  # reset 
                
            print("E{0} - Deadlock probability {1} | Timer = {2} | expansions = {3} | Prev mean = {4}".format(e,
                                                                         round(deadlock_prob,3), 
                                                                         timer,
                                                                         expand_i-1,
                                                                         profitdiff_mean))
            if timer >= agent.EXPAND_TIMER: 
                expand_i += 1 
                timer = 0
                expansion_size = agent.EXPAND
                #deadlock_go = True
                
        if agent.subset_training:
            utils_start = [l,agent.START_OFFSET, agent.subset_window]
            episode_start  = UtilFuncs.get_episodeStart(agent, expand_i, expansion_size, utils_start)
            episode_end = episode_start+agent.subset_window
            print("E{2} - Current window [{0},{1}]".format(episode_start,episode_end,e))
        else:
            # dont use subset training!
            episode_start = window_size
            episode_end = l
        
    pbar = tqdm(total=(episode_end-episode_start), position=0, leave=True, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} ') # progress bar
    
    agent.reset(data[episode_start])
    agent.balance += agent.EXTRACASH
    stats.reset_episode()
    stats.extraCash += agent.EXTRACASH 

    sold_price = 0
    bought_price = agent.inventory_value
    utils_state = [episode_end, stats.n_holds,stats.n_trades, agent.trade_cost, agent.train_tanh]
    state = UtilFuncs.get_state(agent, data, episode_start, window_size + 1, utils_state)
    
    done = False
    terminate = False
    # =============================== ITERATION ======================================================
    for t in range(episode_start,episode_end):

        utils_act = []
        action, action_prob = agent.take_action(state, utils_act)
        
        # DEADLOCK EXPERIMENT
        action = UtilFuncs.break_deadlock(agent,action,e,utils_act, on = deadlock_on) 
            
        # Hanle action
        flags = [use_terminateFunc, terminateFunc_on]
        utils_hdlAct = [action_prob]
        action, profit,  impossible, terminate, term_msg = UtilFuncs.handle_action(agent, stats, action, data, 
                                                                                   t, flags, utils_hdlAct, training = True)
        
        # terminate if required
        if terminate or t == episode_end-1:
            # terminating i.e. agent.LAST iteration 
            done = True # terminal state has no 'next' so change done variable before use in next lines
            
        # get reward
        if t < data_extraWindow:
            ptn = data_extra[t]
        else: 
            ptn = data[t-data_extraWindow]
        utils_reward = [data[t],data[t-1], data[t+1], action, action_prob[0], stats.n_trades, stats.n_holds, impossible, l, terminate]
        reward = agent.get_reward(agent, profit, utils_reward, done)
        stats.total_reward += reward
        
        # take step
        utils_state = [episode_end, stats.n_holds,stats.n_trades, agent.trade_cost, agent.train_tanh]
        next_state = UtilFuncs.get_state(agent, data, t + 1, window_size + 1, utils_state)
        if e % saveIter != 0 or e == 0: 
            actor_local_loss = agent.take_step(action_prob, reward, next_state, done)
        state = next_state
        
        if terminate:
            # ensure consistent length of arrays then terminate trial
            if t >= min_t:
                utils_pad = [l,t]
                stats.pad_on_terminate(utils_pad)    
            print("Episode {0} was terminated at {1}/{2} due to {3}".format(e,t-window_size,episode_end-episode_start, term_msg))
            break
            
        # collect 
        utils_saveIter = [profit, reward, actor_local_loss, action, t-episode_start]
        stats.collect_iteration(agent,utils_saveIter)
        
        pbar.set_description("Episode {0}| Portfolio: {1}| Balance: {2}| Inventory: {3}| RewardAcc: {4}".format(int(e),
                                                                                                                UtilFuncs.to_currency(agent.balance+agent.inventory_value),
                                                                                                                UtilFuncs.to_currency(agent.balance),
                                                                                                                UtilFuncs.to_currency(agent.inventory_value),
                                                                                                                UtilFuncs.to_currency(stats.total_reward)))
        pbar.update()
        
    # ============================ END ITERATION =================================================
    print("E{3} - Amount of +trades {0}, all trades {1} | ratio = {2}".format(stats.n_posiProfits, 
                                                                              stats.n_trades, 
                                                                              round(stats.n_posiProfits/max(1,stats.n_trades),3), 
                                                                              e))

    profitBuyhold = data[episode_end-1]-data[episode_start]
    profitRL = agent.balance+agent.inventory_value-data[episode_start]-agent.TRADECOST_ACTUAL*stats.n_trades-stats.extraCash
    profitdiff = profitRL-profitBuyhold
    print("E{3} - RL profit = {0} | Buyhold = {1} | diff = {2} | tradecost = {4}".format(round(profitRL,2),
                                                                round(profitBuyhold,2),
                                                                round(profitdiff,2),
                                                                e,
                                                                #agent.trade_cost*stats.n_trades,
                                                                agent.TRADECOST_ACTUAL*stats.n_trades,
                                                                ))
    print("E{0} - impossibles {1}/{2} = {3}".format(e, stats.n_impossible,
                                             stats.n_1or2,
                                             round(stats.n_impossible/stats.n_1or2,3)))
    reward_lst.append(stats.total_reward)
    profitdiff_lst.append(profitdiff) 
    expansions_lst.append(expand_i)

    if max(e-1,0) % saveIter == 0 and e != 0 and e != 1 and not debug:
        if not prev_terminate:
            # we skip one iteration as we assume the zip file creation is done now
            if using_colab and not pay: 
                shutil.copy(tbzip_folder, "/content/drive/MyDrive") # save result to drive
    if e % saveIter != 0 or e == 0: 
        continue
        
    if True: #not terminate:
        stats.collect_episode(agent,e, [])
        history["training_profit"].append(stats.compete[-1])
        history["training_pratio"].append(stats.n_posiProfits/max(1,stats.n_trades))
        
    if e % saveIter == 0 and e != 0: 
        if not debug:
            # plot and save
            agent.save_models(e)
            utils_fig = [l, window_size]
            stats.plot_figure(data, e, utils_fig, show_figs = show_figs)
            stats.save_statistics(e)
            extraHistory = {}
            extraHistory["reward_lst"] = reward_lst
            extraHistory["profitdiff_lst"] = profitdiff_lst
            extraHistory["expansions_lst"] = expansions_lst 

            with open(f'./{checkpoint_dir}/EXTRAhistory.json', 'w') as fp:
                json.dump(extraHistory, fp)
    
            # ================ VALIDATION LOOP ===============================
            stats_val.reset_episode()
            stats_val.extraCash += agent.VALI_EC
            agent.is_eval = True
            agent.reset(data_val[window_size])
            agent.balance += agent.VALI_EC
            for t in trange(window_size,l_val):
                utils_state = [l_val, stats_val.n_holds,stats_val.n_trades, agent.trade_cost, agent.vali_tanh]
                state = UtilFuncs.get_state(agent, data_val, t, window_size + 1, utils_state)
                utils_act = [deadlock_prob,data_val[t]]
                action, action_prob = agent.take_action(state, utils_act)

                flags = [use_terminateFunc, terminateFunc_on]
                utils_hdlAct = [action_prob]
                action, profit,  impossible, _, _ = UtilFuncs.handle_action(agent, stats_val, action, data_val, 
                                                                                           t, flags, utils_hdlAct, training = False)
                utils_saveIter = [profit, 0., 0., action, t-window_size]
                stats_val.collect_iteration(agent,utils_saveIter)

            stats_val.collect_episode(agent, e, [])
            print("Final validation profit = {0} | extra cash = {1} | +/all trades= {2}/{3}".format(round(stats_val.compete[-1],2),
                                                                            round(stats_val.extraCash,2),
                                                                              stats_val.n_posiProfits,
                                                                               stats_val.n_trades,))
            utils_fig = [l_val, window_size]
            stats_val.plot_figure(data_val, e, utils_fig, show_figs = show_figs)
            history["validation_profit"].append(stats_val.compete[-1])
            history["validation_pratio"].append(stats_val.n_posiProfits/max(1,stats_val.n_trades))
            history["validation_extraCash"].append(stats_val.extraCash)
            # ============= END VALIDATION LOOP =============================
            with open(f'./{checkpoint_dir}/e{e}/history.json', 'w') as fp:
                json.dump(history, fp)
            
            
        if using_colab and not debug:
            !zip -r "$tbzip_folder" "$zip_folder"
            print("Zip file created, saved next run")
        
    if terminate:
        prev_terminate = True
    else:
        prev_terminate = False
