In [1]:
import importlib
import numpy as np
import config.config as config
from Environment import Env
from Agent_LSTM import *
from pathlib import Path

import sys; sys.path.append('../analysis/')
from my_utils import reset_seeds

In [2]:
def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN for an empty sequence.

    ``np.mean([])`` also yields NaN but emits a RuntimeWarning; this helper
    gives the same value silently so empty log windows do not spam warnings.
    """
    if len(values) == 0:
        return np.float64('nan')
    return np.mean(values)


def training(datapath, seed_number, Actor, Critic,
             expnoise_std=0.8, TOTAL_EPISODE=5e4, value_noise_std=0):
    """Train an actor-critic agent in ``Env`` and periodically checkpoint it.

    The configuration is built with ``config.ConfigGain(datapath)``, stamped
    with ``seed_number`` and saved via ``arg.save()``. Every episode runs
    ``arg.EPISODE_LEN`` environment steps and may contain several trials: a
    trial ends once ``env.is_stop`` is true after some action component has
    exceeded ``arg.TERMINAL_ACTION`` in magnitude during that trial. The whole
    episode trajectory (and a mirrored copy from ``agent.mirror_traj``) is
    pushed to ``agent.memory``.

    Training starts in a "pre-phase" in which, on roughly 5% of steps, the
    selected action is replaced by zeros. Every 100 episodes the agent is
    saved and statistics are printed; when the reward rate of that window
    exceeds 0.2 the pre-phase ends and the episode counter restarts from 0.
    Outside the pre-phase the exploration noise is reset to std 0.5 at each
    log. The loop stops when ``episode`` reaches ``TOTAL_EPISODE`` or when the
    rewarded fraction of a log window exceeds 0.85.

    Args:
        datapath: Passed to ``config.ConfigGain``.
        seed_number: Stored as ``arg.SEED_NUMBER`` and passed to ``reset_seeds``.
        Actor: Actor class handed to ``Agent``.
        Critic: Critic class handed to ``Agent``.
        expnoise_std: Initial std of the ``ActionNoise`` exploration noise.
        TOTAL_EPISODE: Upper bound on the episode counter.
        value_noise_std: Assigned to ``agent.actor.value_noise_std``.

    Returns:
        None. Results are persisted through ``agent.save`` and printed.
    """
    # get configures
    arg = config.ConfigGain(datapath)
    arg.SEED_NUMBER = seed_number
    arg.save()

    # reproducibility
    reset_seeds(arg.SEED_NUMBER)

    # initialize environment and agent
    env = Env(arg)
    agent = Agent(arg, Actor, Critic)
    agent.actor.value_noise_std = value_noise_std

    # define exploration noise
    noise = ActionNoise(arg.ACTION_DIM, mean=0, std=expnoise_std)

    # Schedule constants.
    LOG_FREQ = 100                           # episodes between checkpoints / log lines
    REPLAY_PERIOD = 4                        # environment steps between agent.learn() calls
    PRE_LEARN_PERIOD = arg.BATCH_SIZE * 50   # memory size that must be exceeded before learning
    RANDOM_STOP_THRESHOLD = 0.95             # a uniform draw above this zeroes the action
    PRE_PHASE_EXIT_REWARD_RATE = 0.2         # reward rate that ends the pre-phase
    POST_PRE_PHASE_NOISE_STD = 0.5           # exploration std used after the pre-phase
    EARLY_STOP_REWARDED_FRACTION = 0.85      # rewarded fraction that ends training

    def start_trial(prev_action):
        """Reset the environment and observation model; build the first state of a trial."""
        x0 = env.reset()
        agent.bstep.reset(env.pro_gains)
        state0 = torch.cat([x0[-arg.OBS_DIM:].view(1, 1, -1), prev_action,
                            env.target_position_obs.view(1, 1, -1),
                            torch.zeros(1, 1, 1)], dim=2).to(arg.device)
        return x0, state0

    # Running statistics for the current log window.
    tot_t = 0
    episode = agent.initial_episode
    reward_log = []
    rewarded_trial_log = []
    step_log = []
    dist_log = []
    actor_loss_log = 0
    critic_loss_log = 0
    num_update = 1e-5  # tiny non-zero start so the loss averages never divide by zero

    random_stop = True
    pre_phase = True

    # Start loop
    while episode < TOTAL_EPISODE:
        # initialize the first trial of the episode
        cross_start_threshold = False
        last_action = torch.zeros(1, 1, arg.ACTION_DIM)
        x, state = start_trial(last_action)

        hiddenin = None
        tend = 0  # step index (within the episode) at which the current trial started

        states = []
        actions = []
        rewards = []
        dones = []

        for t in range(arg.EPISODE_LEN):
            # 1. Check start threshold.
            if not cross_start_threshold and (last_action.abs() > arg.TERMINAL_ACTION).any():
                cross_start_threshold = True

            # 2. Take an action based on current state
            # and previous hidden & cell states of LSTM units.
            action, _, hiddenout = agent.select_action(state, hiddenin, action_noise=noise)
            if random_stop and np.random.rand() > RANDOM_STOP_THRESHOLD:
                action = torch.zeros_like(action)

            # 3. Track next x in the environment.
            next_x, reached_target, relative_dist = env(x, action, t - tend)

            # 4. Next observation given next x; the last feature is the
            # number of steps elapsed in the current trial.
            next_ox = agent.bstep(next_x)
            next_state = torch.cat([next_ox.view(1, 1, -1), action,
                                    env.target_position_obs.view(1, 1, -1),
                                    torch.ones(1, 1, 1) * (t - tend + 1)], dim=2).to(arg.device)

            # 5. Check whether stop.
            is_stop = env.is_stop(x, action)
            trial_done = is_stop and cross_start_threshold

            # 6. Give reward if stopped.
            if trial_done:
                reward = env.return_reward(x, reward_mode='mixed')
                done = torch.ones(1, 1, 1)
            else:
                reward = torch.zeros(1, 1, 1)
                done = torch.zeros(1, 1, 1)

            # 7. Append data.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

            # 8. Update timestep.
            last_action = action
            state = next_state
            x = next_x
            hiddenin = hiddenout
            tot_t += 1

            # 9. Update model.
            if len(agent.memory.memory) > PRE_LEARN_PERIOD and tot_t % REPLAY_PERIOD == 0:
                actor_loss, critic_loss = agent.learn()
                actor_loss_log += actor_loss
                critic_loss_log += critic_loss
                num_update += 1

            # 10. If the trial ended, log it and start a new trial within
            # the same episode. The LSTM hidden state is not reset here, and
            # the new initial state carries the action that ended the trial.
            if trial_done:
                step_log.append(t + 1 - tend)
                reward_log.append(reward.item())
                rewarded_trial_log.append(int(reached_target & is_stop))
                dist_log.append(relative_dist.item())

                cross_start_threshold = False
                x, state = start_trial(last_action)
                tend = t + 1

        # store the last state
        states.append(state)
        # End of one episode, store trajectory into buffer.
        states = torch.cat(states)
        actions = torch.cat(actions).to(arg.device)
        rewards = torch.cat(rewards).to(arg.device)
        dones = torch.cat(dones).to(arg.device)
        agent.memory.push(states, actions, rewards, dones)

        # store mirrored trajectories reflected along y-axis
        agent.memory.push(*agent.mirror_traj(states, actions), rewards, dones)

        if episode % LOG_FREQ == LOG_FREQ - 1:
            # save
            agent.save(save_memory=False, episode=episode, pre_phase=pre_phase, full_param=True)

            # Window statistics; NaN (not a warning) when no trial finished.
            total_steps = np.sum(step_log)
            if total_steps > 0:
                reward_rate = np.sum(reward_log) / total_steps
            else:
                reward_rate = np.float64('nan')
            rewarded_fraction = _mean_or_nan(rewarded_trial_log)

            print(f"t: {tot_t}, Ep: {episode}, action std: {noise.std:0.2f}")
            # NOTE(review): the actor loss is averaged over num_update/2 --
            # presumably the actor is updated on every other learn() call; confirm in Agent.
            print(f"mean steps: {_mean_or_nan(step_log):0.3f}, "
                  f"mean reward: {_mean_or_nan(reward_log):0.3f}, "
                  f"reward rate: {reward_rate:0.3f}, "
                  f"rewarded fraction: {rewarded_fraction:0.3f}, "
                  f"relative distance: {_mean_or_nan(dist_log) * arg.LINEAR_SCALE:0.3f}, "
                  f"critic loss: {critic_loss_log / num_update:0.3f}, "
                  f"actor loss: {-actor_loss_log / (num_update/2):0.3f}")

            # Leave the pre-phase once the agent collects reward fast enough;
            # the episode counter restarts so TOTAL_EPISODE counts post-pre-phase episodes.
            if pre_phase and reward_rate > PRE_PHASE_EXIT_REWARD_RATE:
                random_stop = False
                pre_phase = False
                episode = 0

            if not pre_phase:
                noise.reset(mean=0, std=POST_PRE_PHASE_NOISE_STD)

            if rewarded_fraction > EARLY_STOP_REWARDED_FRACTION:
                break

            # start a fresh log window
            reward_log = []
            rewarded_trial_log = []
            step_log = []
            dist_log = []
            actor_loss_log = 0
            critic_loss_log = 0
            num_update = 1e-5

        episode += 1

In [4]:
# --- Experiment configuration -------------------------------------------
# Root folder under which each agent's data path is built.
folder_path = Path('D:/quitting_data/agents')

# Module names to train; the two lists are paired element-wise, and each
# pair gets its own list of seeds at the same position in `seeds`.
actors, critics = ['Actor_novalue'], ['Critic']
seeds = [[21, 22]]

# Hyper-parameters forwarded to training().
TOTAL_EPISODE = 1e4
expnoise_std = 0.8
value_noise_std = 0

In [5]:
# Train one agent per (actor module, critic module, seed) combination.
for actor, critic, seed_ in zip(actors, critics, seeds):
    # Resolve the classes explicitly instead of exec-ing `from <module> import *`:
    # no dynamically built code is executed, and names left over from a
    # previously imported module cannot silently shadow notebook globals.
    Actor = importlib.import_module(actor).Actor
    Critic = importlib.import_module(critic).Critic

    for seed in seed_:
        datapath = folder_path / f'{actor}{critic}' / f'seed{seed}' / 'episodic'
        training(datapath, seed, Actor, Critic, expnoise_std, TOTAL_EPISODE,
                 value_noise_std)

t: 10000, Ep: 99, action std: 0.80
mean steps: 5.000, mean reward: 0.010, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 352.954, critic loss: 0.000, actor loss: 0.000
t: 20000, Ep: 199, action std: 0.80
mean steps: 4.920, mean reward: 0.010, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 353.863, critic loss: 0.000, actor loss: 0.000
t: 30000, Ep: 299, action std: 0.80
mean steps: 5.000, mean reward: 0.010, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 342.342, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.80
mean steps: 4.545, mean reward: 0.010, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 346.930, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.80
mean steps: 10.593, mean reward: 0.077, reward rate: 0.007, rewarded fraction: 0.004, relative distance: 287.132, critic loss: 0.008, actor loss: 0.016
t: 60000, Ep: 599, action std: 0.80
mean steps: 16.602, mean rewa

t: 250000, Ep: 2499, action std: 0.80
mean steps: 15.875, mean reward: 0.016, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 420.202, critic loss: 0.002, actor loss: -0.020
t: 260000, Ep: 2599, action std: 0.80
mean steps: 16.448, mean reward: 0.052, reward rate: 0.003, rewarded fraction: 0.004, relative distance: 404.761, critic loss: 0.002, actor loss: -0.019
t: 270000, Ep: 2699, action std: 0.80
mean steps: 16.655, mean reward: 0.017, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 409.400, critic loss: 0.003, actor loss: -0.019
t: 280000, Ep: 2799, action std: 0.80
mean steps: 16.876, mean reward: 0.015, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 402.380, critic loss: 0.003, actor loss: -0.019
t: 290000, Ep: 2899, action std: 0.80
mean steps: 16.933, mean reward: 0.042, reward rate: 0.002, rewarded fraction: 0.002, relative distance: 413.986, critic loss: 0.004, actor loss: -0.018
t: 300000, Ep: 2999, action std: 0.80
mean st

KeyboardInterrupt: 