In [1]:
import numpy as np
import config.config as config
from Environment import Env
from Agent_LSTM import *
from pathlib import Path

import sys; sys.path.append('../analysis/')
from my_utils import reset_seeds

In [2]:
def training(datapath, seed_number, Actor, Critic, TOTAL_EPISODE=5e4, value_noise_std=0,
             freeze_RNN=False, freeze_actor=False):
    """Resume an agent from its newest 'episodic' checkpoint and continue training.

    The checkpoint is looked up in ``datapath.parent / 'episodic'``; the config
    and the periodic saves of this run are written relative to ``datapath``.

    Args:
        datapath: Output folder of this run. Must support ``.parent`` and ``/``
            (the visible caller passes a ``pathlib.Path``).
        seed_number: Seed stored in the config and passed to ``reset_seeds``.
        Actor: Actor class handed to ``Agent``.
        Critic: Critic class handed to ``Agent``.
        TOTAL_EPISODE: The loop runs while the episode counter, which starts at
            ``agent.initial_episode``, is below this value.
        value_noise_std: Assigned to ``agent.actor.value_noise_std``.
        freeze_RNN: If True, the parameters of ``agent.critic.rnn1`` and
            ``agent.critic.rnn2`` get ``requires_grad = False``.
        freeze_actor: If True, all actor parameters get ``requires_grad = False``.

    Raises:
        ValueError: If ``freeze_RNN`` and ``freeze_actor`` are both True.
        IndexError: If the 'episodic' folder holds no numbered ``*.pth.tar``
            checkpoint or no ``*.pkl`` file.
    """
    # Reject the unsupported flag combination up front, before the config is
    # saved or any checkpoint is read.
    if freeze_RNN and freeze_actor:
        raise ValueError('freeze_RNN and freeze_actor cannot both be True; '
                         'enable at most one of them.')

    # get configures
    arg = config.ConfigGain(datapath)
    arg.SEED_NUMBER = seed_number
    arg.save()

    # reproducibility
    reset_seeds(arg.SEED_NUMBER)

    # initialize environment and agent
    env = Env(arg)
    agent = Agent(arg, Actor, Critic)
    agent.actor.value_noise_std = value_noise_std
    agent.episodic = False
    agent.data_path = datapath.parent / 'episodic'

    def episode_suffix(path):
        """Return the text after the last '-' in the file name, extensions dropped."""
        return path.stem.split('.')[0].split('-')[-1]

    # Pick the checkpoint with the largest numeric suffix.
    epis = [int(episode_suffix(v))
            for v in agent.data_path.glob('*.pth.tar')
            if episode_suffix(v).isdigit()]
    epi_load = str(np.sort(epis)[-1])
    name_load = [v.stem.split('_')[0] for v in agent.data_path.glob('*.pkl')][0]
    name_load = '-'.join([name_load, epi_load])
    print(name_load)
    agent.load(name_load, load_memory=False, load_optimzer=True, full_param=True, load_name=False)
    # From here on everything is written to this run's own folder.
    agent.data_path = datapath

    if freeze_RNN:
        for param in agent.critic.rnn1.parameters():
            param.requires_grad = False
        for param in agent.critic.rnn2.parameters():
            param.requires_grad = False
        print('RNN is frozen.')
    elif freeze_actor:
        for param in agent.actor.parameters():
            param.requires_grad = False
        print('Actor is frozen.')
    else:
        print('No param. frozen.')

    # define exploration noise
    noise = ActionNoise(arg.ACTION_DIM, mean=0, std=0.5)

    def start_trial(prev_action):
        """Reset env and agent.bstep; return (x, state) for step 0 of a new trial."""
        x0 = env.reset()
        agent.bstep.reset(env.pro_gains)
        state0 = torch.cat([x0[-arg.OBS_DIM:].view(1, 1, -1), prev_action,
                            env.target_position_obs.view(1, 1, -1),
                            torch.zeros(1, 1, 1)], dim=2).to(arg.device)
        return x0, state0

    # Loop now
    tot_t = 0
    episode = agent.initial_episode
    reward_log = []
    rewarded_trial_log = []
    step_log = []
    actor_loss_log = 0
    critic_loss_log = 0
    # Tiny non-zero start value so the loss averages in the log never divide by 0.
    num_update = 1e-5
    dist_log = []

    LOG_FREQ = 100
    REPLAY_PERIOD = 4
    PRE_LEARN_PERIOD = arg.BATCH_SIZE * 50

    # Start loop
    while episode < TOTAL_EPISODE:
        # initialize a trial
        cross_start_threshold = False
        last_action = torch.zeros(1, 1, arg.ACTION_DIM)
        x, state = start_trial(last_action)

        hiddenin = None
        tend = 0  # episode step at which the current trial started

        states = []
        actions = []
        rewards = []
        dones = []

        for t in range(arg.EPISODE_LEN):
            # 1. Check start threshold.
            if not cross_start_threshold and (last_action.abs() > arg.TERMINAL_ACTION).any():
                cross_start_threshold = True

            # 2. Take an action based on current state
            # and previous hidden & cell states of LSTM units.
            action, _action_raw, hiddenout = agent.select_action(state, hiddenin, action_noise=noise)
            # While the buffer is still filling, replace roughly 5% of actions by zeros.
            if len(agent.memory.memory) <= PRE_LEARN_PERIOD and np.random.rand() > 0.95:
                action = torch.zeros_like(action)

            # 3. Track next x in the environment.
            next_x, reached_target, relative_dist = env(x, action, t - tend)

            # 4. Next observation given next x.
            next_ox = agent.bstep(next_x)
            next_state = torch.cat([next_ox.view(1, 1, -1), action, env.target_position_obs.view(1, 1, -1),
                                    torch.ones(1, 1, 1) * t - tend + 1], dim=2).to(arg.device)

            # 5. Check whether stop.
            is_stop = env.is_stop(x, action)

            # 6. Give reward if stopped.
            if is_stop and cross_start_threshold:
                reward = env.return_reward(x, reward_mode='mixed')
                done = torch.ones(1, 1, 1)
            else:
                reward = torch.zeros(1, 1, 1)
                done = torch.zeros(1, 1, 1)

            # 7. Append data.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

            # 8. Update timestep.
            last_action = action
            state = next_state
            x = next_x
            hiddenin = hiddenout
            tot_t += 1

            # 9. Update model.
            if len(agent.memory.memory) > PRE_LEARN_PERIOD and tot_t % REPLAY_PERIOD == 0:
                actor_loss, critic_loss = agent.learn()
                actor_loss_log += actor_loss
                critic_loss_log += critic_loss
                num_update += 1

            # 10. whether break.
            if is_stop and cross_start_threshold:
                step_log.append(t + 1 - tend)
                reward_log.append(reward.item())
                rewarded_trial_log.append(int(reached_target & is_stop))
                dist_log.append(relative_dist.item())
                # Start the next trial inside the same episode; the recurrent
                # state (hiddenin) and last_action are carried over, not reset.
                cross_start_threshold = False
                x, state = start_trial(last_action)
                tend = t + 1

        # store the last state
        states.append(state)
        # End of one trial, store trajectory into buffer.
        states = torch.cat(states)
        actions = torch.cat(actions).to(arg.device)
        rewards = torch.cat(rewards).to(arg.device)
        dones = torch.cat(dones).to(arg.device)
        agent.memory.push(states, actions, rewards, dones)

        # store mirrored trajectories reflected along y-axis
        agent.memory.push(*agent.mirror_traj(states, actions), rewards, dones)

        if episode % LOG_FREQ == LOG_FREQ - 1:
            # save
            agent.save(save_memory=False, episode=episode, pre_phase=False, full_param=False)

            # NOTE(review): the actor loss is averaged over num_update/2 --
            # presumably the actor is updated on every other learn() call;
            # confirm against Agent.learn.
            print(f"t: {tot_t}, Ep: {episode}, action std: {noise.std:0.2f}")
            print(f"mean steps: {np.mean(step_log):0.3f}, "
                  f"mean reward: {np.mean(reward_log):0.3f}, "
                  f"reward rate: {np.sum(reward_log) / np.sum(step_log):0.3f}, "
                  f"rewarded fraction: {np.mean(rewarded_trial_log):0.3f}, "
                  f"relative distance: {np.mean(dist_log) * arg.LINEAR_SCALE:0.3f}, "
                  f"critic loss: {critic_loss_log / num_update:0.3f}, "
                  f"actor loss: {-actor_loss_log / (num_update/2):0.3f}")

            # Start a fresh statistics window for the next LOG_FREQ episodes.
            reward_log = []
            rewarded_trial_log = []
            step_log = []
            actor_loss_log = 0
            critic_loss_log = 0
            num_update = 1e-5
            dist_log = []

        episode += 1

In [3]:
# Experiment configuration.
# `actors` / `critics` are module names; the run loop pairs them element-wise
# with the per-pairing seed lists in `seeds`.
actors = ['Actor_novalue']
critics = ['Critic']
seeds = [[19,20,21]]
# NOTE(review): not referenced by the visible code -- training() hard-codes
# its exploration-noise std to 0.5.
expnoise_std = 0.5
TOTAL_EPISODE = 1e4
folder_path = Path('D:/quitting_data/agents')

value_noise_std = 0

freeze_RNN = False; freeze_actor = False

# Name of the output sub-folder, derived from the freeze flags.  The
# both-True case is rejected explicitly so `agent_type` can never be left
# undefined (or stale from an earlier notebook run).
if freeze_RNN and freeze_actor:
    raise ValueError('freeze_RNN and freeze_actor cannot both be True; '
                     'enable at most one of them.')
elif freeze_RNN:
    agent_type = 'freeze_RNN'
elif freeze_actor:
    agent_type = 'freeze_actor'
else:
    agent_type = 'no_freeze'

In [4]:
import importlib

# Train each (actor module, critic module) pairing once per seed.
for actor, critic, seed_ in zip(actors, critics, seeds):
    # Look the classes up explicitly instead of exec'ing a star import: no
    # dynamic code execution, and no unrelated names spilled into the
    # notebook namespace.  Each module is expected to expose a class named
    # `Actor` / `Critic` respectively.
    Actor = importlib.import_module(actor).Actor
    Critic = importlib.import_module(critic).Critic
    for seed in seed_:
        datapath = folder_path / f'{actor}{critic}' / f'seed{seed}' / f'{agent_type}'
        training(datapath, seed, Actor, Critic, TOTAL_EPISODE,
                 value_noise_std, freeze_RNN, freeze_actor)

20240430-183238-299
No param. frozen.
t: 10000, Ep: 99, action std: 0.50
mean steps: 12.510, mean reward: 4.321, reward rate: 0.345, rewarded fraction: 0.402, relative distance: 128.326, critic loss: 0.000, actor loss: 0.000
t: 20000, Ep: 199, action std: 0.50
mean steps: 11.852, mean reward: 4.246, reward rate: 0.358, rewarded fraction: 0.404, relative distance: 133.714, critic loss: 0.000, actor loss: 0.000
t: 30000, Ep: 299, action std: 0.50
mean steps: 11.967, mean reward: 4.512, reward rate: 0.377, rewarded fraction: 0.428, relative distance: 126.045, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.50
mean steps: 11.669, mean reward: 4.098, reward rate: 0.351, rewarded fraction: 0.383, relative distance: 135.060, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.50
mean steps: 16.861, mean reward: 6.878, reward rate: 0.408, rewarded fraction: 0.630, relative distance: 59.141, critic loss: 1.161, actor loss: 7.713
t: 60000, Ep: 599, actio

t: 450000, Ep: 4499, action std: 0.50
mean steps: 17.197, mean reward: 8.504, reward rate: 0.494, rewarded fraction: 0.814, relative distance: 47.163, critic loss: 0.937, actor loss: 24.670
t: 460000, Ep: 4599, action std: 0.50
mean steps: 17.144, mean reward: 8.412, reward rate: 0.491, rewarded fraction: 0.801, relative distance: 48.795, critic loss: 0.947, actor loss: 24.654
t: 470000, Ep: 4699, action std: 0.50
mean steps: 17.184, mean reward: 8.451, reward rate: 0.492, rewarded fraction: 0.807, relative distance: 47.244, critic loss: 0.955, actor loss: 24.632
t: 480000, Ep: 4799, action std: 0.50
mean steps: 17.453, mean reward: 8.748, reward rate: 0.501, rewarded fraction: 0.845, relative distance: 46.259, critic loss: 0.939, actor loss: 24.615
t: 490000, Ep: 4899, action std: 0.50
mean steps: 17.219, mean reward: 8.495, reward rate: 0.493, rewarded fraction: 0.816, relative distance: 48.640, critic loss: 0.937, actor loss: 24.614
t: 500000, Ep: 4999, action std: 0.50
mean steps: 

t: 890000, Ep: 8899, action std: 0.50
mean steps: 17.165, mean reward: 8.901, reward rate: 0.519, rewarded fraction: 0.865, relative distance: 45.856, critic loss: 1.060, actor loss: 25.275
t: 900000, Ep: 8999, action std: 0.50
mean steps: 16.860, mean reward: 8.598, reward rate: 0.510, rewarded fraction: 0.832, relative distance: 48.520, critic loss: 1.089, actor loss: 25.398
t: 910000, Ep: 9099, action std: 0.50
mean steps: 17.217, mean reward: 8.764, reward rate: 0.509, rewarded fraction: 0.850, relative distance: 46.276, critic loss: 1.005, actor loss: 25.509
t: 920000, Ep: 9199, action std: 0.50
mean steps: 17.206, mean reward: 8.377, reward rate: 0.487, rewarded fraction: 0.805, relative distance: 52.620, critic loss: 1.007, actor loss: 25.555
t: 930000, Ep: 9299, action std: 0.50
mean steps: 17.236, mean reward: 8.516, reward rate: 0.494, rewarded fraction: 0.819, relative distance: 48.250, critic loss: 1.005, actor loss: 25.589
t: 940000, Ep: 9399, action std: 0.50
mean steps: 

t: 330000, Ep: 3299, action std: 0.50
mean steps: 16.880, mean reward: 8.358, reward rate: 0.495, rewarded fraction: 0.798, relative distance: 49.728, critic loss: 0.905, actor loss: 22.292
t: 340000, Ep: 3399, action std: 0.50
mean steps: 17.019, mean reward: 8.514, reward rate: 0.500, rewarded fraction: 0.813, relative distance: 48.525, critic loss: 0.896, actor loss: 22.370
t: 350000, Ep: 3499, action std: 0.50
mean steps: 17.149, mean reward: 8.805, reward rate: 0.513, rewarded fraction: 0.849, relative distance: 45.283, critic loss: 0.891, actor loss: 22.467
t: 360000, Ep: 3599, action std: 0.50
mean steps: 17.344, mean reward: 8.283, reward rate: 0.478, rewarded fraction: 0.788, relative distance: 49.115, critic loss: 0.918, actor loss: 22.528
t: 370000, Ep: 3699, action std: 0.50
mean steps: 17.281, mean reward: 8.634, reward rate: 0.500, rewarded fraction: 0.829, relative distance: 48.212, critic loss: 0.904, actor loss: 22.600
t: 380000, Ep: 3799, action std: 0.50
mean steps: 

t: 770000, Ep: 7699, action std: 0.50
mean steps: 17.175, mean reward: 8.250, reward rate: 0.480, rewarded fraction: 0.791, relative distance: 53.480, critic loss: 1.017, actor loss: 23.754
t: 780000, Ep: 7799, action std: 0.50
mean steps: 17.102, mean reward: 8.525, reward rate: 0.499, rewarded fraction: 0.821, relative distance: 49.908, critic loss: 1.015, actor loss: 23.782
t: 790000, Ep: 7899, action std: 0.50
mean steps: 17.216, mean reward: 8.390, reward rate: 0.487, rewarded fraction: 0.802, relative distance: 50.806, critic loss: 1.025, actor loss: 23.791
t: 800000, Ep: 7999, action std: 0.50
mean steps: 17.113, mean reward: 8.425, reward rate: 0.492, rewarded fraction: 0.805, relative distance: 49.353, critic loss: 1.034, actor loss: 23.812
t: 810000, Ep: 8099, action std: 0.50
mean steps: 16.772, mean reward: 8.302, reward rate: 0.495, rewarded fraction: 0.794, relative distance: 50.975, critic loss: 1.016, actor loss: 23.815
t: 820000, Ep: 8199, action std: 0.50
mean steps: 

t: 210000, Ep: 2099, action std: 0.50
mean steps: 17.340, mean reward: 8.539, reward rate: 0.492, rewarded fraction: 0.820, relative distance: 47.926, critic loss: 0.945, actor loss: 23.696
t: 220000, Ep: 2199, action std: 0.50
mean steps: 17.277, mean reward: 8.369, reward rate: 0.484, rewarded fraction: 0.802, relative distance: 51.873, critic loss: 0.929, actor loss: 23.906
t: 230000, Ep: 2299, action std: 0.50
mean steps: 17.195, mean reward: 8.612, reward rate: 0.501, rewarded fraction: 0.832, relative distance: 49.515, critic loss: 0.927, actor loss: 24.099
t: 240000, Ep: 2399, action std: 0.50
mean steps: 17.308, mean reward: 8.660, reward rate: 0.500, rewarded fraction: 0.836, relative distance: 46.712, critic loss: 0.929, actor loss: 24.258
t: 250000, Ep: 2499, action std: 0.50
mean steps: 17.372, mean reward: 8.323, reward rate: 0.479, rewarded fraction: 0.792, relative distance: 49.679, critic loss: 0.921, actor loss: 24.381
t: 260000, Ep: 2599, action std: 0.50
mean steps: 

t: 650000, Ep: 6499, action std: 0.50
mean steps: 17.032, mean reward: 8.363, reward rate: 0.491, rewarded fraction: 0.799, relative distance: 48.476, critic loss: 0.967, actor loss: 25.040
t: 660000, Ep: 6599, action std: 0.50
mean steps: 16.981, mean reward: 8.541, reward rate: 0.503, rewarded fraction: 0.816, relative distance: 47.892, critic loss: 0.980, actor loss: 25.021
t: 670000, Ep: 6699, action std: 0.50
mean steps: 17.623, mean reward: 8.806, reward rate: 0.500, rewarded fraction: 0.851, relative distance: 46.313, critic loss: 0.970, actor loss: 25.019
t: 680000, Ep: 6799, action std: 0.50
mean steps: 17.495, mean reward: 8.560, reward rate: 0.489, rewarded fraction: 0.820, relative distance: 46.087, critic loss: 0.986, actor loss: 25.015
t: 690000, Ep: 6899, action std: 0.50
mean steps: 16.983, mean reward: 8.357, reward rate: 0.492, rewarded fraction: 0.792, relative distance: 47.743, critic loss: 0.956, actor loss: 25.007
t: 700000, Ep: 6999, action std: 0.50
mean steps: 