In [1]:
import importlib

# NOTE: the original import order is kept on purpose; `from Agent_LSTM import *`
# may rebind names imported before it, so reordering could change behavior.
import numpy as np
import config.config as config
from Environment import Env
from Agent_LSTM import *
from pathlib import Path
import pandas as pd

import sys; sys.path.append('../analysis/')
from my_utils import reset_seeds

In [2]:
def validation(datapath, seed_number, Actor, Critic, agent_name, logfile, epi, value_noise_std, total_epi=300):
    """Run a saved agent for ``total_epi`` trials and add one summary row to ``logfile``.

    Parameters
    ----------
    datapath
        Passed to ``config.ConfigGain`` to build the run configuration.
    seed_number
        Stored in ``arg.SEED_NUMBER`` and handed to ``reset_seeds``.
    Actor, Critic
        Passed to ``Agent(arg, Actor, Critic)``.
    agent_name
        Name given to ``agent.load`` (memory and optimizer are not loaded).
    logfile : pandas.DataFrame
        Accumulated summary table. It is not modified in place.
    epi
        Label that is printed and stored in the ``'episode'`` column.
    value_noise_std
        Assigned to ``agent.actor.value_noise_std`` before loading the agent.
    total_epi : int, default 300
        Number of trials to simulate.

    Returns
    -------
    pandas.DataFrame
        ``logfile`` with one additional row holding ``episode``,
        ``reward_fraction``, ``error_distance``, ``reward_rate`` and
        ``skipped fraction``.
    """
    # get configures
    arg = config.ConfigGain(datapath)
    arg.SEED_NUMBER = seed_number
    arg.device = 'cpu'

    # reproducibility
    reset_seeds(arg.SEED_NUMBER)

    # initialize environment and agent
    env = Env(arg)
    agent = Agent(arg, Actor, Critic)
    agent.actor.value_noise_std = value_noise_std
    agent.load(agent_name, load_memory=False, load_optimzer=False)

    # per-trial logs
    reward_log = []
    rewarded_trial_log = []
    step_log = []
    dist_log = []
    skipped_log = []
    tot_t = 0

    max_steps = 30  # hard cap on the number of steps in a single trial

    for _ in range(total_epi):
        # initialize a trial
        cross_start_threshold = False
        x = env.reset()
        agent.bstep.reset(env.pro_gains)

        # No action has been taken yet: the first state carries a zero action
        # and a zero time index.
        last_action = torch.zeros(1, 1, arg.ACTION_DIM)
        last_action_raw = last_action.clone()

        state = torch.cat([x[-arg.OBS_DIM:].view(1, 1, -1), last_action,
                           env.target_position_obs.view(1, 1, -1),
                           torch.zeros(1, 1, 1)], dim=2).to(arg.device)

        hiddenin = None
        tend = 0  # time offset subtracted from t; never changed inside this function

        for t in range(max_steps):
            # 1. Check start threshold: latched once the PREVIOUS raw action
            #    exceeded arg.TERMINAL_ACTION in magnitude.
            if not cross_start_threshold and (last_action_raw.abs() > arg.TERMINAL_ACTION).any():
                cross_start_threshold = True

            # 2. Take an action based on current state
            # and previous hidden & cell states of LSTM units.
            action, action_raw, hiddenout = agent.select_action(state, hiddenin, action_noise=None)

            # 3. Track next x in the environment.
            next_x, reached_target, relative_dist = env(x, action, t - tend)

            # 4. Next observation given next x.
            next_ox = agent.bstep(next_x)
            next_state = torch.cat([next_ox.view(1, 1, -1), action,
                                    env.target_position_obs.view(1, 1, -1),
                                    torch.ones(1, 1, 1) * (t - tend + 1)], dim=2).to(arg.device)

            # 5. Check whether stop (evaluated on the pre-step x).
            is_stop = env.is_stop(x, action)
            trial_done = is_stop and cross_start_threshold

            # 6. Give reward if stopped.
            if trial_done:
                reward = env.return_reward(x, reward_mode='mixed')
            else:
                reward = torch.zeros(1, 1, 1)

            # 7. Update timestep.
            last_action_raw = action_raw
            state = next_state
            x = next_x
            hiddenin = hiddenout

            # 8. whether break.
            if trial_done:
                break

        step_log.append(t + 1 - tend)
        reward_log.append(reward.item())
        rewarded_trial_log.append(int(reached_target & is_stop))
        dist_log.append(relative_dist.item())
        skipped_log.append(np.linalg.norm(x[:2]) < np.linalg.norm(env.target_position) * 0.3)
        # NOTE(review): this accumulates the last step index t, whereas step_log
        # records t + 1 - tend; confirm which one the reward rate should use.
        tot_t += t

    # Compute every summary statistic once so the printed line and the logged
    # row cannot drift apart.
    n_rewarded = np.sum(rewarded_trial_log)
    reward_rate = n_rewarded / (tot_t * arg.DT)
    reward_fraction = n_rewarded / total_epi
    error_distance = np.mean(dist_log) * arg.LINEAR_SCALE
    skipped_fraction = np.sum(skipped_log) / total_epi

    print(f"episode: {epi}, "
          f"mean steps: {np.mean(step_log):0.3f}, "
          f"mean reward: {np.mean(reward_log):0.3f}, "
          f"reward rate: {reward_rate:0.3f}, "
          f"rewarded fraction: {reward_fraction:0.3f}, "
          f"relative distance: {error_distance:0.3f}, "
          f"skipped fraction: {skipped_fraction:0.3f}")

    summary = pd.DataFrame({'episode': [epi],
                            'reward_fraction': [reward_fraction],
                            'error_distance': [error_distance],
                            'reward_rate': [reward_rate],
                            'skipped fraction': [skipped_fraction]})

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    if len(logfile) == 0:
        # Concatenating with an empty frame lets its object-dtype columns
        # influence the result (and warns on recent pandas). Keep the declared
        # column order and start the log from this row instead.
        columns = list(logfile.columns) + [c for c in summary.columns if c not in logfile.columns]
        return summary.reindex(columns=columns)
    return pd.concat([logfile, summary], ignore_index=True)
        

In [3]:
# --- Which agents to evaluate -------------------------------------------
# `actors`, `critics` and `seeds` are zipped together by the evaluation loop:
# entry i of each list describes one actor/critic pair and the seeds to run.
actors = ['Actor_novalue']
critics = ['Critic']
seeds = [[19, 20]]
agent_type = 'no_freeze'

# --- Where the saved agents live ----------------------------------------
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
folder_path = Path('D:/quitting_data/agents')

# --- Evaluation settings ------------------------------------------------
TOTAL_EPISODE = 10_000  # upper bound (exclusive) of the checkpoint episodes scanned
value_noise_std = 0

In [4]:
# Evaluate every 100th saved checkpoint of each (actor, critic, seed)
# combination and write one CSV of summary rows per seed directory.
for actor, critic, seed_ in zip(actors, critics, seeds):
    # Resolve the network classes by module name instead of exec-ing a star
    # import: no string-built code and no namespace pollution.
    # Assumes module `actor` defines a class `Actor` and module `critic` a
    # class `Critic`, which is how the names are used below.
    Actor = getattr(importlib.import_module(actor), 'Actor')
    Critic = getattr(importlib.import_module(critic), 'Critic')

    for seed in seed_:
        datapath = folder_path / f'{actor}{critic}' / f'seed{seed}' / f'{agent_type}'
        logfile = pd.DataFrame(columns=['episode', 'reward_fraction', 'error_distance',
                                        'reward_rate', 'skipped fraction'])

        # The agent name is the part of a *.pkl file stem before the first '_'.
        # Sorting makes the choice deterministic if several files are present.
        pkl_stems = sorted(v.stem for v in datapath.glob('*.pkl'))
        if not pkl_stems:
            raise FileNotFoundError(f'no *.pkl file found in {datapath}')
        agent_name = pkl_stems[0].split('_')[0]

        # Disabled: additionally evaluate the latest '*_pre.pth.tar' checkpoint as episode 0.
        #pre_epis = [int(v.name.split('_')[0].split('-')[-1]) for v in datapath.glob('*_pre.pth.tar')]
        #last_pre_epi = np.sort(pre_epis)[-1]
        #logfile = validation(datapath, seed, Actor, Critic, f'{agent_name}-{last_pre_epi}_pre', logfile, 0, value_noise_std)

        # Checkpoints are addressed as '<agent_name>-<episode>' for episodes 99, 199, ...
        for epi in np.arange(99, int(TOTAL_EPISODE), 100):
            logfile = validation(datapath, seed, Actor, Critic, f'{agent_name}-{epi}', logfile, epi, value_noise_std)
        logfile.to_csv(datapath / f'{agent_name}.csv', index=False)

episode: 99, mean steps: 15.283, mean reward: 9.558, reward rate: 0.660, rewarded fraction: 0.943, relative distance: 35.794, skipped fraction: 0.000
episode: 199, mean steps: 15.283, mean reward: 9.558, reward rate: 0.660, rewarded fraction: 0.943, relative distance: 35.794, skipped fraction: 0.000
episode: 299, mean steps: 15.283, mean reward: 9.558, reward rate: 0.660, rewarded fraction: 0.943, relative distance: 35.794, skipped fraction: 0.000
episode: 399, mean steps: 15.283, mean reward: 9.558, reward rate: 0.660, rewarded fraction: 0.943, relative distance: 35.794, skipped fraction: 0.000
episode: 499, mean steps: 14.457, mean reward: 9.203, reward rate: 0.666, rewarded fraction: 0.897, relative distance: 40.932, skipped fraction: 0.000
episode: 599, mean steps: 14.067, mean reward: 8.643, reward rate: 0.635, rewarded fraction: 0.830, relative distance: 46.629, skipped fraction: 0.000
episode: 699, mean steps: 17.647, mean reward: 8.284, reward rate: 0.481, rewarded fraction: 0.

episode: 5499, mean steps: 15.103, mean reward: 9.387, reward rate: 0.652, rewarded fraction: 0.920, relative distance: 35.486, skipped fraction: 0.000
episode: 5599, mean steps: 14.657, mean reward: 9.358, reward rate: 0.671, rewarded fraction: 0.917, relative distance: 37.298, skipped fraction: 0.000
episode: 5699, mean steps: 14.660, mean reward: 9.262, reward rate: 0.661, rewarded fraction: 0.903, relative distance: 39.307, skipped fraction: 0.000
episode: 5799, mean steps: 15.027, mean reward: 9.228, reward rate: 0.642, rewarded fraction: 0.900, relative distance: 39.559, skipped fraction: 0.000
episode: 5899, mean steps: 15.100, mean reward: 9.527, reward rate: 0.664, rewarded fraction: 0.937, relative distance: 37.632, skipped fraction: 0.000
episode: 5999, mean steps: 14.310, mean reward: 9.316, reward rate: 0.686, rewarded fraction: 0.913, relative distance: 39.814, skipped fraction: 0.000
episode: 6099, mean steps: 14.853, mean reward: 9.600, reward rate: 0.683, rewarded frac

episode: 899, mean steps: 14.317, mean reward: 8.451, reward rate: 0.603, rewarded fraction: 0.803, relative distance: 46.974, skipped fraction: 0.000
episode: 999, mean steps: 14.573, mean reward: 8.925, reward rate: 0.634, rewarded fraction: 0.860, relative distance: 44.175, skipped fraction: 0.000
episode: 1099, mean steps: 14.433, mean reward: 8.922, reward rate: 0.640, rewarded fraction: 0.860, relative distance: 42.488, skipped fraction: 0.000
episode: 1199, mean steps: 14.550, mean reward: 8.943, reward rate: 0.637, rewarded fraction: 0.863, relative distance: 39.970, skipped fraction: 0.000
episode: 1299, mean steps: 15.023, mean reward: 9.378, reward rate: 0.656, rewarded fraction: 0.920, relative distance: 40.762, skipped fraction: 0.000
episode: 1399, mean steps: 14.860, mean reward: 8.945, reward rate: 0.623, rewarded fraction: 0.863, relative distance: 41.128, skipped fraction: 0.000
episode: 1499, mean steps: 15.337, mean reward: 9.053, reward rate: 0.611, rewarded fracti

episode: 6299, mean steps: 15.107, mean reward: 9.328, reward rate: 0.647, rewarded fraction: 0.913, relative distance: 36.359, skipped fraction: 0.000
episode: 6399, mean steps: 15.300, mean reward: 9.490, reward rate: 0.653, rewarded fraction: 0.933, relative distance: 36.879, skipped fraction: 0.000
episode: 6499, mean steps: 15.003, mean reward: 9.341, reward rate: 0.652, rewarded fraction: 0.913, relative distance: 36.867, skipped fraction: 0.000
episode: 6599, mean steps: 15.553, mean reward: 9.568, reward rate: 0.648, rewarded fraction: 0.943, relative distance: 36.462, skipped fraction: 0.000
episode: 6699, mean steps: 14.670, mean reward: 9.492, reward rate: 0.683, rewarded fraction: 0.933, relative distance: 34.077, skipped fraction: 0.000
episode: 6799, mean steps: 15.193, mean reward: 9.626, reward rate: 0.669, rewarded fraction: 0.950, relative distance: 34.536, skipped fraction: 0.000
episode: 6899, mean steps: 14.753, mean reward: 9.391, reward rate: 0.669, rewarded frac