[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rezer0dai/TD3_BC/blob/her/td3_bc_her.ipynb)

In [None]:
!git clone https://github.com/rezer0dai/TD3_BC -b her
#!git clone https://github.com/sfujim/TD3
    
!git clone https://github.com/qgallouedec/panda-gym

!pip install -e panda-gym

In [1]:
import sys

# Make the cloned repositories importable: register each checkout both as a
# relative path and under /content (presumably the Colab working directory).
libs = ["TD3_BC", "TD3", "panda-gym"]
for lib in libs:
    sys.path.extend([lib, "/content/" + lib])

In [2]:
import numpy as np
import torch
import gym
import argparse
import os

import utils
import TD3_BC
#import TD3
import config

In [3]:
def eval_policy(policy, eval_env, seed, normalize_state, seed_offset=100, eval_episodes=10):
    """Roll out ``policy`` in ``eval_env`` and report the mean episode return.

    Args:
        policy: object exposing ``select_action(state)``.
        eval_env: environment exposing ``seed``, ``reset`` and ``step``; its
            observations are mappings with an ``"observation"`` array entry.
        seed: base seed; the environment is seeded with ``seed + seed_offset``.
        normalize_state: callable applied to the ``(1, -1)``-reshaped
            observation before it is handed to the policy.
        seed_offset: value added to ``seed`` before seeding the environment.
        eval_episodes: number of episodes to run.

    Returns:
        The sum of all step rewards divided by ``eval_episodes``.
    """
    eval_env.seed(seed + seed_offset)

    total_reward = 0.
    for _episode in range(eval_episodes):
        observation = eval_env.reset()
        finished = False
        while not finished:
            # The policy consumes a single-row batch of the raw observation vector.
            features = observation["observation"].reshape(1, -1)
            chosen_action = policy.select_action(normalize_state(features))
            observation, step_reward, finished, _info = eval_env.step(chosen_action)
            total_reward += step_reward

    avg_reward = total_reward / eval_episodes

    separator = "---------------------------------------"
    print(separator)
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print(separator)
    return avg_reward


In [4]:
import random
def her(replay_buffer, achieved_goals):
    """Overwrite the newest transitions in ``replay_buffer`` with goal-relabelled copies.

    Transitions are read back from the slot just before ``replay_buffer.ptr``
    (newest first) for as long as the slot preceding the next candidate has a
    truthy ``not_done`` flag.  The collected slots are then released by
    rewinding ``ptr``/``size`` and re-filled with relabelled copies.

    Args:
        replay_buffer: buffer exposing indexable ``state``, ``action``,
            ``next_state``, ``reward`` and ``not_done`` storages, ``ptr`` and
            ``size`` counters, and ``add(state, action, next_state, reward, done)``.
        achieved_goals: sequence of arrays (each supporting ``.copy()``) from
            which substitute goals are sampled.

    Returns:
        None.  The function returns without touching the buffer when the buffer
        pointer is 0, when fewer than two transitions were collected, or when
        ``config.HER_PER_EP`` is below 1 (previously the latter two cases
        rewound the buffer and then crashed on ``ep_[0]``).
    """
    ep = []
    i = replay_buffer.ptr

    if not i:
        return

    # Walk backwards from the newest slot.  Slot ``i-len(ep)-1`` is collected
    # only while slot ``i-len(ep)-2`` is flagged not-done, so the slot that
    # directly follows a terminal flag is never collected and stays in place.
    # NOTE(review): negative indices wrap to the end of the storages — confirm
    # this is the intended behaviour for the first episode / a wrapped buffer.
    while replay_buffer.not_done[i-len(ep)-2]:
        ep.append([
            replay_buffer.state[i-len(ep)-1].copy(),
            replay_buffer.action[i-len(ep)-1].copy(),
            replay_buffer.next_state[i-len(ep)-1].copy(),
            replay_buffer.reward[i-len(ep)-1], 0 == len(ep)
            ])

    # Relabelling skips ep[0], so at least two collected transitions and at
    # least one relabelling pass are required to produce ``ep_[0]`` below.
    # Bail out before rewinding so the stored transitions are left intact.
    if len(ep) < 2 or config.HER_PER_EP < 1:
        return

    # Release the collected slots; they are overwritten by the adds below.
    # NOTE(review): the un-relabelled originals are not written back.
    replay_buffer.ptr -= len(ep)
    replay_buffer.size -= len(ep)

    for _ in range(config.HER_PER_EP):
        ep_ = []
        # ep[0] (the newest transition, flagged done) is not relabelled.
        for j, e in enumerate(ep[1:]):
            s, a, n, r, d = e
            s, n = s.copy(), n.copy()
            # The trailing GOAL_SIZE entries of both states are replaced by a
            # goal drawn from the first j+1 entries of ``achieved_goals``.
            # NOTE(review): ``ep`` is newest-first while the slice counts from
            # the start of ``achieved_goals`` — confirm the pairing is intended.
            s[-config.GOAL_SIZE:] = n[-config.GOAL_SIZE:] = random.choice(achieved_goals[:j+1]).copy()
            # Reward is 0 when the leading GOAL_SIZE entries of the next state
            # lie within 0.05 of the substituted goal, otherwise -1.
            r = -1. * (np.linalg.norm(n[:config.GOAL_SIZE] - s[-config.GOAL_SIZE:]) > .05)
            ep_.append([s, a, n, r, d])

        for e in ep_:
            replay_buffer.add(*e)

        # With probability HER_RATIO the second copy of this pass is skipped.
        if random.random() < config.HER_RATIO:
            continue
        for e in ep_:
            replay_buffer.add(*e)

    # Finish with one transition flagged done so the next backward walk stops here.
    replay_buffer.add(*(ep_[0][:-1]), True)

In [None]:
import random
from open_gym import make_env

# Training driver: builds the environments, policy and replay buffer, then
# interleaves environment interaction, hindsight relabelling (her), policy
# updates and periodic evaluation until the evaluation score exceeds -10 or
# the step budget is exhausted.
if True:#__name__ == "__main__":
    file_name = f"{config.ENV}_{config.SEED}"
    print("---------------------------------------")
    print(f"Policy: , Env: {config.ENV}, Seed: {config.SEED}")
    print("---------------------------------------")

    # Separate environments for data collection and (rendered) evaluation.
    env = make_env(config.ENV, render=False, colab=True)
    eval_env = make_env(config.ENV, render=True, colab=True)

    # Set seeds
    env.seed(config.SEED)
    env.action_space.seed(config.SEED)
    torch.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    # The exploration schedule below and her() draw from the stdlib `random`
    # module, so it has to be seeded as well for runs to be repeatable.
    random.seed(config.SEED)

    state_dim = env.state_size()
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": config.DISCOUNT,
            "tau": config.TAU,
            # TD3
            "policy_noise": config.POLICY_NOISE * max_action,
            "noise_clip": config.NOISE_CLIP * max_action,
            "policy_freq": config.POLICY_FREQ,
            # TD3 + BC
            "alpha": config.ALPHA
    }

    # Initialize policy
    #policy = TD3.TD3(**kwargs)
    policy = TD3_BC.TD3_BC(**kwargs)

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    print("---------------------------------------")
    print(f"Policy TD3+BC+HER: , Env: {config.ENV}, Seed: {config.SEED}, Observation shape: {state_dim}")
    print("---------------------------------------")

    # Observations are dicts; the policy consumes the "observation" entry as a single-row batch.
    load_state = lambda obs: obs["observation"].reshape(1,-1)

    # Starting with done=True forces an environment reset on the first iteration.
    done = True
    reward = -1.
    total_steps = config.STEPS_PER_EPOCH * config.EPOCHS
    for t in range(total_steps):

        if done:
            # Relabel the episode that just finished (there is none at t == 0).
            if t: her(replay_buffer, achieved_goals)
            achieved_goals = []
            # Per-episode exploration weight: 1..5 with probability 0.5, otherwise 0.
            mc_w = random.randint(1, 5) if random.random() > .5 else 0.
            state = load_state(env.reset())

        # Act randomly during warm-up, and afterwards with probability mc_w / 10
        # whenever the previous reward was -1; otherwise follow the policy.
        if t < config.START_STEPS or (-1. == reward and random.random() < mc_w / 10.):
            action = env.action_space.sample()
        else:
            action = policy.select_action(replay_buffer.normalize_state(state))

        observation, reward, done, _ = env.step(action)
        next_state = load_state(observation)
        # Collected per step so her() can sample substitute goals from them.
        achieved_goals.append(observation["achieved_goal"])

        replay_buffer.add(state, action, next_state, reward, done)

        state = next_state

        if t > config.UPDATE_AFTER and 0 == t % config.UPDATE_EVERY:
            for j in range(config.UPDATE_COUNT):
                policy.train(replay_buffer, config.BATCH_SIZE)
            # NOTE(review): presumably the soft target-network update — confirm in TD3_BC.
            policy.polyak()

        # Sentinel below the stopping threshold for steps without an evaluation.
        score = -100

        # Evaluate episode
        if (t + 1) % config.EVAL_FREQ == 0:
            # Integer division gives the running evaluation count; the previous
            # modulo was always 0 inside this branch.
            print(f"Epochs : {(t + 1) // config.EVAL_FREQ}")
            score = eval_policy(policy, eval_env, config.SEED, replay_buffer.normalize_state)
        # Stop training once the average evaluation reward is better than -10.
        if score > -10.:
            break

---------------------------------------
Policy: , Env: panda-pusher, Seed: 0
---------------------------------------




---------------------------------------
Policy TD3+BC+HER: , Env: panda-pusher, Seed: 0, Observation shape: 30
---------------------------------------


In [None]:
# Run 100 evaluation rounds (eval_policy's default of 10 episodes each), every
# round with a freshly drawn random base seed; the resulting list of average
# rewards is the value displayed by this cell.
[ eval_policy(policy, eval_env, random.randint(0, 100000), replay_buffer.normalize_state) for _ in range(100) ]