In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Fetch the three source repositories this notebook imports from.
!git clone https://github.com/rezer0dai/dlppoh
!git clone https://github.com/fgolemo/gym-ergojr
!git clone https://github.com/mahyaret/gym-panda

# Editable installs of the two cloned gym packages, plus the timing helper.
# NOTE(review): no commit or version is pinned for any of these, so a fresh
# run may pick up different code than the run that produced earlier results.
!pip install -e gym-ergojr
!pip install -e gym-panda
!pip install timebudget

In [None]:
import sys

# Register every cloned repository on the import path twice: first relative to
# the working directory, then under the absolute "/content/" prefix.
sys.path.extend(
    prefix + repo
    for prefix in ("", "/content/")
    for repo in ("dlppoh", "gym-ergojr", "gym-panda")
)

In [None]:
import torch
torch.set_default_dtype(torch.float32)

In [None]:
import numpy as np
import random, timebudget

In [None]:
import config

In [None]:
class Info:
    """Plain record bundling the per-step data handed between the tasks.

    Every constructor argument is stored unchanged on the attribute of the
    same name: states, rewards, actions, custom_rewards, dones, goals, goods
    and pi.
    """

    def __init__(self, states, rewards, actions, custom_rewards, dones, goals, goods, pi):
        # Assigned left to right, so attribute creation order matches the signature.
        (self.states, self.rewards, self.actions, self.custom_rewards) = (
            states, rewards, actions, custom_rewards)
        (self.dones, self.goals, self.goods, self.pi) = (
            dones, goals, goods, pi)

In [None]:
def ones(*shape):
    """Return a tensor of ones with the given shape."""
    return torch.ones(*shape)


def zeros(*shape):
    """Return a tensor of zeros with the given shape."""
    return torch.zeros(*shape)


def tensor(x, shape):
    """Build a tensor from ``x`` and view it with the dimensions in ``shape``."""
    return torch.tensor(x).view(*shape)


def achieved_goals(states):
    """Slice the first ``config.CORE_ORIGINAL_GOAL_SIZE`` columns out of ``states``."""
    return states[:, :config.CORE_ORIGINAL_GOAL_SIZE]


# Norm of the difference must exceed this threshold to count as movement.
MOVE_DIST = 3e-4


def moved(s1, s):
    """Tell whether the norm of ``s1 - s`` is strictly greater than ``MOVE_DIST``."""
    return MOVE_DIST < torch.norm(s1 - s)

def select_exp(states_1, states):
    """Flag, row by row, whether the achieved goal differs between two state batches.

    With ``config.SELECT_EXP`` disabled every row of ``states`` is flagged
    ``True``. Otherwise each flag is the result of ``moved`` applied to the
    paired achieved-goal rows of ``states_1`` and ``states``.
    """
    if config.SELECT_EXP:
        goals_a = achieved_goals(states_1)
        goals_b = achieved_goals(states)
        return [moved(row_a, row_b) for row_a, row_b in zip(goals_a, goals_b)]
    return [True] * len(states)

In [None]:
from tasks.oaiproc import GymGroup, GymRender

class LowLevelCtrlTask:
    """Task wrapper around the gym environments driven by the low-level policy.

    Holds two environment handles (`ENV`, used when `learn_mode` is true, and
    `RENDER`, used otherwise), the goals currently assigned to the low level,
    and the most recent `Info` record produced by `_state`.
    """

    def __init__(self, dock, prefix):
        """Create both environment handles and clear goals, info and goods."""
        self.ENV = GymGroup(config.TEST_ENVS, dock, config.TOTAL_ENV, prefix)
        self.RENDER = GymRender(config.ENV_NAME, config.TOTAL_ENV)

        self.goals = None
        self.info = None
        self.set_goods(False)

    def set_goods(self, goods):
        """Store the `goods` flags that `_state` copies into each `Info`."""
        self.goods = goods

    def get_info(self):
        """Return the `Info` built by the latest `_state` call (None before any)."""
        return self.info

    def set_goals(self, goals):
        """Store the goals that subsequent `Info` records will carry."""
        self.goals = goals

    def get_goals(self):
        """Return the currently stored goals."""
        return self.goals

    def optimal_goals(self, base_states, end_states):
        """Sample goals from the high-level agent for the given state pair.

        NOTE: `base_states` is modified in place - its last
        `config.CORE_ORIGINAL_GOAL_SIZE` columns are overwritten with the
        achieved goals of the current `self.info.states`.

        Returns a sample from the distribution returned by
        `self.hl_agent.exploit`.
        """
        base_states[:, -config.CORE_ORIGINAL_GOAL_SIZE:] = achieved_goals(self.info.states)
        dist, _, _ = self.hl_agent.exploit(
            end_states[:, :config.CORE_GOAL_SIZE],
            base_states,
            zeros(len(self.info.states), 1), 0)
        return dist.sample()

    # time feature is only for critic
    def append_time_feat(self, states):
        """Advance the step counter and optionally append a time-feature column.

        The counter `self.n_steps` is incremented on every call, even when
        `config.TIMEFEAT` is off (in which case `states` is returned as is).
        The appended column is
        `1 - n_steps / (1 + HRL_HIGH_STEP * HRL_STEP_COUNT)`.

        Raises AssertionError once `n_steps` reaches that denominator.
        """
        self.n_steps += 1
        if not config.TIMEFEAT:
            return states
        assert self.n_steps <  (1. + config.HRL_HIGH_STEP * config.HRL_STEP_COUNT)
        tf = ones(states.shape[0], 1) - (self.n_steps /  (1. + config.HRL_HIGH_STEP * config.HRL_STEP_COUNT))
        return torch.cat([states, tf], 1)

    def _state(self, 
            einfo, actions, pi,
            learn_mode=False, reset=False, seed=None):
        """Convert an environment result into an `Info`, store it and return it.

        `einfo` is unpacked as `(states, goals, rewards, dones)`. The stored
        states are the environment states, followed by the optional time
        feature, followed by the environment goals. `learn_mode`, `reset` and
        `seed` are accepted but not read in this body.
        """

        states, goals, rewards, dones = einfo

        states = self.append_time_feat(states)

        states = torch.cat([states, goals], 1)
        if config.CORE_GOAL_SIZE != config.CORE_ORIGINAL_GOAL_SIZE:
            # NOTE(review): `self.orig_pos` is not assigned anywhere in this class,
            # and the local `goals` is not used after this line (the Info below
            # receives `self.goals`) - confirm whether this branch is still needed.
            goals = np.concatenate([goals, self.orig_pos], 1)

        # Fall back to all-False flags when no goods were provided (None).
        goods = self.goods if self.goods is not None else [False for _ in range(len(states))] 
        rewards = tensor(rewards, [-1, 1])
        self.info = Info(
                states,
                rewards,
                actions,
# custom rewards here does not matter, all experience will be "dreamed" but based on true exp
                rewards,
                tensor(dones, [len(dones), -1]).float(),
                self.goals,
                goods,
                pi,
                )

        return self.info

    def reset(self, agent, seed, learn_mode):
        """Return the stored `Info` without touching the environment.

        The arguments are ignored; the actual reset is done by `internal_reset`.
        """
        return self.info

    def internal_reset(self, agent, seed, learn_mode):
        """Reset the environment and rebuild `self.info`.

        Stores `agent` as the high-level agent, zeroes the step counter, resets
        `ENV` (learn mode) or `RENDER` (otherwise), clears `goods` to None and
        builds the initial `Info`. Returns `einfo[1]`, which `_state` treats as
        the environment goals.
        """
        self.hl_agent = agent
        self.n_steps = 0
        self.learn_mode = learn_mode

        if self.learn_mode:
            einfo = self.ENV.reset(seed)
        else:
            einfo = self.RENDER.reset(seed)

        self.set_goods(None)

        self._state(
            einfo, 
            None, None,
            learn_mode, True, seed)

        return einfo[1]

    def step(self, pi):
        """Step the environment with the first third of `pi`'s columns.

        The leading `pi.shape[1] // 3` columns are moved to CPU, converted to
        numpy and sent to the environment; `ENV.step` additionally receives a
        ones vector of length `len(pi)`. The `Info` records
        `pi[:, :config.ACTION_SIZE]` as the actions and the full `pi`.
        """
        if self.learn_mode:
            einfo = self.ENV.step(
                    pi[:, :pi.shape[1]//3].cpu().numpy(), ones(len(pi)))# if sum(self.goods) else None)
        else:
            einfo = self.RENDER.step(
                    pi[:, :pi.shape[1]//3].cpu().numpy())

        return self._state(einfo, pi[:, :config.ACTION_SIZE], pi, self.learn_mode, False)

    def goal_met(self, _total_reward, _last_reward):
        """Always report that the goal is not met."""
        return False

In [None]:
from config import *
from timebudget import timebudget

from utils.her import HER, CreditAssignment

In [None]:
# Two-slot tally updated by ReacherHRL.update_goal: slot 0 counts calls where no
# HER step index was set, slot 1 counts calls where at least one was set.
COUNTER = [0, 0]

# Number of trailing transitions excluded from hindsight relabelling.
# Author's note, paraphrased (TODO confirm): the very last state only carries a
# dummy n_state, and the last one with a valid n_state is dropped as well; with
# a large step count (10+) discarding it does not waste much experience.
SENTINEL = 2
class ReacherHRL(HER):
    """HER subclass that relabels low-level goals using the two agents in `config.AGENT`."""

    def __init__(self, cind, her_delay, gae, n_step, floating_step, gamma, gae_tau, her_select_ratio=.4, resampling=False, kstep_ir=False, clip=None):
        """Forward every argument unchanged to `HER.__init__`."""
        super().__init__(cind, her_delay, gae, n_step, floating_step, gamma, gae_tau, her_select_ratio, resampling, kstep_ir, clip)

    def _her_select_idx(self, n_steps):
        """Pick candidate indices for relabelling.

        Every position except the last `SENTINEL` keeps its own index with
        probability 0.5 and is otherwise 0; the last `SENTINEL` positions are
        always 0. Index 0 therefore always maps to 0.
        """
        hers = [ i if random.random() < .5 else 0 for i, s in enumerate(n_steps[:-SENTINEL]) ]
        return hers + [0 for _ in range(SENTINEL)]

    @timebudget
    def update_goal(self, rewards, goals, states, states_1, n_goals, n_states, actions, her_step_inds, n_steps, allowed_mask):
        """Relabel goals, goal columns of states, and rewards for selected steps.

        Works on clones of `rewards`, `goals`, `states`, `n_goals` and
        `n_states`; `allowed_mask` and `her_step_inds` are modified in place.

        Returns the tuple
        `(h_rewards, h_goals, h_states, h_n_goals, h_n_states, allowed_mask)`.

        Raises AssertionError if the last mask entry is set, or if either of
        the last two `her_step_inds` entries is set after the clean-up pass.
        """
        global COUNTER

        # Round an index down to a multiple of HRL_STEP_COUNT.
        align = lambda x: x//config.HRL_STEP_COUNT*config.HRL_STEP_COUNT

        MAX_HER_STEP = 1  # not read anywhere in this method

        h_goals = goals.clone()
        h_n_goals = n_goals.clone()
        h_rewards = rewards.clone()

        h_states = states.clone()
        h_n_states = n_states.clone()

        assert not allowed_mask[-1] # last goal will be not used!!
        allowed_mask[-2] = False # last state_1 should not be used too

        idxs = []        # step indices that get relabelled
        her_goals = []   # per relabelled step: two indices used as goal sources
        her_states = []  # per relabelled step: two aligned indices (for i and i+step)

        x = 0  # count of relabelled steps with gro == 1
        z = 0  # count of relabelled steps with gro != 1

        hers = []
        others = []

        # The boolean index selects slot 0 (no HER index set) or slot 1 (some set).
        COUNTER[sum(her_step_inds) > 0] += 1
        # At each multiple of HRL_STEP_COUNT, clear the flag when the step just
        # before it is flagged.
        for i in range(HRL_STEP_COUNT, len(goals)-1, HRL_STEP_COUNT):
            if her_step_inds[i-1]:
                her_step_inds[i] = 0

#        assert 1 == HRL_STEP_COUNT or len(goals) - 1 == (len(goals) // HRL_STEP_COUNT) * HRL_STEP_COUNT

        assert not sum(her_step_inds[-2:])

        # Walk the episode backwards; `u` is the her_step_inds entry and `step`
        # the n_steps entry for index i.
        for j, (r, g, s, s2, n_g, n, u, step) in enumerate(zip(reversed(rewards), reversed(goals), reversed(states), reversed(states_1), reversed(n_goals), reversed(n_states), reversed(her_step_inds), reversed(n_steps))):
            if not step:
                continue

            i = len(goals) - 1 - j
            if i >= len(goals) - SENTINEL:
                continue

            # A flagged successor means this step continues an already started relabelling.
            her_active = her_step_inds[i+1]
#            assert her_active or not her_step_inds[i+step]
            if not her_active and her_step_inds[i+step]:
                allowed_mask[i] = False

            # A fresh relabelling draws `gro`; it is forced to 1 with probability
            # her_select_ratio. Presumably an offset counted in blocks of
            # HRL_STEP_COUNT - TODO confirm.
            if not her_active and u:
                gro = random.randint(1, 1 + (len(goals) - i - step - SENTINEL) // HRL_STEP_COUNT)
                if random.random() < self.her_select_ratio:
                    gro = 1

            # NOTE(review): when her_active is set, `gro` is whatever an earlier
            # iteration left behind; it is unbound if none assigned it - confirm
            # that this cannot happen.
            if her_active or u:
                if 1 == gro:
#                    assert i+1 == gid
                    h_rewards[i] = (config.REWARD_DELTA + torch.zeros(1, 1)) * config.REWARD_MAGNITUDE
                    x += 1
                    hers.append(i)
                else:
                    h_rewards[i] = (config.REWARD_DELTA - torch.ones(1, 1)) * config.REWARD_MAGNITUDE
                    z += 1
                    others.append(i)

#                if align(i) != align(step+i):
#                    print("===>", i, step, align(i), align(step+i), gro, align(step + i) + gro * config.HRL_STEP_COUNT)
#                assert align(step + i) + gro * config.HRL_STEP_COUNT <= 100

                # Goal-source indices: the aligned index plus gro blocks, for i and for i+step.
                hg = [align(i) + gro * config.HRL_STEP_COUNT, align(step + i) + gro * config.HRL_STEP_COUNT]
                hs = [align(i), align(step+i)]

                idxs.append(i)
                her_goals.extend(hg)
                her_states.extend(hs)

            else:
                others.append(i)

        # Keep the mask only for relabelled indices; everything else is disallowed.
        allowed = allowed_mask[idxs]
        allowed_mask[...] = False # rest we dont know if good or not
        allowed_mask[idxs] = allowed

#        mask = np.zeros(len(n_steps))
#        mask[idxs] = 1.
#        if sum(her_step_inds): print("\nHUH", np.concatenate([np.asarray(n_steps).reshape(-1, 1), np.asarray(her_step_inds).reshape(-1, 1), mask.reshape(-1, 1)], 1))

        # The relabelling below runs only if at least one step had gro == 1.
        if len(hers):

            # Overwrite the trailing goal columns with the leading columns of the
            # states at the goal-source indices (even entries for states, odd
            # entries for n_states).
            h_states[idxs, -config.CORE_ORIGINAL_GOAL_SIZE:] = states[her_goals[0::2], :config.CORE_ORIGINAL_GOAL_SIZE].clone()
            h_n_states[idxs, -config.CORE_ORIGINAL_GOAL_SIZE:] = states[her_goals[1::2], :config.CORE_ORIGINAL_GOAL_SIZE].clone()

            her_states_t = h_states[her_states].view(len(her_states), -1).clone()
#            assert her_states_t[1::2, -config.CORE_ORIGINAL_GOAL_SIZE:].shape == states[her_goals[1::2], :config.CORE_ORIGINAL_GOAL_SIZE].clone().shape
            her_states_t[1::2, -config.CORE_ORIGINAL_GOAL_SIZE:] = states[her_goals[1::2], :config.CORE_ORIGINAL_GOAL_SIZE].clone()

            if config.TIMEFEAT:
                if config.TF_LOW:
                    if not config.NORMALIZE: # TODO this is just temporarerly
                        pass#assert False
                        #her_states_t[:, -1 - config.CORE_ORIGINAL_GOAL_SIZE] = 1.
                else:
                    # This configuration is rejected; the statement after the
                    # assert cannot run while assertions are enabled.
                    assert False
                    her_states_t[:, -1 - config.CORE_ORIGINAL_GOAL_SIZE] = (1. - (torch.tensor(her_states) /  (1.+config.HRL_HIGH_STEP)))# * .1

            # Sample replacement goals from config.AGENT[1].
            dist, _, _ = config.AGENT[1].exploit(
                    h_states[her_goals, :CORE_GOAL_SIZE].view(len(her_goals), -1),
                    her_states_t,
                    torch.zeros(len(her_goals), 1), 0)
            her_hers = dist.sample().view(len(her_goals), -1)




            # Score the recorded actions under config.AGENT[0] given the sampled
            # goals; entries whose mean log-probability is below -1 are disallowed.
            dist, _, _ = config.AGENT[0].exploit(
                    her_hers,
                    her_states_t,
                    torch.zeros(len(her_goals), 1), 0)
            bool_inds = (dist.log_prob(actions[her_states]).mean(1) < -1.)
            hi = torch.tensor(her_goals)[bool_inds][::2]
            if len(hi) * 3 >= len(idxs) * 2: print("DISBANDED {} vs {}".format(len(hi), len(idxs)))
            allowed_mask[hi] = False




            # Even rows become the goals, odd rows the n_goals of the relabelled steps.
            h_goals[idxs] = her_hers[0::2].float()
            h_n_goals[idxs] = her_hers[1::2].float()

#        print("\n ->>>>", sum(allowed_mask))

        return ( h_rewards, h_goals, h_states, h_n_goals, h_n_states, allowed_mask )

In [None]:
from head import install_lowlevel
from torch.distributions import Normal
from utils.schedule import LinearSchedule

class HighLevelCtrlTask:
    """High-level task: each step hands a goal to the low level and advances its rollout.

    Owns a `LowLevelCtrlTask` plus the low-level env/task pair returned by
    `install_lowlevel`, and pulls one item from the low-level iterator per
    high-level step.
    """

    def __init__(self, dock, prefix):
        """Create the low-level controller and install it with `ReacherHRL`."""
        self.ll_ctrl = LowLevelCtrlTask(dock, prefix)
        self.ll_env, self.ll_task = install_lowlevel(self.ll_ctrl, ReacherHRL)

        # Iterator over the low-level rollout; created in reset().
        self.lowlevel = None

        self.ls = LinearSchedule(.1, .8, config.HRL_HINDSIGHTACTION_HORIZON)
        self.ctrl = None

        #DEBUG
        self.total = 0   # actions seen by proximaly_close_actions since the last reset
        self.probed = 0  # how many of those were replaced

        self.goals = None
        self.goods = None

    def get_goals(self):
        """Return the goals obtained from the last low-level reset."""
        return self.goals

    def _state(self, 
            einfo, base_states, rewards, actions, goods, pi,
            learn_mode=False, reset=False, seed=None):
        """Build, store and return the high-level `Info` from a low-level `Info`.

        `self.goods` accumulates a per-row OR of every `goods` list seen, and is
        reset to all-False when `goods` is None. The returned `Info` carries the
        `goods` argument itself, not the accumulated list. `base_states`,
        `learn_mode`, `reset` and `seed` are accepted but not read here.
        """

        states = einfo.states.clone()

        self.goods = [(g0 or g1) for g0, g1 in zip(self.goods, goods)] if goods is not None else [False for _ in range(len(states))]

        info = Info(
                states,
                rewards,
                actions,
# custom rewards shape                    
                (einfo.rewards + config.REWARD_DELTA) * config.REWARD_MAGNITUDE,
                einfo.dones,
                self.goals,
                goods,
                pi,
                )

        self.info = info
        return info

    def _finish_ep(self):
        """Close the previous episode; does nothing before the first reset.

        Prints the number of flagged rows, passes the accumulated goods to the
        low-level controller and consumes whatever is left of the low-level
        iterator.
        """
        if self.lowlevel is None:
            return
        # allow to learn only from eps where we moved
        print("\n ep selection --->", sum(self.goods))
        self.ll_ctrl.set_goods(self.goods)
        for _ in self.lowlevel:
            print("DO FINISH")

    def reset(self, agent, seed, learn_mode):
        """Finish the previous episode, start a new low-level rollout, return the first `Info`.

        The new iterator comes from `ll_env.step` in learn mode and from
        `ll_env.evaluate` otherwise. It is created before
        `ll_ctrl.internal_reset` is called.
        """
        self.learn_mode = learn_mode

        timebudget.report(reset=True)

        print("\nstats : ", self.probed, self.total, self.ls.c, (self.ls.get_ls() - 1.), learn_mode)
        self.total = 0
        self.probed = 0

        self._finish_ep()

        self.lowlevel = self.ll_env.step(
            self.ll_task, seed, config.HRL_STEP_COUNT) if learn_mode else self.ll_env.evaluate(
            self.ll_task, config.HRL_STEP_COUNT)

        self.goals = self.ll_ctrl.internal_reset(agent, seed, learn_mode)

        return self._state(
            self.ll_ctrl.get_info(), 
            None, None, None, None, None,
            learn_mode=learn_mode, reset=True, seed=seed)

    def step(self, pi):
        """Run one high-level step.

        The first third of `pi`'s columns is cloned and set as the low-level
        goals, then one item is pulled from the low-level iterator. The states
        before and after that pull feed `select_exp` and
        `proximaly_close_actions`. The `pi` stored in the returned `Info` is
        then written in place: the columns following the action columns receive
        the low-level log-probabilities and actions.
        """
        a = pi[:, :pi.shape[1]//3].clone()

        self.ll_ctrl.set_goals(a.clone())
        base_states = self.ll_ctrl.get_info().states.clone()

        (log_prob, _, _, _, _, ll_actions, _, good), acu_reward = next(self.lowlevel)

        next_states = self.ll_ctrl.get_info().states.clone()

        goods = select_exp(base_states, next_states)
        actions = self.proximaly_close_actions(a, pi, base_states, next_states, goods)

        self.einfo = self._state(
            self.ll_ctrl.get_info(), 
            base_states,
            acu_reward, actions, goods, 
            pi,
            self.learn_mode, reset=False)

        self.einfo.pi[:, actions.shape[-1]:actions.shape[-1]+ll_actions.shape[-1]*2] = torch.cat([ # TODO[ : KICK OFF
            log_prob,#torch.ones_like(log_prob ),
            ll_actions], 1)#torch.ones_like(actionsZ) ], 1)

        return self.einfo

    def proximaly_close_actions(self, a, actions, base_states, next_states, goods):
        """Replace some rows of `a` with goals sampled by `ll_ctrl.optimal_goals`.

        Outside learn mode `a` is returned untouched. Otherwise a Normal is
        built from the second and third column groups of `actions` (each as
        wide as `a`); a row is replaced when the sampled goal's mean
        log-probability exceeds that of `a` scaled by `2 - ls.get_ls()`.
        `a` is modified in place and returned. The schedule `self.ls` is
        advanced only when something was replaced and the early return below
        (more than half replaced, and any `goods` set) did not trigger.
        """
        self.total += len(a)

        if not self.learn_mode:
            return a

        pi = Normal(actions[:, a.shape[1]: a.shape[1]*2], actions[:, a.shape[1]*2:])
        # Note: optimal_goals overwrites the trailing goal columns of base_states in place.
        og = self.ll_ctrl.optimal_goals(base_states, next_states)

        baseline = pi.log_prob(a).mean(1) * (1. + 1. - self.ls.get_ls())

        idx = pi.log_prob(og).mean(1) > baseline
        if not sum(idx):
            return a

        a[idx > 0] = og[idx > 0]
        self.probed += sum(idx)

        if sum(idx) * 2 > len(og) and sum(goods):
            return a

        self.ls()    
        return a

In [None]:
import hashlib
import logging
import time

timebudget.set_quiet()
env_start = time.time()
env_counter = 0

# Fingerprint config.py so the log name identifies the configuration used.
# The file is read inside a `with` block so the handle is closed right away.
algo = hashlib.md5()
with open("config.py", "rb") as config_binary:
    algo.update(config_binary.read())

LOG_NAME = "stat_"+config.PREFIX+"_"+algo.hexdigest()+"%i"%time.time()
def log_info():
    """Return the name of the log file used by this run."""
    return LOG_NAME

logging.basicConfig(filename=LOG_NAME, level=logging.DEBUG)
# Record the full configuration text at the top of the log.
with open("config.py", "r") as config_text:
    logging.info(config_text.read())

def callback(bot, task, test_scores, learn_scores, seeds, total):
    """Per-round hook passed to `env.start`.

    Counts every invocation. When `test_scores` is not None it logs and prints
    the elapsed minutes, the test scores and the mean of `learn_scores`, then
    stops the run once the counter exceeds `config.TOTAL_ROUNDS`.
    `bot`, `task`, `seeds` and `total` are accepted but not used.
    """
    global env_start, env_counter
    env_counter += 1
    if test_scores is None:
        return
    msg = ("\n\t [", env_counter, "] < %.2f"%((time.time()-env_start) / 60), "min > TEST ==> ", test_scores, "exploring score:", learn_scores.mean())
    logging.info(msg)
    print(*msg)
    print("debug info ->", LOG_NAME)
    if env_counter > config.TOTAL_ROUNDS:
        # NOTE(review): whether the two lines after exit() run depends on how
        # `exit` is bound in the host (the site builtin raises SystemExit, while
        # an IPython kernel may defer the shutdown) - confirm the intended order.
        exit()
        import os
        os.system("sh looper.sh")

from head import install_highlevel
#from dlhppo import HighLevelCtrlTask

print("\nSTART$log is in : ", log_info())

# The same key is passed to the task constructor and to install_highlevel.
KEYID = config.PREFIX+"_hl"
high_level_task = HighLevelCtrlTask("mujoco_dock", KEYID)

# Install the high-level task and start the run; `callback` is handed to
# env.start as the per-round hook.
env, task = install_highlevel(high_level_task, KEYID)
scores = env.start(task, callback)

print("\nDONE$log is in : ", log_info())

In [None]:
# Exhaust the evaluation iterator; the yielded items themselves are discarded.
for _ in env.evaluate(task, None):
    pass

In [None]:
# Print one specific log file. NOTE(review): the file name is hard-coded, so a
# fresh run (which writes to a new LOG_NAME) will not be shown by this cell.
!cat stat__v2_1010_Reacher_OWNSTUFF_6e25cc3b03badf144c111d3ae287c6cd1624905322