In [None]:
from env.balancebot_env import BalancebotEnv
from stable_baselines import PPO2
from stable_baselines.common.policies import ActorCriticPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.bench import Monitor
from stable_baselines.a2c.utils import linear

import os
import time
import torch
import tensorflow as tf
import numpy as np

In [None]:
# Per-run log directory, named after the current Unix time in whole seconds.
# Both the Monitor files and the TensorBoard logs are written beneath it.
log_dir = f"/tmp/gym/{int(time.time())}"
os.makedirs(log_dir, exist_ok=True)

In [None]:
# Create the environment
def make_env(rank):
    """Return a zero-argument factory that builds one monitored environment.

    The factory creates a ``BalancebotEnv`` with rendering disabled and wraps
    it in a ``Monitor`` that is given the path ``<log_dir>/<rank>``.
    ``log_dir`` is looked up when the factory is called, not when it is made.

    :param rank: identifier of the worker; used as the Monitor file name.
    :return: a callable taking no arguments and returning the wrapped env.
    """
    def _build():
        return Monitor(BalancebotEnv(render=False),
                       os.path.join(log_dir, str(rank)))
    return _build

# Number of environment copies to run side by side (one factory per rank).
num_cpu = 16
env_factories = [make_env(rank=worker_rank) for worker_rank in range(num_cpu)]
env = SubprocVecEnv(env_factories)


In [None]:
def const_linear(input_tensor, scope, np_w, np_b):
    """Apply a fixed affine layer ``input_tensor @ np_w + np_b``.

    The weight and bias are embedded in the graph with ``tf.constant`` (named
    ``w`` and ``b`` inside ``scope``) and cast to the dtype of ``input_tensor``.

    :param input_tensor: tensor whose static shape has the feature size at
        index 1.
    :param scope: name of the variable scope to create the constants in.
    :param np_w: weight array; ``shape[0]`` must equal the input feature size.
    :param np_b: bias array; ``shape[0]`` must equal ``np_w.shape[1]``.
    :return: the result of the matmul plus bias.
    :raises AssertionError: if the weight or bias shape is incompatible.
    """
    static_shape = input_tensor.get_shape().as_list()
    feature_dim = static_shape[1]
    assert feature_dim == np_w.shape[0], \
        f"incompatible weight shape: {static_shape} and {np_w.shape}"
    out_dim = np_w.shape[1]
    assert out_dim == np_b.shape[0], \
        f'incompatible bias shape: {np_w.shape} and {np_b.shape}'
    target_dtype = input_tensor.dtype
    with tf.variable_scope(scope):
        weights = tf.constant(np_w, name='w', dtype=target_dtype)
        bias = tf.constant(np_b, name='b', dtype=target_dtype)
        projected = tf.matmul(input_tensor, weights)
        return projected + bias

In [None]:
# Create the RL agent
class CustomPolicy(ActorCriticPolicy):
    """Actor-critic policy whose first policy and value layers are fixed.

    The processed observation is flattened and fed to two separate first
    layers, ``pi_fc0`` and ``vf_fc0``, which ``const_linear`` builds from NumPy
    arrays stored on the class (graph constants, not variables). Each branch
    then passes through the ``linear`` layers listed in ``var_layers``; every
    hidden layer uses ``tanh``. The value head is a single ``linear`` unit.

    ``CustomPolicy.custom_set_weights`` must be called before the policy is
    instantiated; ``__init__`` raises ``ValueError`` otherwise.
    """

    # Fixed first-layer parameters, installed through ``custom_set_weights``.
    np_pi_w = None
    np_pi_b = None
    np_vf_w = None
    np_vf_b = None
    # Output sizes of the ``linear`` hidden layers stacked after the fixed layer.
    var_layers = [16]

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False):
        """Build the policy and value networks.

        The leading arguments and ``reuse`` are forwarded unchanged to
        ``ActorCriticPolicy.__init__``; ``reuse`` is also applied to the
        ``'model'`` variable scope.

        :raises ValueError: if any of the fixed weights has not been set.
        """
        # Fail fast, before any graph construction, when weights are missing.
        fixed_params = (self.np_pi_w, self.np_pi_b, self.np_vf_w, self.np_vf_b)
        if any(param is None for param in fixed_params):
            raise ValueError('Run CustomPolicy.custom_set_weights first before using this CustomPolicy')
        super(CustomPolicy, self).__init__(sess, ob_space,
                                           ac_space, n_env, n_steps,
                                           n_batch, n_lstm=256,
                                           reuse=reuse, scale=False)
        with tf.variable_scope('model', reuse=reuse):
            activ = tf.tanh
            processed_x = tf.layers.flatten(self.processed_x)
            # Fixed (constant) first layer for each branch.
            pi_h = activ(const_linear(processed_x, 'pi_fc0', self.np_pi_w, self.np_pi_b))
            vf_h = activ(const_linear(processed_x, 'vf_fc0', self.np_vf_w, self.np_vf_b))
            # Layers built with ``linear``; numbering starts at 1 because index 0
            # is the fixed layer above.
            for i, layer_size in enumerate(self.var_layers, 1):
                pi_h = activ(linear(pi_h, 'pi_fc' + str(i), n_hidden=layer_size, init_scale=np.sqrt(2)))
                vf_h = activ(linear(vf_h, 'vf_fc' + str(i), n_hidden=layer_size, init_scale=np.sqrt(2)))
            value_fn = linear(vf_h, 'vf', 1)
            pi_latent = pi_h
            vf_latent = vf_h

        self.proba_distribution, self.policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01)

        self.value_fn = value_fn
        self.initial_state = None
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        """Run the policy on ``obs``.

        ``state`` and ``mask`` are accepted but not used.

        :param obs: observations fed to ``self.obs_ph``.
        :param deterministic: if true, evaluate ``self.deterministic_action``
            instead of ``self.action``.
        :return: ``(action, value, self.initial_state, neglogp)``;
            ``initial_state`` is ``None`` for this policy.
        """
        action_op = self.deterministic_action if deterministic else self.action
        action, value, neglogp = self.sess.run([action_op, self._value, self.neglogp],
                                               {self.obs_ph: obs})
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        """Evaluate ``self.policy_proba`` on ``obs`` (``state``/``mask`` unused)."""
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        """Evaluate ``self._value`` on ``obs`` (``state``/``mask`` unused)."""
        return self.sess.run(self._value, {self.obs_ph: obs})

    @classmethod
    def custom_set_weights(cls, np_pi_w, np_pi_b, np_vf_w, np_vf_b, var_layers=None):
        """Install the fixed first-layer weights on the class.

        The values are stored as class attributes, so they apply to every
        instance created afterwards.

        :param np_pi_w: weight array for the ``pi_fc0`` layer.
        :param np_pi_b: bias array for the ``pi_fc0`` layer.
        :param np_vf_w: weight array for the ``vf_fc0`` layer.
        :param np_vf_b: bias array for the ``vf_fc0`` layer.
        :param var_layers: optional sizes of the hidden layers after the fixed
            one. ``None`` keeps the current setting; an empty sequence means
            no additional hidden layers.
        """
        cls.np_pi_w = np_pi_w
        cls.np_pi_b = np_pi_b
        cls.np_vf_w = np_vf_w
        cls.np_vf_b = np_vf_b
        # ``is not None`` (rather than truthiness) lets callers pass [] on purpose.
        if var_layers is not None:
            # Copy so later mutation of the caller's list cannot alter the class.
            cls.var_layers = list(var_layers)

In [None]:
# Put your PyTorch weights (converted to NumPy) here.
# The placeholders below come from a seeded generator so they are identical on
# every run. CustomPolicy embeds these arrays as graph constants, so a policy
# rebuilt later (e.g. after a kernel restart, before PPO2.load) needs exactly
# the same values.
# NOTE(review): the input size 4 is hard-coded — confirm it matches the
# environment's observation dimension.
_placeholder_rng = np.random.RandomState(0)
CustomPolicy.custom_set_weights(_placeholder_rng.rand(4, 32), _placeholder_rng.rand(32),
                                _placeholder_rng.rand(4, 32), _placeholder_rng.rand(32))

In [None]:
# Build the PPO2 agent on the vectorized env; TensorBoard logs go under log_dir.
tensorboard_dir = log_dir + "/tensorboard"
model = PPO2(policy=CustomPolicy, env=env, verbose=1, tensorboard_log=tensorboard_dir)

In [None]:
# Length of the first axis of the environment's observation space.
observation_dim = env.observation_space.shape[0]

In [None]:
# Random stand-in for the encoder weights, shaped (observation_dim, 32) and
# converted to a NumPy array.
encoder_out_dim = 32
weight_encoder = torch.rand(observation_dim, encoder_out_dim, requires_grad=False).numpy()

In [None]:
# Display the encoder weight shape; expected to be (observation_dim, 32).
weight_encoder.shape

## How do I put the encoder weights into the model?
## I want to load them into 'pi_fc0' and 'vf_fc0' and make those layers untrainable.
 

![Network](assets/network.png)

In [None]:
# Train the agent, then save it so the later loading cell can restore "ppo_save".
# total_timesteps is passed as an int (1e3 is a float literal).
# NOTE(review): 1000 timesteps is very small next to 16 parallel envs — confirm
# that at least one PPO2 update actually runs with this budget.
model.learn(total_timesteps=int(1e3), tb_log_name="PPO2")
model.save("ppo_save")

In [None]:
# Display the TensorBoard log directory for this run (with a trailing slash).
log_dir+"/tensorboard/"

## You can open TensorBoard from a terminal.
## For example (replace `<log_dir>` with the path shown above):
### tensorboard --logdir <log_dir>/tensorboard

In [None]:
# Delete the trained model so that the loading step that follows is shown to
# restore the agent from disk rather than reuse the in-memory object.
del model 

In [None]:
# Create the evaluation env.
# Close the training env first: rebinding `env` without closing it would leave
# the SubprocVecEnv worker processes running.
env.close()
env = DummyVecEnv([lambda: BalancebotEnv(render=False)])

In [None]:
# Load the trained agent from "ppo_save" and attach the evaluation env.
# NOTE(review): CustomPolicy builds its fixed first layers from class-level
# arrays, so CustomPolicy.custom_set_weights presumably has to be called with
# the same weights used for training before loading — confirm that those
# constants are not stored in the save file.
model = PPO2.load("ppo_save", env=env, policy=CustomPolicy)

In [None]:

# Enjoy the trained agent: roll out 10 episodes with the loaded model.
for ep in range(10):
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # `dones` comes from the vectorized env; with the single env built for
        # evaluation its truthiness marks the end of the episode.
        if dones:
            break