In [3]:
%load_ext autoreload
%autoreload 2
%tb

NameError: name 'tf' is not defined

In [4]:
import pickle
import time

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer
from tf2rl.algos.sac_discrete import SACDiscrete

from CustomTrainer import Trainer
from VariationalAutoEncoder import VAE

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [90]:
parser = Trainer.get_argument()
parser.add_argument("-f")
parser = SACDiscrete.get_argument(parser)
parser.set_defaults(test_interval=2000)
parser.set_defaults(max_steps=6000)
parser.set_defaults(gpu=-1)
parser.set_defaults(n_warmup=500)
parser.set_defaults(batch_size=32)
parser.set_defaults(memory_capacity=int(6000))
parser.set_defaults(test_episodes=int(25))
args = parser.parse_args()

env = gym.make("CartPole-v0")
test_env = gym.make("CartPole-v0")
policy = SACDiscrete(
    state_shape=env.observation_space.shape,
    action_dim=env.action_space.n,
    discount=0.99,
    gpu=args.gpu,
    memory_capacity=args.memory_capacity,
    batch_size=args.batch_size,
    n_warmup=args.n_warmup)
trainer = Trainer(policy, env, args, test_env=test_env)

output_dir = trainer._output_dir

In [6]:
#uncomment to get initial dataset

# trainer()
# pickle.dump(trainer.replay_buffer, open("./CARTPOLE_SAC_6000_RB2.pkl", "wb"))

AttributeError: 'Trainer' object has no attribute 'replay_buffer'

In [13]:
def load_dataset(path):
    replay_buffer = pickle.load(open(path, "rb"))
    test = replay_buffer.transform(replay_buffer.storage)
    trb = test
    trb = np.concatenate((trb["obs"], trb["act"], trb["next_obs"], trb["rew"], trb["done"]), axis=1)
    ds = tf.data.Dataset.from_tensor_slices(trb).shuffle(trb.shape[0]).batch(32)
    return ds,trb, replay_buffer

In [14]:
train_dataset, train_ori, replay_buffer = load_dataset("./CARTPOLE_SAC_6000_RB.pkl")
test_dataset, test_ori, _ = load_dataset("./CARTPOLE_SAC_6000_RB2.pkl")

In [15]:
ds_shape = train_dataset.element_spec.shape

In [72]:

latent_dim = 3
kl_importance = 0.01

encoder_layers = [64, 64, 128, 256]
decoder_layers = encoder_layers[::-1]

In [73]:
encoder = tf.keras.Sequential([
    InputLayer(input_shape=ds_shape[1:]),
    *[Dense(x) for x in encoder_layers],
    Dense(2*latent_dim),
])

In [74]:
decoder = tf.keras.Sequential([
    InputLayer(input_shape=(latent_dim,)),
    *[Dense(x) for x in decoder_layers],
    Dense(ds_shape[-1])
])

In [75]:
vae = VAE(latent_dim, lambda *args: encoder, lambda *args: decoder, kl_importance=kl_importance)

In [76]:
from IPython import display
epochs = 100
writer = tf.summary.create_file_writer(f"{output_dir} {latent_dim} {kl_importance}")
writer.set_as_default()
for epoch in range(1, epochs + 1):
  start_time = time.time()
  for train_x in train_dataset:
    vae.compute_apply_gradients(train_x)
  end_time = time.time()
  tf.summary.scalar("VAE/train/r_loss", vae.r_loss.result(), step=epoch)
  tf.summary.scalar("VAE/train/kl_loss", vae.kl_loss.result(), step=epoch)
  tf.summary.scalar("VAE/train/total_loss", vae.total_loss.result(), step=epoch)
  vae.reset_metrics()

  if epoch % 1 == 0:
    loss = tf.keras.metrics.Mean("validation_loss")    
    for test_x in test_dataset:
        loss(vae.compute_loss(test_x))
    tf.summary.scalar("VAE/test/r_loss", vae.r_loss.result(), step=epoch)
    tf.summary.scalar("VAE/test/kl_loss", vae.kl_loss.result(), step=epoch)
    tf.summary.scalar("VAE/test/total_loss", vae.total_loss.result(), step=epoch)
    elbo = -loss.result()
    display.clear_output(wait=False)
    print('Epoch: {}, Test set ELBO: {}, '
          'time elapse for current epoch {}'.format(epoch,
                                                    elbo,
                                                    end_time - start_time))
    vae.reset_metrics()
    

Epoch: 100, Test set ELBO: -0.15323780477046967, time elapse for current epoch 0.392916202545166


In [82]:
def get_next_state(state, action, env):
    env.reset()    
    env.state = state
    ns, reward, done, _ = env.step(action)
    return ns, reward, done

def m2e(a,b):
    return ((a-b)**2).mean(axis=0)


def parse_output_vae(o_vae):
    o_vae = o_vae.numpy()
    state = o_vae[:,0:4]
    action = np.round(np.clip(o_vae[:,4],0,1)).astype("int32").reshape((-1, 1))
    next_obs = o_vae[:,5:9]
    rew = o_vae[:,9].reshape((-1, 1))
    done = np.round(np.clip(o_vae[:,10], 0, 1)).astype("int32").reshape((-1, 1))
    return {"obs":state, "act":action, "next_obs": next_obs, "rew": rew, "done":done}

In [83]:
def validate_correctness(ds, env):    
    mean, logvar = vae.encode(ds)
    eps = vae.reparameterize(mean, logvar)
    decoded = vae.decode(eps)  
    
    state, action, next_obs, rew, done = parse_output_vae(decoded).values() 
    
    n = len(state)
    obs_err = 0
    rew_err = 0
    done_err = 0
    
    for i in range(n):
        cns, crew, cdone = get_next_state(state[i], action[i][0], env)   
    
        obs_err += m2e(cns, next_obs)
        rew_err += m2e(rew, crew)
        done_err += m2e(done, cdone)
    
    return obs_err/n, rew_err/n, done_err/n



In [84]:
validate_correctness(test_ori, env)

(array([0.11263909, 0.33045487, 0.00410763, 0.40680213]),
 array([1.7419483e-05], dtype=float32),
 array([0.]))

In [85]:
replaytrainer = Trainer(policy, env, args, test_env=test_env, \
                        custom_replay_buffer=replay_buffer, retrieve_new_exp=False)

In [86]:
replaytrainer()



00:31:35.862 [INFO] (CustomTrainer.py:146) Evaluation Total Steps:    2000 Average Reward  82.7600 over  25 episodes


KeyboardInterrupt: 

In [94]:
from util import VAEReplayJoiner

def vae_replay_buffer(samples):
    o_vae = vae.sample(tf.random.normal(shape=(samples, vae.latent_dim)))
    return parse_output_vae(o_vae)

vaeandreplay = VAEReplayJoiner(vae_replay_buffer, replay_buffer, 0.5)

In [95]:
replaytrainer = Trainer(policy, env, args, test_env=test_env, \
                        custom_replay_buffer=vaeandreplay, retrieve_new_exp=False)


ValueError: Expected `model` argument to be a functional `Model` instance, but got a subclass model instead.

In [96]:
replaytrainer()


00:38:12.764 [INFO] (CustomTrainer.py:146) Evaluation Total Steps:    2000 Average Reward  9.2000 over  25 episodes
00:38:25.586 [INFO] (CustomTrainer.py:146) Evaluation Total Steps:    4000 Average Reward  9.0800 over  25 episodes
00:38:38.351 [INFO] (CustomTrainer.py:146) Evaluation Total Steps:    6000 Average Reward  9.2000 over  25 episodes


In [97]:
class Random(tf.keras.Model):
    def __init__(self, env):
        self.env = env
        self.policy_name = "Random"
        self.memory_capacity = args.memory_capacity
        self.n_warmup=args.n_warmup
        self.update_interval = 5000
        self.batch_size = args.batch_size
        
    def get_action(self, *args, **kwargs):
        return self.env.action_space.sample()
    
    def train(self, *args, **kwargs):
        pass


In [98]:
replaytrainer = Trainer(Random(env), env, args, test_env=test_env)

In [99]:
replaytrainer()



21:29:56.126 [INFO] (CustomTrainer.py:111) Total Epi:     1 Steps:      12 Episode Steps:    12 Return:  13.0000 FPS: 25118.84
21:29:56.129 [INFO] (CustomTrainer.py:111) Total Epi:     2 Steps:      57 Episode Steps:    45 Return:  45.0000 FPS: 42464.57
21:29:56.131 [INFO] (CustomTrainer.py:111) Total Epi:     3 Steps:      89 Episode Steps:    32 Return:  32.0000 FPS: 44977.36
21:29:56.133 [INFO] (CustomTrainer.py:111) Total Epi:     4 Steps:     163 Episode Steps:    74 Return:  74.0000 FPS: 49207.82
21:29:56.135 [INFO] (CustomTrainer.py:111) Total Epi:     5 Steps:     192 Episode Steps:    29 Return:  29.0000 FPS: 45068.29
21:29:56.137 [INFO] (CustomTrainer.py:111) Total Epi:     6 Steps:     221 Episode Steps:    29 Return:  29.0000 FPS: 34626.54
21:29:56.139 [INFO] (CustomTrainer.py:111) Total Epi:     7 Steps:     236 Episode Steps:    15 Return:  15.0000 FPS: 30207.89
21:29:56.141 [INFO] (CustomTrainer.py:111) Total Epi:     8 Steps:     265 Episode Steps:    29 Return:  29.000