In [None]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# before each cell runs (no kernel restart needed while iterating on the
# project modules imported by this notebook).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os

import gymnasium as gym
import numpy as np

import matplotlib.pyplot as plt

import utils
import agents
import models
import datasets
import optimizers

In [None]:
# Fix the random seed (42) through the project helper. Which RNGs it covers is
# defined in utils and not visible here. NOTE(review): the gym environment is
# created in a later cell, so confirm whether it is seeded anywhere.
utils.set_random_seed(42)

In [None]:
# File locations under the project's model and experiment directories.
# NOTE(review): neither path is referenced again in the visible notebook —
# confirm whether saving the model / dataset was intended.
model_path = os.path.join(utils.MODEL_DIR, 'test.pt')
data_path = os.path.join(utils.EXP_DIR, 'test.pkl')

In [None]:
# Environment used for both the random data collection and the online training.
env_name = 'CartPole-v1'
env = gym.make(env_name)

# Gymnasium environments and spaces carry their own random generators, and this
# env is created after utils.set_random_seed was called, so seed it explicitly
# to make rollouts repeatable. Seeding once is enough: later reset() calls
# without a seed keep drawing from the same generator.
# NOTE(review): kept equal to the value passed to utils.set_random_seed;
# harmless if utils.run_episode already reseeds on its own (not visible here).
env_seed = 42
env.action_space.seed(env_seed)
# Trailing semicolon keeps the (observation, info) tuple out of the cell output.
env.reset(seed=env_seed);

In [None]:
# Ask the project helper for the environment's dimensions. Going by the
# unpacked names these are the observation size, the action size and a flag for
# a discrete action space — exact semantics live in utils.get_env_dims.
n_obs_dims, n_act_dims, discrete = utils.get_env_dims(env)

Collect random trajectories to train the initial model.

In [None]:
# Agent used for the initial data collection. Note that `agent` is rebound to
# the UDRL neural-process agent further down the notebook.
agent = agents.RandomAgent(env)

In [None]:
# Fresh experience dataset: filled by the collection loop, then handed to the
# optimizer and to the online agent.
dataset = datasets.ExpDataset()

In [None]:
# Roll out the current (random) agent, store every trajectory in the dataset
# and report the per-episode sum of element 2 of each step
# (presumably the reward slot — confirm against utils.run_episode).
n_episodes = 10
for episode in range(n_episodes):
    traj = utils.run_episode(
        env,
        agent,
        max_n_steps=1000,
        render=False,
    )
    dataset.put(traj)

    step_values = [step[2] for step in traj]
    episode_return = sum(step_values)
    print(episode, episode_return)

Train the initial model on the random trajectories.

In [None]:
# Build the UDRL neural-process model from the environment dimensions.
# The embedding and prediction layer settings both request 2 layers of size 64;
# they are passed as two separate dicts so the model never shares one object.
model = models.UDRLNeuralProcess(
    n_obs_dims=n_obs_dims,
    n_act_dims=n_act_dims,
    n_emb_dims=32,
    discrete=discrete,
    emb_layer_kwargs=dict(n_layers=2, layer_size=64),
    pred_layer_kwargs=dict(n_layers=2, layer_size=64),
)

In [None]:
# Optimizer bound to the model and the experience dataset; batches of 256 and
# a learning rate of 1e-3 forwarded through opt_kwargs.
optimizer = optimizers.UDRLNeuralProcessOptimizer(
    model, dataset, batch_size=256, opt_kwargs=dict(lr=1e-3)
)

In [None]:
# Split the collected data before training — presumably into train/validation
# subsets; confirm in datasets.ExpDataset.split.
dataset.split()

In [None]:
# Reset the model before the initial training run — presumably re-initialises
# its parameters/state; confirm in models.UDRLNeuralProcess.reset.
model.reset()

In [None]:
# Initial training: a single epoch through the optimizer built above, with
# verbose output enabled.
optimizer.train(n_epochs=1, verbose=True)

Train the agent online.

In [None]:
# Replace the random agent with the learned one. It receives the model, the
# dataset and the optimizer, plus the settings for its own training calls
# (train_freq=1, one silent epoch per call).
agent = agents.UDRLNeuralProcessAgent(
    model,
    dataset,
    optimizer,
    train_freq=1,
    train_kwargs=dict(n_epochs=1, verbose=False),
    warm_start=True,
    max_n_embs=1024,
)

In [None]:
# Put the agent into training mode before the online loop — presumably this is
# what enables its train_freq-driven updates inside run_episode; confirm in
# agents.UDRLNeuralProcessAgent.
agent.training = True

In [None]:
# Online phase: run many episodes with the learned agent.
# This loop does not store trajectories itself; presumably the agent records
# them into `dataset` and retrains while `agent.training` is set — confirm in
# agents.UDRLNeuralProcessAgent.
n_episodes = 100000
# Printing every episode would leave 100000 lines in the notebook output, so
# progress is reported only every `log_every` episodes (and for the last one).
log_every = 100
for i in range(n_episodes):
    traj = utils.run_episode(env, agent, max_n_steps=1000, render=False)
    # Episode return: sum of element 2 of every step
    # (presumably the reward slot — confirm against utils.run_episode).
    rtn = sum(x[2] for x in traj)
    if i % log_every == 0 or i == n_episodes - 1:
        print(i, rtn)

In [None]:
# Learning curve: the dataset's per-trajectory returns passed through the
# project's smoothing helper (win=10), drawn on an explicit figure/axes pair.
smoothed_returns = utils.smooth(dataset.traj_returns, win=10)

fig, ax = plt.subplots()
ax.plot(smoothed_returns)
ax.set_title(env_name)
ax.set_xlabel('Number of Training Trajectories')
ax.set_ylabel('Return')
plt.show()