In [2]:
import minari
import minari.dataset
import numpy as np
import os
import gym
import glfw
import torch
from collections import OrderedDict
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from copy import copy

In [3]:
def get_ret_to_go(episode, max_score):
    ret_to_go = copy(episode.rewards)
    for i in range(1, len(ret_to_go)):
        ret_to_go[:-i] += episode.rewards[i:]
    ret_to_go /= max_score # normalize
    return ret_to_go

class HistCNNDataset(torch.utils.data.Dataset):
    def __init__(self, dataset: minari.MinariDataset, max_score, tdim):
        self.tdim = tdim
        self.max_score = max_score
        self.m_dataset = dataset
        print("Observation Shape: ", dataset.spec.observation_space.shape)
        print("Actions Shape: ", dataset.spec.action_space.shape)
        print("RTG Shape: ", 1)
        print("Net feed shape: ", 1 + dataset.spec.observation_space.shape[0] + dataset.spec.action_space.shape[0])

        observations = []
        actions = []

        for episode in dataset.iterate_episodes():
            observations.append(episode.observations)
            actions.append(episode.actions)

        observations = np.concatenate(observations)
        actions = np.concatenate(actions)
        self.max_obs = np.max(observations, axis=0, keepdims=True)
        self.max_act = np.max(actions, axis=0, keepdims=True)
        self.min_obs = np.min(observations, axis=0, keepdims=True)
        self.min_act = np.min(actions, axis=0, keepdims=True)
        print("Max obs: ", self.max_obs)
        print("Min obs: ", self.min_obs)
        print("Max act: ", self.max_act)
        print("Min act: ", self.min_act)

        obs_dim = dataset.spec.observation_space.shape[0]
        self.action_dim = dataset.spec.action_space.shape[0]
        ret_dim = 1
        self.net_feed_dim = obs_dim + self.action_dim + ret_dim

        self.lengths = []
        self.ixds = []
        self.pp_dataset = []
        self.labels = []
        for episode in dataset.iterate_episodes():
            len_eps = episode.total_timesteps+1
            net_feed = np.zeros(shape=(len_eps+tdim-1,self.net_feed_dim))
            # Preprocess the data
            pp_obs = (episode.observations - self.min_obs) / (self.max_obs - self.min_obs)
            pp_act = ((episode.actions - self.min_act) / (self.max_act - self.min_act))
            rtg = get_ret_to_go(episode, max_score)
            # Fill in the data
            net_feed[tdim-1:, :obs_dim] = pp_obs
            net_feed[tdim:, obs_dim:obs_dim+self.action_dim] = pp_act
            net_feed[tdim-1:-1, -1] = rtg
            # Fill in the labels
            self.labels.append(pp_act)
            # Drop the last timestep because we don't have an action label for it
            net_feed = net_feed[:-1]
            len_eps -= 1
            self.lengths.append(len_eps)
            if not self.ixds:
                self.ixds.append(list(range(len_eps)))
            else:
                self.ixds.append([self.ixds[-1][-1]+1 + i for i in range(len_eps)])
            self.pp_dataset.append(net_feed)

    def __len__(self):
        return sum(self.lengths)
    
    def __getitem__(self, idx):
        episode_idx = 0
        while True:
            if idx in self.ixds[episode_idx]:
                break
            episode_idx += 1
        episode_internal_idx = self.ixds[episode_idx].index(idx)
        return self.pp_dataset[episode_idx][episode_internal_idx:episode_internal_idx+self.tdim], self.labels[episode_idx][episode_internal_idx]

    def make_obs(self, obs, act, ret_to_go):
        if type(obs) == np.ndarray:
            obs = torch.tensor(obs)
        if type(act) == np.ndarray:
            act = torch.tensor(act)
        obs = (obs - self.min_obs) / (self.max_obs - self.min_obs)
        rtg = ret_to_go / self.max_score
        return torch.concatenate([obs, act, torch.tensor([[rtg]], dtype=torch.float32)], dim=1)

    def unwrap_action(self, NN_action):
        return NN_action * (self.max_act - self.min_act) + self.min_act

In [4]:
class HistoryCNN(torch.nn.Module):
    def __init__(self, tdim, obs_dim, act_dim, num_filters):
        super(HistoryCNN, self).__init__()
        assert tdim >= 2
        assert obs_dim > 0
        assert num_filters > 0
        
        self.act_dim = act_dim
        self.obs_dim = obs_dim
        self.num_filters = num_filters
        self.tdim = tdim

        ksize = 2
        kernel_sizes = [ksize]
        dim = tdim - ksize + 1
        ksize += 1
        while dim >= ksize:
            kernel_sizes.append(kernel_sizes[-1] + 1)
            dim -= kernel_sizes[-1] - 1
            ksize += 1

        if dim > 1:
            kernel_sizes.append(dim)

        print("kernel sizes: ", kernel_sizes)
        modules = []
        for i, ksize in enumerate(kernel_sizes):
            if i == 0:
                modules.append((f"conv {i+1}", torch.nn.Conv1d(in_channels=obs_dim, out_channels=num_filters, kernel_size=ksize)))
            else:
                modules.append((f"conv {i+1}", torch.nn.Conv1d(in_channels=num_filters, out_channels=num_filters, kernel_size=ksize)))
            if i != len(kernel_sizes) - 1:
                modules.append((f"bnorm {i+1}", torch.nn.BatchNorm1d(num_filters)))
            modules.append((f"relu {i+1}", torch.nn.ReLU()))
        
        modules.append(("flatten", torch.nn.Flatten()))
        modules.append(("fc", torch.nn.Linear(num_filters, act_dim)))
        modules.append(("tanh", torch.nn.Tanh()))
        self.conv = torch.nn.Sequential(OrderedDict(modules))
    
    def forward(self, x):
        return self.conv(x)


In [37]:
def eval_model(model: HistoryCNN, train_dataset: HistCNNDataset, target_rtg: float, n_test_runs: int=10, device='cpu', render=False):
    model.to(device)
    m_dataset = train_dataset.m_dataset
    if render:
        assert n_test_runs == 1
        env = gym.make(m_dataset._data.env_spec.id, render_mode='human')
        glfw.init()
    else:
        env = gym.make(m_dataset._data.env_spec.id)
    n_scores = []
    for seed in range(n_test_runs):
        raw_obs, _ = env.reset(seed=seed+12341234)
        last_action = torch.zeros(1, model.act_dim)
        running_obs = torch.zeros(1, model.tdim, model.obs_dim)
        pp_obs = train_dataset.make_obs(raw_obs, last_action, target_rtg)
        running_obs[0, -1] = pp_obs
        model.eval()
        done = False

        ret = 0
        while not done:
            if render:
                env.render()
            with torch.no_grad():
                action = model(running_obs.transpose(1,2))
                last_action = copy(action)
                action = train_dataset.unwrap_action(action)
            
            raw_obs, reward, ter, trunc, _ = env.step(action.squeeze().numpy())
            done = ter or trunc
            target_rtg -= reward
            ret += reward
            pp_obs = train_dataset.make_obs(raw_obs, last_action, target_rtg)
            running_obs[0, :-1] = running_obs[0, 1:].clone()
            running_obs[0, -1] = pp_obs
        n_score = minari.get_normalized_score(m_dataset, ret)
        n_scores.append(n_score)
    env.close()
    glfw.terminate()
    return np.average(n_scores), np.std(n_scores), n_scores

In [45]:
def train_model(model, train_loader, device, n_epochs=100):
    # Initialize TensorBoard SummaryWriter
    log_dir = "runs/experiment"
    writer = SummaryWriter(log_dir=log_dir)
    net = model

    # Set up loss criterion, optimizer, and other configurations
    criterion = torch.nn.MSELoss().to(device).float()  # Ensure the criterion is in float32
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
    epochs = n_epochs
    best_loss = float('inf')  # Initialize with a high value

    # Convert model to float32 and move to device
    net = net.to(device).float()

    # Create directory to save models
    model_dir = "models"
    os.makedirs(model_dir, exist_ok=True)

    # Training loop with tqdm progress bar
    for epoch in range(epochs):
        running_loss = 0.0
        net = net.to(device).float()
        # Initialize the progress bar for each epoch
        with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}", unit="batch") as pbar:
            for i, data in enumerate(train_loader):
                inputs, labels = data
                
                # Convert inputs and labels to float32 and move to device
                inputs = inputs.float().transpose(1, 2).to(device)
                labels = labels.float().to(device)

                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Accumulate loss for logging
                running_loss += loss.item()

                # Update the progress bar
                pbar.set_postfix({"Loss": loss.item()})
                pbar.update(1)

                # Log to TensorBoard every 10 batches
                if i % 10 == 0:
                    writer.add_scalar('Training Loss', loss.item(), epoch * len(train_loader) + i)

        # Calculate and log average loss for the epoch
        avg_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Average Loss: {avg_loss}")
        writer.add_scalar('Average Loss per Epoch', avg_loss, epoch)

        # Save model at the end of each epoch
        torch.save(net.state_dict(), os.path.join(model_dir, f"model_epoch_{epoch+1}.pth"))

        # Check if this is the best model so far and save it if it is
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(net.state_dict(), os.path.join(model_dir, "best_model.pth"))
            print(f"New best model saved with loss {best_loss}")
            n_score = eval_model(model, train_loader.dataset, target_rtg=3600, n_test_runs=50)
            print(f"Current normalized score: {100 * n_score}/100")

    # Close the TensorBoard writer when training is done
    writer.close()


In [68]:
tdim = 10
batch_size = 64
num_filters = 2
m_dataset = dataset = minari.load_dataset("hopper-medium-v2")
max_score = 3600
device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Using device: ", device)
train_dataset = HistCNNDataset(m_dataset, max_score, tdim)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
testdata = torch.rand(1, 15, 20)
net = HistoryCNN(tdim=tdim, obs_dim=train_dataset.net_feed_dim, act_dim=train_dataset.action_dim, num_filters=num_filters).to(device)

Using device:  mps
Observation Shape:  (11,)
Actions Shape:  (3,)
RTG Shape:  1
Net feed shape:  15
Max obs:  [[ 1.729025    0.1552431   0.01719327  0.07019253  0.95734406  5.017929
   3.1744173   3.8315835   5.686337    9.226352   10.        ]]
Min obs:  [[  0.65998334  -0.19498888  -1.4958394   -1.5184437   -0.96395123
   -0.503773    -5.3455234   -3.20404     -6.4856668  -10.
  -10.        ]]
Max act:  [[0.9998843 0.9999483 0.9999945]]
Min act:  [[-0.9999679  -0.9999835  -0.99996823]]
kernel sizes:  [2, 3, 4, 4]


In [62]:
net.load_state_dict(torch.load("models/model_epoch_4.pth"))

  net.load_state_dict(torch.load("models/model_epoch_4.pth"))


<All keys matched successfully>

In [69]:
# Train model
train_model(net, train_loader, device, n_epochs=5)

Epoch 1/5:   6%|â–Œ         | 917/15625 [03:35<57:29,  4.26batch/s, Loss=0.14]   


KeyboardInterrupt: 

In [67]:
score_m, score_std, n_scores = eval_model(net, train_dataset, target_rtg=3600, n_test_runs=50, render=False)
print(f"Final normalized score: {100 * score_m:.1f}+-{100 * score_std:.1f}")

Final normalized score: 21.2+-13.4


In [66]:
score = eval_model(net, train_dataset, target_rtg=3600, n_test_runs=1, render=True)
print(f"Final normalized score: {100 * score_m:.1f}")

Creating window glfw
Final normalized score: 29.8
