In [1]:
%load_ext autoreload

# --------------- #
# region: Imports #
import os
import sys
module_path = os.path.abspath('../../..')
if module_path not in sys.path:
  sys.path.insert(0, module_path)
# endregion   #
# --------------- #

In [2]:
from examples.cleanup.env import Cleanup
from examples.cleanup.agents import Agent
from examples.RPG.utils import load_config
from gem.models.grid_cells import positional_embedding
from gem.models.human_player import ModelHumanPlayer
from gem.models.iqn import iRainbowModel
from gem.utils import visual_field, visual_field_multilayer

from IPython.display import clear_output

import argparse
import matplotlib.pyplot as plt
import random

import torch
import numpy as np

# Experiment configuration, loaded from '../configs/config.yaml'.
cfg = load_config(argparse.Namespace(config='../configs/config.yaml'))

# Number of agents placed in the environment.
N_AGENTS = 3


def _make_agent():
  """Build one Agent that owns a freshly constructed iRainbowModel.

  Every call creates a new model instance, so agents never share a network
  or a replay memory. Note that each model is constructed with the same
  seed (0).
  """
  return Agent(
    cfg,
    appearance=cfg.agent.agent.appearance,
    model=iRainbowModel(
      state_size=[cfg.env.channels, cfg.env.height, cfg.env.width],
      action_size=6,
      layer_size=128,
      epsilon=1.0,
      # A plain string is passed here; the original note suggests a
      # torch.device is accepted as well - TODO confirm.
      device='cpu',
      seed=0,
      # iRainbow parameters
      num_frames=1,
      n_step=1,
      sync_freq=10,
      model_update_freq=10,
      BATCH_SIZE=32,
      BUFFER_SIZE=4000,
      LR=.001,
      TAU=.001,
      GAMMA=.99,
      N=32,
    ),
  )


agents = [_make_agent() for _ in range(N_AGENTS)]

# The environment receives the same list object that holds the agents.
env = Cleanup(cfg, agents)

In [3]:
# Smoke test: feed one uniformly random observation through the first agent's
# local Q-network. Two singleton axes are prepended to the
# (channels, height, width) tensor before the forward pass.
obs_shape = (cfg.env.channels, cfg.env.height, cfg.env.width)
data = torch.rand(*obs_shape)[None, None]

q_network = agents[0].model.qnetwork_local
q_network.forward(data)

(tensor([[[ 0.0327, -0.3010,  0.0611,  0.0587, -0.0498, -0.1373],
          [ 0.0340, -0.2985,  0.0629,  0.0592, -0.0479, -0.1337],
          [ 0.0331, -0.3015,  0.0595,  0.0573, -0.0512, -0.1355],
          [ 0.0301, -0.3012,  0.0600,  0.0550, -0.0515, -0.1361],
          [ 0.0328, -0.2998,  0.0617,  0.0576, -0.0513, -0.1370],
          [ 0.0302, -0.3015,  0.0602,  0.0559, -0.0516, -0.1365],
          [ 0.0359, -0.2969,  0.0638,  0.0597, -0.0456, -0.1321],
          [ 0.0288, -0.3012,  0.0611,  0.0574, -0.0513, -0.1345]]],
        grad_fn=<ViewBackward0>),
 tensor([[[0.0547],
          [0.2592],
          [0.7171],
          [0.9299],
          [0.1539],
          [0.9226],
          [0.6731],
          [0.8101]]]))

In [4]:
cfg.experiment.epochs = 10000  # override the number of epochs from the config

# Tunable constants of the loop below.
EPSILON_DECAY = 0.9999      # multiplicative factor applied to each model's epsilon
EPSILON_DECAY_EVERY = 100   # the decay is applied only during every 100th epoch
TRAIN_AFTER_EPOCH = 10      # no training step while epoch <= this value
LOG_EVERY = 150             # print a progress line every this many epochs
REWARD_WINDOW = 100         # number of most recent games in the running average

rewards = []   # total reward collected by all agents, one entry per game
losses = 0     # summed training loss of the current epoch

# Terminology note: each pass of this loop plays one complete game (an
# "episode") and then runs one training step, although it is called "epoch".
for epoch in range(cfg.experiment.epochs):
    # Reset every agent at the start of the game and randomise the order in
    # which the agents act.
    for agent in env.agents:
        agent.reset()
    random.shuffle(agents)

    done = 0
    turn = 0
    losses = 0
    game_points = 0

    while not done:
        turn = turn + 1

        # Entity transition
        for entity in env.get_entities_for_transition():
            entity.transition(env)

        # Agent transition
        for agent in agents:
            # During a decay epoch this runs once per agent per turn, so the
            # total decay in that epoch scales with the length of the game.
            if (epoch + 1) % EPSILON_DECAY_EVERY == 0:
                agent.model.epsilon = agent.model.epsilon * EPSILON_DECAY

            # Get current state
            state = agent.pov(env)

            # Take action based on current state
            action = agent.model.take_action(state.unsqueeze(0))

            reward, next_state, done_ = agent.transition(env, state, action)

            # The game ends on the turn limit or when any agent reports done.
            # Agents later in this turn still act, and they record done == 1.
            if turn >= cfg.experiment.max_turns or done_:
                done = 1

            exp = (1, (state, action, reward, next_state, done))
            agent.episode_memory.append(exp)
            # TODO: decide on memory update procedures
            agent.model.memory.add(
                state.squeeze().unsqueeze(0),
                action,
                reward,
                next_state.squeeze().unsqueeze(0),
                done,
            )

            game_points += reward

    rewards.append(game_points)

    # After the first few epochs, run one training step for EVERY agent.
    # Each agent owns its own model and replay memory, so relying on the
    # `agent` variable left over from the loop above would train only
    # whichever agent happened to act last.
    if epoch > TRAIN_AFTER_EPOCH:
        for agent in agents:
            loss = agent.model.train_model()
            losses += loss

    # Average reward over the most recent games (over all games while fewer
    # than REWARD_WINDOW have been played). `rewards` is never empty here.
    recent_rewards = rewards[-REWARD_WINDOW:]
    avg_last_100_rewards = sum(recent_rewards) / len(recent_rewards)
    if epoch % LOG_EVERY == 0:
        print(f'Epoch: {epoch} - Epsilon: {agents[0].model.epsilon} - Losses {losses} - Avg. last 100 rewards: {avg_last_100_rewards}')


Epoch: 0 - Epsilon: 1.0 - Losses 0 - Avg. last 100 rewards: 18.0


KeyboardInterrupt: 