<a href="https://colab.research.google.com/github/psygrammer/pyemotion_rl/blob/master/examples/ch07_higer_level_rl_lib/07_higer_level_rl_lib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PTAN pinned to 0.6. Per the install log below, this also pulls in
# torch==1.3.0, which pip reports as incompatible with the already-installed
# torchvision and torchtext packages.
!pip install ptan==0.6

Collecting ptan==0.6
  Downloading https://files.pythonhosted.org/packages/91/cb/57f6d86625f2b24c008b0524ca29559683aa75d00afa38b6b44d7fcad25b/ptan-0.6.tar.gz
Collecting torch==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/b4/0b/9d33aef363b6728ad937643d98be713c6c25d50ce338678ad57cee6e6fd5/torch-1.3.0-cp37-cp37m-manylinux1_x86_64.whl (773.1MB)
[K     |████████████████████████████████| 773.1MB 22kB/s 
Building wheels for collected packages: ptan
  Building wheel for ptan (setup.py) ... [?25l[?25hdone
  Created wheel for ptan: filename=ptan-0.6-cp37-none-any.whl size=23502 sha256=42491563fe468b6d77c1db44554046edb133f494ca2827e3348e057a1b9cb909
  Stored in directory: /root/.cache/pip/wheels/f0/4b/2f/9a45fd39b0a614a2716bc6128a7f1adb4647f323a2d90783f2
Successfully built ptan
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.3.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have tor

In [20]:
import torch
import torch.nn as nn

# Action selectors

In [None]:
# All the classes assume that NumPy arrays will be passed to them. The complete example from this section can be found in Chapter07/01_actions.py.

In [4]:
import numpy as np

In [5]:
import ptan

In [6]:
# Batch of two rows with three values each; used as input to the selectors below.
q_vals = np.array([
    [1, 2, 3],
    [1, -1, 0],
])

In [7]:
q_vals

array([[ 1,  2,  3],
       [ 1, -1,  0]])

In [8]:
# Selector that returns, for each row, the index of the largest value (see the output below).
selector = ptan.actions.ArgmaxActionSelector()

In [9]:
selector(q_vals)

array([2, 0])

In [10]:
# As you can see, the selector returns indices of actions with the largest values.

In [11]:
# Rebind `selector` to an epsilon-greedy selector; with epsilon=0.0 no random actions are taken.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)

In [12]:
selector(q_vals)

array([2, 0])

In [13]:
# Applying EpsilonGreedyActionSelector gives the same result because epsilon is 0.0, which means no random actions are taken. If we change epsilon to 1.0, the actions will be random:

In [14]:
# With epsilon=1.0 the actions are random, and no seed is set in this notebook,
# so the result shown below can differ between runs.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)

In [15]:
selector(q_vals)

array([1, 0])

In [16]:
# Working with ProbabilityActionSelector is the same, but the input needs to be a normalized probability distribution (each row must sum to 1).

In [17]:
# Rebind `selector` to a selector that picks action indices according to
# per-row probabilities (each row is expected to be a normalized distribution).
selector = ptan.actions.ProbabilityActionSelector()

In [18]:
# Per-row action probabilities; every row sums to 1.
action_probs = [
    [0.1, 0.8, 0.1],  # mostly action 1
    [0.0, 0.0, 1.0],  # always action 2
    [0.5, 0.5, 0.0],  # action 0 or 1, never 2
]

# Sample ten times; a fresh array is built for every call.
for _ in range(10):
  acts = selector(np.array(action_probs))
  print(acts)

[1 2 1]
[1 2 0]
[2 2 0]
[1 2 1]
[1 2 1]
[1 2 0]
[1 2 1]
[1 2 0]
[1 2 1]
[1 2 1]


# The agent

## DQNAgent

In [21]:
class DQNNet(nn.Module):
  """Stand-in for a DQN model that ignores the content of its input.

  For a batch of ``n`` rows, ``forward`` returns an ``(n, actions)`` tensor
  with ones on the main diagonal, so row ``i`` has its single 1 in column
  ``i`` (or no 1 at all when ``i >= actions``).

  Args:
    actions: number of columns in the output.
  """

  def __init__(self, actions: int):
    super().__init__()
    self.actions = actions

  def forward(self, x):
    """Return an ``(x.size(0), actions)`` tensor with ones on the main diagonal.

    Only the first dimension and the device of ``x`` are used; its values
    are ignored.
    """
    # Build the result on the same device as the input so the output does not
    # silently land on the default device when the input lives elsewhere.
    return torch.eye(x.size(0), self.actions, device=x.device)

In [None]:
# Once we have defined the above class, we can use it as a DQN model:

In [22]:
# Model with three actions, i.e. three columns in every output row.
net = DQNNet(actions=3)

In [23]:
# DQNNet ignores the input values; only the batch size (2 here) shapes the output.
net(torch.zeros(2, 10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [None]:
# We start with the simple argmax policy, so the agent will always return actions corresponding to 1s in the network output.

In [24]:
# Rebind `selector` to the argmax selector; it was last bound to the
# probability-based selector in the previous section.
selector = ptan.actions.ArgmaxActionSelector()

In [25]:
# Combine the model and the action selector into an agent; calling it (below)
# returns the actions selected from the model output.
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)

In [26]:
# Calling the agent returns a tuple: the chosen action for each input row and
# a second per-row list, which is all None here (see the output below).
agent(torch.zeros(2, 5))

(array([0, 1]), [None, None])