# PyTorch for reinforcement learning

Here is a quick introduction to reinforcement learning with PyTorch.

In [1]:
import pfrl
import torch
import torch.nn
import gym
import numpy

PFRL can be used for any problem that is modeled as an "environment". OpenAI Gym provides various kinds of benchmark environments and defines a common interface among them.

In [2]:
# Build the CartPole benchmark environment and show the spaces it exposes.
env = gym.make('CartPole-v0')
for label, space in (('observation space:', env.observation_space),
                     ('action space:', env.action_space)):
    print(label, space)

# Start an episode; reset() returns the first observation.
obs = env.reset()
print('initial observation:', obs)

# Take one randomly sampled action and report everything step() returns.
action = env.action_space.sample()
obs, r, done, info = env.step(action)
for label, value in (('next observation:', obs),
                     ('reward:', r),
                     ('done:', done),
                     ('info:', info)):
    print(label, value)

# Uncomment to open a GUI window rendering the current state of the environment
# env.render()

observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action space: Discrete(2)
initial observation: [-0.00416541 -0.01051919 -0.02126048  0.02009261]
next observation: [-0.0043758   0.1849011  -0.02085862 -0.27922168]
reward: 1.0
done: False
info: {}


PFRL provides various agents, each of which implements a deep reinforcement learning algorithm.

Let's try the DoubleDQN algorithm, which is implemented by `pfrl.agents.DoubleDQN`. This algorithm trains a Q-function that receives an observation and returns an expected future return for each action that the agent can take. You can define your Q-function as a `torch.nn.Module` as below.

In [3]:
class QFunction(torch.nn.Module):
    """Fully connected Q-function mapping an observation to per-action values.

    Two hidden layers of 50 units, each followed by ReLU, feed a linear
    output layer with ``n_actions`` units.

    Args:
        obs_size: Number of input features of the first linear layer.
        n_actions: Number of output units of the last linear layer.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        hidden_size = 50
        # Layers are created in l1, l2, l3 order so parameter names and
        # initialisation order stay stable.
        self.l1 = torch.nn.Linear(obs_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, hidden_size)
        self.l3 = torch.nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Run ``x`` through the network and wrap the result.

        Returns:
            A ``pfrl.action_value.DiscreteActionValue`` built from the output
            of the last linear layer.
        """
        relu = torch.nn.functional.relu
        hidden = relu(self.l2(relu(self.l1(x))))
        return pfrl.action_value.DiscreteActionValue(self.l3(hidden))
    
# Size the network from the environment: the input width is the number of
# entries in the observation-space lower bound, the output width is the
# number of discrete actions.
obs_size = env.observation_space.low.size
n_actions = env.action_space.n
q_function = QFunction(obs_size=obs_size, n_actions=n_actions)

`pfrl.q_functions.DiscreteActionValueHead` is just a `torch.nn.Module` that packs its input into a `pfrl.action_value.DiscreteActionValue`.

As usual in PyTorch, a `torch.optim.Optimizer` is used to optimize the model.


In [5]:
# Adam optimizes the parameters of q_function. eps=1e-2 is for stability.
optimizer = torch.optim.Adam(params=q_function.parameters(), eps=1e-2)

# Create Agent

In [9]:
# Discount factor handed to the agent.
gamma = 0.9

# Use epsilon-greedy for exploration; random actions come from the
# environment's own action-space sampler.
explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3,
    random_action_func=env.action_space.sample,
)

# DQN uses experience replay.
replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**6)


def phi(x):
    """Feature extractor: convert an observation to numpy.float32.

    Observations from CartPole-v0 are numpy.float64, whereas PyTorch only
    accepts numpy.float32 by default. ``copy=False`` avoids a copy when the
    input already has the requested dtype.
    """
    return x.astype(numpy.float32, copy=False)


# Device id of the GPU to use; -1 means CPU only.
gpu = -1

agent = pfrl.agents.DoubleDQN(
    q_function,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
    gpu=gpu,
)

Now that you have an agent and an environment, it's time to start reinforcement learning!

During training, two methods of `agent` must be called: `agent.act` and `agent.observe` 

* `agent.act(obs)` takes the current observation as input and returns an exploratory action. Once the returned action is processed in env,

* `agent.observe(obs, reward, done, reset)` then observes the consequences.

- `obs` : next observation
- `reward` : an immediate reward
- `done` : a boolean value set to True if reached a terminal state
- `reset` : a boolean value set to True if an episode is interrupted at a non-terminal state, typically by a time limit


In [15]:
n_episodes = 300
max_episode_len = 200

# Total reward of every training episode, in episode order.
history = []
for episode in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0  # sum of rewards collected in the current episode
    t = 0  # number of steps taken in the current episode
    done = reset = False

    while not (done or reset):
        # Uncomment to watch the behavior in GUI window
        # env.render()
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1

        # Reaching the step cap interrupts the episode (reset) rather than
        # marking a terminal state (done).
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)

    history.append(R)

    if episode % 10 == 0:
        print('episode : ', episode, 'R: ', R)

    if episode % 50 == 0:
        # NOTE(review): the label says 'episode' but the value printed is the
        # agent's statistics; kept as-is to match the recorded output.
        print('episode : ', agent.get_statistics())

episode :  10 R:  22.0
episode :  20 R:  54.0
episode :  30 R:  25.0
episode :  40 R:  200.0
episode :  50 R:  15.0
episode :  [('average_q', 9.972574), ('average_loss', 0.07234133171266877), ('cumulative_steps', 32522), ('n_updates', 32023), ('rlen', 32522)]
episode :  60 R:  200.0
episode :  70 R:  165.0
episode :  80 R:  200.0
episode :  90 R:  117.0
episode :  100 R:  22.0
episode :  [('average_q', 9.882232), ('average_loss', 0.05194314862310421), ('cumulative_steps', 38233), ('n_updates', 37734), ('rlen', 38233)]
episode :  110 R:  77.0
episode :  120 R:  20.0
episode :  130 R:  147.0
episode :  140 R:  84.0
episode :  150 R:  200.0
episode :  [('average_q', 9.911912), ('average_loss', 0.03334822844073642), ('cumulative_steps', 43553), ('n_updates', 43054), ('rlen', 43553)]
episode :  160 R:  73.0
episode :  170 R:  106.0
episode :  180 R:  32.0
episode :  190 R:  128.0
episode :  200 R:  94.0
episode :  [('average_q', 9.770783), ('average_loss', 0.04287147892871872), ('cumulative

Now you have finished training the Double DQN agent for 300 episodes. How good is the agent now? You can evaluate it using `with agent.eval_mode()`. Exploration such as epsilon-greedy is not used anymore.

In [12]:
# Evaluate the trained agent for 10 episodes. Inside eval_mode() the agent
# acts without exploration.
with agent.eval_mode():
    for i in range(10):
        obs = env.reset()
        R = 0  # sum of rewards collected in this evaluation episode
        t = 0  # number of steps taken in this evaluation episode
        done = reset = False

        while not (done or reset):
            action = agent.act(obs)
            obs, r, done, _ = env.step(action)
            R += r
            t += 1

            # Use the same step cap as the training loop instead of a second
            # hard-coded 200, so the two cannot drift apart.
            reset = t == max_episode_len
            agent.observe(obs, r, done, reset)

        print('evaluation episode: ', i, 'R: ', R)

evaluation episode:  0 R:  152.0
evaluation episode:  1 R:  141.0
evaluation episode:  2 R:  145.0
evaluation episode:  3 R:  146.0
evaluation episode:  4 R:  139.0
evaluation episode:  5 R:  127.0
evaluation episode:  6 R:  145.0
evaluation episode:  7 R:  153.0
evaluation episode:  8 R:  172.0
evaluation episode:  9 R:  147.0


# Finishing up

In [13]:
# Persist the trained agent under the name 'agent'
# (presumably a directory holding the saved model/optimizer state — confirm against the PFRL docs).
agent.save('agent')

# Shortcut

In [14]:
# Send INFO-level log records to stdout with an empty format string, so the
# progress messages logged during training are visible in the notebook.
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='')

# Training and periodic evaluation in a single call.
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    outdir='result',            # Save everything to 'result' directory
    steps=2000,                 # Train the agent for 2000 steps
    train_max_episode_len=200,  # Maximum length of each episode
    eval_interval=1000,         # Evaluate the agent after every 1000 steps
    eval_n_episodes=10,         # 10 episodes are sampled for each evaluation
    eval_n_steps=None,          # We evaluate for episodes, not time
)

outdir:result step:133 episode:0 R:133.0
statistics:[('average_q', 9.835213), ('average_loss', 0.07010247893922497), ('cumulative_steps', 25557), ('n_updates', 25058), ('rlen', 25557)]
outdir:result step:333 episode:1 R:200.0
statistics:[('average_q', 9.901389), ('average_loss', 0.07568743751209694), ('cumulative_steps', 25757), ('n_updates', 25258), ('rlen', 25757)]
outdir:result step:533 episode:2 R:200.0
statistics:[('average_q', 9.913004), ('average_loss', 0.06816803898662328), ('cumulative_steps', 25957), ('n_updates', 25458), ('rlen', 25957)]
outdir:result step:660 episode:3 R:127.0
statistics:[('average_q', 9.853989), ('average_loss', 0.05339874850236811), ('cumulative_steps', 26084), ('n_updates', 25585), ('rlen', 26084)]
outdir:result step:689 episode:4 R:29.0
statistics:[('average_q', 9.882291), ('average_loss', 0.06482254713831935), ('cumulative_steps', 26113), ('n_updates', 25614), ('rlen', 26113)]
outdir:result step:869 episode:5 R:180.0
statistics:[('average_q', 9.903717)

(<pfrl.agents.double_dqn.DoubleDQN at 0x1551b6a00>,
 [{'average_q': 9.848642,
   'average_loss': 0.054336254785303024,
   'cumulative_steps': 26487,
   'n_updates': 25988,
   'rlen': 26487,
   'eval_score': 114.8},
  {'average_q': 9.926972,
   'average_loss': 0.04484743709035684,
   'cumulative_steps': 27424,
   'n_updates': 26925,
   'rlen': 27424,
   'eval_score': 164.8}])

# Rainbow DQN

In [17]:
from pfrl.q_functions import DistributionalDuelingDQN


In [19]:
# Parameters of the distributional Q-function
# (presumably n_atoms support points spanning [v_min, v_max] — confirm against the PFRL docs).
n_atoms = 51
v_max = 10
v_min = -10

# Hidden-layer sizing. The original cell read these from an undefined `args`
# object; explicit values are used instead, mirroring the QFunction defined
# earlier in this notebook (two hidden layers of 50 units).
n_hidden_channels = 50
n_hidden_layers = 2

# `q_functions` alone is not a name in this notebook; go through the imported
# `pfrl` package instead.
q_func = pfrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
    obs_size,
    n_actions,
    n_atoms,
    v_min,
    v_max,
    n_hidden_channels=n_hidden_channels,
    n_hidden_layers=n_hidden_layers,
)
# NOTE(review): the saved output below this cell shows a DistributionalDuelingDQN,
# i.e. it was produced by different code; re-run the cell to refresh it.
print(q_func)

DistributionalDuelingDQN(
  (conv_layers): ModuleList(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  )
  (main_stream): Linear(in_features=3136, out_features=1024, bias=True)
  (a_stream): Linear(in_features=512, out_features=102, bias=True)
  (v_stream): Linear(in_features=512, out_features=51, bias=True)
)
