In [13]:
import pfrl
import torch

from torch import nn
import torch.nn
import gym
import numpy
from pfrl.policies import SoftmaxCategoricalHead

In [14]:
env = gym.make('LunarLander-v2')
print("observation space:",env.observation_space)
print('action space:', env.action_space)

obs = env.reset()
print("initial observation:", obs)
print()
action = env.action_space.sample()
obs, r, done, info = env.step(action)

print("next observation",obs)
print("reward:",r)
print("done:",done)
print("info:",info)

observation space: Box(-inf, inf, (8,), float32)
action space: Discrete(4)
initial observation: [ 0.00756063  1.411787    0.76580507  0.03850863 -0.00875421 -0.1734664
  0.          0.        ]

next observation [ 0.01502562  1.4133496   0.75566757  0.06936412 -0.01782299 -0.1813921
  0.          0.        ]
reward: -0.57627459966061
done: False
info: {}


In [15]:
class QFunction(torch.nn.Module):
    def __init__(self, obs_size, n_actions):
        super().__init__()
        self.l1 = torch.nn.Linear(obs_size,  50)
        self.l2 = torch.nn.Linear(50,50)
        self.l3 = torch.nn.Linear(50,n_actions)
    
    def forward(self,x):
        h = x
        h = torch.nn.functional.relu(self.l1(h))
        h = torch.nn.functional.relu(self.l2(h))
        h = self.l3(h)
        return pfrl.action_value.DiscreteActionValue(h)
    
obs_size = env.observation_space.low.size
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)

In [16]:
def lecun_init(layer, gain=1):
    if isinstance(layer, (nn.Conv2d, nn.Linear)):
        pfrl.initializers.init_lecun_normal(layer.weight, gain)
        nn.init.zeros_(layer.bias)
    else:
        pfrl.initializers.init_lecun_normal(layer.weight_ih_l0, gain)
        pfrl.initializers.init_lecun_normal(layer.weight_hh_l0, gain)
        nn.init.zeros_(layer.bias_ih_l0)
        nn.init.zeros_(layer.bias_hh_l0)
    return layer


model = torch.nn.Sequential(
    lecun_init(torch.nn.Linear(obs_size,32)),
    torch.nn.ReLU(),
    lecun_init(torch.nn.Linear(32, 64)),
    torch.nn.ReLU(),
    lecun_init(torch.nn.Linear(64, 128)),
    torch.nn.ReLU(),
    pfrl.nn.Branched(
        torch.nn.Sequential(
            torch.nn.Linear(128, n_actions),
            SoftmaxCategoricalHead(),
        ),
        torch.nn.Linear(128,1)
    ),
)

In [17]:
optimizer = torch.optim.Adam(model.parameters(), eps=1e-5)

In [18]:
gamma = 0.9

explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.1, random_action_func=env.action_space.sample)


replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**6)

phi = lambda x : x.astype(numpy.float32, copy=False)

gpu = -1



'\nagent = pfrl.agents.A2C(\n    \n    model = q_func,\n    optimizer = optimizer,\n    gamma = gamma,\n    num_processes = 2,\n)\n\nagent = pfrl.agents.REINFORCE(\n    q_func,\n    optimizer,\n)\n\nagent = pfrl.agents.DoubleDQN(\n    q_func,\n    optimizer,\n    replay_buffer,\n    gamma,\n    explorer,\n    replay_start_size=500,\n    update_interval=1,\n    target_update_interval=100,\n    phi=phi,\n    gpu=gpu,\n)\n'

In [19]:
# Now create an agent that will interact with the environment.

agent = pfrl.agents.PPO(
    model,
    optimizer,
    phi=phi,
    gpu=gpu,
)

In [20]:
n_episodes = 500
max_episode_len = 200

for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0
    t = 0
    while True:
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
            
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.') 
        

episode: 10 R: -332.7852277708553
episode: 20 R: -177.37747685149145
episode: 30 R: -298.58002250933964
episode: 40 R: -145.27106214679347
episode: 50 R: -237.79607983681223
statistics: [('average_value', -59.21073), ('average_entropy', 1.2624002), ('average_value_loss', 301.5330840301514), ('average_policy_loss', 0.019767593815922736), ('n_updates', 640), ('explained_variance', 0.6273739224000227)]
episode: 60 R: -163.16121789487067
episode: 70 R: -80.80046530120964
episode: 80 R: -141.62816323449405
episode: 90 R: -63.42941612096345
episode: 100 R: -59.470657499999604
statistics: [('average_value', -57.422634), ('average_entropy', 1.2427988), ('average_value_loss', 349.0746898651123), ('average_policy_loss', 0.003944343943148851), ('n_updates', 1280), ('explained_variance', 0.2431647020932174)]
episode: 110 R: -79.12802471706831
episode: 120 R: -136.50136112904698
episode: 130 R: -145.7688019874663
episode: 140 R: -89.34818581926976
episode: 150 R: -237.95511503701795
statistics: [('

In [21]:
with agent.eval_mode():
    for i in range(10):
        obs = env.reset()
        R = 0
        t = 0
        while True:
            # Uncomment to watch the behavior in a GUI window
            env.render()
            action = agent.act(obs)
            obs, r, done, _ = env.step(action)
            R += r
            t += 1
            reset = t == 200
            agent.observe(obs, r, done, reset)
            if done or reset:
                break
        print('evaluation episode:', i, 'R:', R)
        

evaluation episode: 0 R: 103.56545010431041
evaluation episode: 1 R: 94.45278799210082
evaluation episode: 2 R: 94.29366186646524
evaluation episode: 3 R: 85.42420986292896
evaluation episode: 4 R: 104.0510659315425
evaluation episode: 5 R: 115.80786061155713
evaluation episode: 6 R: 103.51273417589255
evaluation episode: 7 R: 116.18353404377916
evaluation episode: 8 R: 103.52254523323795
evaluation episode: 9 R: 132.2453684225183


In [10]:
# Save an agent to the 'agent' directory
agent.save('agent')

# Uncomment to load an agent from the 'agent' directory
# agent.load('agent')

In [11]:

# Set up the logger to print info messages for understandability.
import logging
import sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=2000,           # Train the agent for 2000 steps
    eval_n_steps=None,       # We evaluate for episodes, not time
    eval_n_episodes=10,       # 10 episodes are sampled for each evaluation
    train_max_episode_len=200,  # Maximum length of each episode
    eval_interval=1000,   # Evaluate the agent after every 1000 steps
    outdir='result',      # Save everything to 'result' directory
)


outdir:result step:90 episode:0 R:-97.73226806886188
statistics:[('average_value', -74.75675), ('average_entropy', 0.7915316), ('average_value_loss', 57.11389946937561), ('average_policy_loss', 0.00021705696359276773), ('n_updates', 3200), ('explained_variance', 0.6381624224785301)]
outdir:result step:163 episode:1 R:-156.16694701401906
statistics:[('average_value', -75.34754), ('average_entropy', 0.78401834), ('average_value_loss', 57.11389946937561), ('average_policy_loss', 0.00021705696359276773), ('n_updates', 3200), ('explained_variance', 0.6381624224785301)]
outdir:result step:222 episode:2 R:-110.49030834650256
statistics:[('average_value', -74.96526), ('average_entropy', 0.7631847), ('average_value_loss', 21.65699993133545), ('average_policy_loss', 0.004735526461154223), ('n_updates', 3520), ('explained_variance', 0.7693223469820605)]
outdir:result step:300 episode:3 R:-111.44865339331764
statistics:[('average_value', -75.016045), ('average_entropy', 0.75832874), ('average_valu

evaluation episode 0 length:76 R:-129.30842691962692
evaluation episode 1 length:62 R:-109.25730375619855
evaluation episode 2 length:63 R:-112.7268614174246
evaluation episode 3 length:60 R:-120.11581729958104
evaluation episode 4 length:58 R:-131.22260848475727
evaluation episode 5 length:59 R:-134.34470931013547
evaluation episode 6 length:76 R:-131.26019100748397
evaluation episode 7 length:54 R:-121.33257227612492
evaluation episode 8 length:59 R:-101.75015808368397
evaluation episode 9 length:73 R:-167.2087619638767
Saved the agent to result/2000_finish


(<pfrl.agents.ppo.PPO at 0x7fbe738196d0>,
 [{'average_value': -73.20655,
   'average_entropy': 0.5989976,
   'average_value_loss': 21.65699993133545,
   'average_policy_loss': 0.004735526461154223,
   'n_updates': 3520,
   'explained_variance': 0.7693223469820605,
   'eval_score': -125.71445087509785},
  {'average_value': -75.712906,
   'average_entropy': 0.5988562,
   'average_value_loss': 21.65699993133545,
   'average_policy_loss': 0.004735526461154223,
   'n_updates': 3520,
   'explained_variance': 0.7693223469820605,
   'eval_score': -125.85274105188934}])