# Collaboration and Competition: Multi-Agent PPO on Tennis

---

### 1. Start the Environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Unity environment from the Tennis build; the path is relative to
# the notebook's working directory.
env = UnityEnvironment(file_name='apps/Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [2]:
# get the default brain: the first name in env.brain_names and its brain object
# (the startup log above lists a single external brain, 'TennisBrain')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [3]:
# reset the environment and keep the info object belonging to our brain
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space: vector_observations holds one row per agent,
# so shape[0] is the agent count and shape[1] the per-agent state length
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import copy
from collections import deque, namedtuple

In [5]:
# Helper Functions
def random_sample(indices, batch_size):
    """Yield shuffled mini-batches of ``indices``.

    The indices are permuted once with ``np.random.permutation`` and then
    handed out in consecutive chunks of ``batch_size``.  If the number of
    indices is not a multiple of ``batch_size`` the leftover tail is yielded
    as a final, shorter batch.

    Args:
        indices: 1-D array-like of indices to shuffle.
        batch_size: number of indices per full batch.

    Yields:
        1-D numpy arrays of indices.
    """
    shuffled = np.asarray(np.random.permutation(indices))
    total = len(shuffled)
    # Number of elements covered by complete batches.
    cutoff = (total // batch_size) * batch_size
    for start in range(0, cutoff, batch_size):
        yield shuffled[start:start + batch_size]
    # Whatever did not fit into a complete batch comes last.
    if cutoff < total:
        yield shuffled[cutoff:]

def stack_tensor(some_list):
    """Concatenate a list of tensors along dimension 0.

    Only the first 1000 entries of ``some_list`` are used; anything beyond
    that is silently ignored.

    Args:
        some_list: list of tensors whose shapes agree on every dimension
            except the first.

    Returns:
        A single tensor made by joining the (at most 1000) inputs.
    """
    head = some_list[:1000]
    return torch.cat(head, 0)

In [6]:
# Models
def hidden_init(layer):
    """Return the symmetric uniform range used to initialise a hidden layer.

    The bound is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of
    inputs feeding each unit of ``layer``.

    Args:
        layer: a module with a 2-D ``weight`` laid out as
            ``(out_features, in_features)``, e.g. ``nn.Linear``.

    Returns:
        Tuple ``(-lim, lim)`` suitable for ``tensor.uniform_(*hidden_init(layer))``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the
    # fan-in is dimension 1.  Dimension 0 would be the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Critic(nn.Module):
    """State-value network over the flattened observations of all agents.

    Args:
        seed: value passed to ``torch.manual_seed`` before the layers are built.
        input_size: width of the flattened input.  The default 48 matches this
            notebook's two agents with 24-dimensional observations each.
        fc1_units: width of the first hidden layer.
        fc2_units: width of the second hidden layer.
    """

    def __init__(self, seed, input_size=48, fc1_units=256, fc2_units=128):
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(input_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights (biases keep their PyTorch defaults)."""
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        # Small final-layer weights keep the initial value estimates near zero.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states):
        """Return one value per joint observation, shape ``(batch, 1)``.

        ``states`` may have any shape whose element count is a multiple of the
        input width; it is flattened to ``(-1, input_size)`` first.
        """
        states = states.reshape(-1, self.fc1.in_features)
        # torch.tanh replaces the deprecated F.tanh; the result is identical.
        x = torch.tanh(self.fc1(states))
        x = torch.tanh(self.fc2(x))
        x = self.fc3(x)
        return x
    
class Actor(nn.Module):
    """Policy network mapping one agent's observation to an action mean.

    Args:
        seed: value passed to ``torch.manual_seed`` before the layers are built.
        state_size: width of a single agent's observation (default 24).
        action_size: number of action components produced (default 2).
        fc1_units: width of the first hidden layer.
        fc2_units: width of the second hidden layer.
    """

    def __init__(self, seed, state_size=24, action_size=2, fc1_units=256, fc2_units=128):
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights (biases keep their PyTorch defaults)."""
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        # Small final-layer weights keep the initial action means near zero.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states):
        """Return action means squashed into ``[-1, 1]`` by a final tanh."""
        # torch.tanh replaces the deprecated F.tanh; the result is identical.
        x = torch.tanh(self.fc1(states))
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x
    
class ActorAndCritic(nn.Module):
    """Gaussian policy (Actor) and value function (Critic) bundled together.

    The policy is ``Normal(actor(states_part), softplus(sd))`` where ``sd`` is
    a learnable, state-independent parameter with one entry per action
    component.

    Args:
        seed: value passed to ``torch.manual_seed``.
            NOTE(review): the Actor and Critic are built with a fixed seed of
            1, each re-seeding the global generator, so ``seed`` does not
            influence their initial weights - confirm this is intended.
    """

    def __init__(self, seed):
        super(ActorAndCritic, self).__init__()
        self.seed = torch.manual_seed(seed)
        # Raw (pre-softplus) standard deviation of the two action components.
        self.sd = nn.Parameter(torch.zeros(2))
        self.actor = Actor(1)
        self.critic = Critic(1)

    def forward(self, states, states_part):
        """Sample actions and evaluate them.

        Args:
            states: observations of all agents, fed to the critic.
            states_part: this agent's own observation(s), fed to the actor;
                either a single observation (1-D) or a batch (2-D).

        Returns:
            Tuple ``(actions, values, log_probs, dist)``:
            sampled actions clamped to ``[-1, 1]`` (leading dim squeezed when
            it is 1), critic values (same squeeze), the log-density of the
            clamped actions averaged over the action components, and the
            ``Normal`` distribution itself.
        """
        mean = self.actor(states_part)
        values = self.critic(states)
        dist = torch.distributions.Normal(mean, F.softplus(self.sd))
        actions = torch.clamp(dist.sample(), -1, 1)
        # Give an unbatched action a leading batch axis so that mean(1) below
        # always reduces over the action components.  Testing the rank (not
        # len(actions) == 2) keeps a genuine batch of two samples intact.
        if actions.dim() == 1:
            actions = actions.unsqueeze(0)
        return actions.squeeze(0), values.squeeze(0), dist.log_prob(actions).mean(1), dist

In [7]:
class Agent(nn.Module):
    """A single PPO learner: an ActorAndCritic network plus its Adam optimiser."""

    def __init__(self):
        super(Agent, self).__init__()
        self.network = ActorAndCritic(5)
        self.optimizer = optim.Adam(self.network.parameters(), lr = 1e-4)

    def act(self, states, states_part):
        """Run the network without tracking gradients.

        Args:
            states: observations of all agents (critic input).
            states_part: this agent's own observation (actor input).

        Returns:
            ``(actions, values, log_probs, dist)`` as produced by the network.
        """
        with torch.no_grad():
            actions, values, log_probs, dist = self.network(states, states_part)
        return actions, values, log_probs, dist

    def step(self, old_log_probs, states, states_part, actions, rewards, dones, values, advantages, returns):
        """Run 5 epochs of clipped-surrogate PPO updates over one rollout.

        Every tensor argument is indexed along its first dimension, one row
        per collected time step.  ``rewards``, ``dones`` and ``values`` are
        accepted for interface symmetry but are not used here; the update only
        needs ``advantages`` and ``returns``.

        Args:
            old_log_probs: log-probabilities recorded when the actions were taken.
            states: observations of all agents (critic input).
            states_part: this agent's own observations (actor input).
            actions: actions taken by this agent.
            rewards: unused.
            dones: unused.
            values: unused.
            advantages: advantage estimates for this agent.
            returns: discounted returns, the critic's regression target.
        """
        # Use the actual rollout length instead of assuming a fixed size.
        n_samples = len(states)
        for _ in range(5):
            for sample in random_sample(np.arange(n_samples), 64):
                states_1 = states[sample]
                states_part_1 = states_part[sample]
                actions_1 = actions[sample]
                old_log_probs_1 = old_log_probs[sample]
                advantages_1 = advantages[sample]
                returns_1 = returns[sample]

                # Only the fresh values and distribution are needed; the newly
                # sampled actions are discarded in favour of the stored ones.
                _, values_1, _, dist = self.network(states_1, states_part_1)
                new_log_probs_1 = dist.log_prob(actions_1).mean(1).unsqueeze(1)

                new_probs, old_probs = torch.exp(new_log_probs_1), torch.exp(old_log_probs_1)
                ratios = new_probs / old_probs
                ratios_clipped = torch.clamp(ratios, 0.9, 1.1)
                # NOTE(review): this regulariser applies a Bernoulli-style
                # cross-entropy to exponentiated log-densities, which are not
                # probabilities and may exceed 1 (the NaNs that produces are
                # zeroed on the next line).  It is also added to a loss that
                # is minimised.  dist.entropy() may be what was intended; left
                # unchanged to preserve training behaviour.
                entropy = -(new_probs * torch.log(old_probs + 1e-10) + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1e-10))
                entropy[torch.isnan(entropy)] = 0

                # Negated clipped surrogate objective, so lower is better.
                rewards_clipped = -torch.min(ratios * advantages_1, ratios_clipped * advantages_1)
                rewards_clipped = torch.mean(rewards_clipped + 0.01 * entropy)
                # Critic regression loss against the discounted returns.
                loss = torch.mean((returns_1 - values_1)**2)
                total_loss = rewards_clipped + 2 * loss

                self.optimizer.zero_grad()
                total_loss.backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), 10)
                self.optimizer.step()

In [8]:
class MAPPO(nn.Module):
    """Two independent PPO agents that each see the joint observation in their critic."""

    def __init__(self):
        super(MAPPO, self).__init__()
        # ModuleList (rather than a plain list) registers the agents as
        # submodules, so parameters(), state_dict() and .to() reach them.
        self.mappo = nn.ModuleList([Agent(), Agent()])

    def act(self, states):
        """Choose an action for each agent.

        Args:
            states: tensor with one observation row per agent.

        Returns:
            Tuple of stacked ``(actions, values, log_probs)``, with agent 0
            first and agent 1 second along the leading dimension.
        """
        actions0, values0, log_probs0, _ = self.mappo[0].act(states, states[0])
        actions1, values1, log_probs1, _ = self.mappo[1].act(states, states[1])
        return torch.stack((actions0, actions1)), torch.stack((values0, values1)), torch.stack((log_probs0, log_probs1))

    def step(self, old_log_probs, states, actions, rewards, dones, values):
        """Compute advantages and returns for one rollout and update every agent.

        Each argument is a list with one entry per collected time step.
        ``old_log_probs``, ``states``, ``actions`` and ``values`` hold tensors
        as returned by ``act``; ``rewards`` and ``dones`` hold per-agent
        sequences taken from the environment.

        Advantages use a discount of 0.99 and a smoothing factor of 0.95.  The
        final step of the rollout is not bootstrapped, because no value for
        the following state is available.
        """
        gamma, lam = 0.99, 0.95
        n_steps = len(rewards)
        n_agents = len(self.mappo)

        rewards = torch.tensor(rewards, dtype = torch.float).unsqueeze(2)
        # Float mask avoids int/float mixing in the arithmetic below.
        dones = torch.tensor(dones, dtype = torch.float).unsqueeze(2)
        not_done = 1.0 - dones

        advantages = [0.0] * n_steps
        returns = [0.0] * n_steps
        advantage = torch.zeros((n_agents, 1))
        return_ = torch.zeros((n_agents, 1))
        # Walk the rollout backwards, accumulating the discounted quantities.
        for i in reversed(range(n_steps)):
            if i == n_steps - 1:
                td = (rewards[i] - values[i]).squeeze(0)
            else:
                td = (rewards[i] + (gamma * not_done[i] * values[i + 1]) - values[i]).squeeze(0)
            # not_done stops both sums from leaking across episode boundaries.
            advantage = advantage * lam * gamma * not_done[i] + td
            return_ = rewards[i] + gamma * not_done[i] * return_
            advantages[i] = advantage
            returns[i] = return_

        old_log_probs, states, actions, values, advantages, returns = \
        torch.stack(old_log_probs), torch.stack(states), torch.stack(actions), \
        torch.stack(values), torch.stack(advantages), torch.stack(returns)
        # The epsilon guards against a zero standard deviation, which would
        # otherwise yield NaNs that clamp() cannot remove.
        advantages = torch.clamp((advantages - advantages.mean()) / (advantages.std() + 1e-8), -1e5, 1e5)

        # Each agent gets its own slice of the per-agent tensors, plus the
        # full joint states for its critic.
        for i, mappo in enumerate(self.mappo):
            mappo.step(old_log_probs[:,i,:], states, states[:,i,:], actions[:,i,:], rewards[:,i,:], dones[:,i,:], values[:,i,:], advantages[:,i,:], returns[:,i,:])
            
# Module-level learner instance used by the training cell below.
mappo = MAPPO()

In [None]:
# Training configuration.
N_EPISODES = 15000       # number of episodes to play
ROLLOUT_LENGTH = 500     # transitions gathered before each mappo.step() call
SCORE_WINDOW = 100       # episodes in the running average that is printed

scores = []                                   # per-episode score arrays (one entry per agent)
scores_window = deque(maxlen=SCORE_WINDOW)    # most recent episode scores
log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all = [], [], [], [], [], []
# Position inside the current rollout; it persists across episodes, so one
# rollout can span several episodes.
sample_num = 0
for i in range(N_EPISODES):
    env_info = env.reset(train_mode=True)[brain_name]
    states = torch.tensor(env_info.vector_observations, dtype = torch.float)
    score = np.zeros(num_agents)
    while True:
        actions, values, log_probs = mappo.act(states)
        env_info = env.step(actions.detach().numpy())[brain_name]
        next_states = torch.tensor(env_info.vector_observations, dtype = torch.float)
        dones = env_info.local_done

        # Store the transition together with the pre-step state.
        log_probs_all.append(log_probs)
        states_all.append(states)
        actions_all.append(actions)
        rewards_all.append(env_info.rewards)
        dones_all.append(dones)
        values_all.append(values)
        score += env_info.rewards
        states = next_states

        # Rollout is full: update both agents, then start a fresh buffer.
        if sample_num == ROLLOUT_LENGTH - 1:
            mappo.step(log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all)
            log_probs_all, states_all, actions_all, rewards_all, dones_all, values_all = [], [], [], [], [], []

        sample_num = (sample_num + 1) % ROLLOUT_LENGTH
        # The episode ends as soon as any agent reports done.
        if np.any(dones):
            break

    scores_window.append(score)
    scores.append(score)
    # Both figures are means over the agents (and, for the average, the window).
    print('\rEpisode {}\tReward: {:.5f}\tAverage Reward: {:.5f}'.format(i, np.mean(score), np.mean(scores_window)))



Episode 0	Reward: -0.00500	Average Reward: -0.00500
Episode 1	Reward: -0.00500	Average Reward: -0.00500
Episode 2	Reward: -0.00500	Average Reward: -0.00500
Episode 3	Reward: -0.00500	Average Reward: -0.00500
Episode 4	Reward: -0.00500	Average Reward: -0.00500
Episode 5	Reward: -0.00500	Average Reward: -0.00500
Episode 6	Reward: -0.00500	Average Reward: -0.00500
Episode 7	Reward: -0.00500	Average Reward: -0.00500
Episode 8	Reward: -0.00500	Average Reward: -0.00500
Episode 9	Reward: -0.00500	Average Reward: -0.00500
Episode 10	Reward: 0.04500	Average Reward: -0.00045
Episode 11	Reward: -0.00500	Average Reward: -0.00083
Episode 12	Reward: -0.00500	Average Reward: -0.00115
Episode 13	Reward: -0.00500	Average Reward: -0.00143
Episode 14	Reward: -0.00500	Average Reward: -0.00167
Episode 15	Reward: -0.00500	Average Reward: -0.00187
Episode 16	Reward: -0.00500	Average Reward: -0.00206
Episode 17	Reward: 0.04500	Average Reward: 0.00056
Episode 18	Reward: -0.00500	Average Reward: 0.00026
Episode

Episode 159	Reward: -0.00500	Average Reward: 0.00550
Episode 160	Reward: -0.00500	Average Reward: 0.00500
Episode 161	Reward: 0.09500	Average Reward: 0.00550
Episode 162	Reward: 0.04500	Average Reward: 0.00600
Episode 163	Reward: -0.00500	Average Reward: 0.00600
Episode 164	Reward: -0.00500	Average Reward: 0.00600
Episode 165	Reward: 0.04500	Average Reward: 0.00650
Episode 166	Reward: -0.00500	Average Reward: 0.00650
Episode 167	Reward: -0.00500	Average Reward: 0.00650
Episode 168	Reward: -0.00500	Average Reward: 0.00600
Episode 169	Reward: -0.00500	Average Reward: 0.00600
Episode 170	Reward: -0.00500	Average Reward: 0.00600
Episode 171	Reward: 0.04500	Average Reward: 0.00650
Episode 172	Reward: 0.04500	Average Reward: 0.00700
Episode 173	Reward: -0.00500	Average Reward: 0.00700
Episode 174	Reward: -0.00500	Average Reward: 0.00650
Episode 175	Reward: 0.09500	Average Reward: 0.00750
Episode 176	Reward: -0.00500	Average Reward: 0.00750
Episode 177	Reward: 0.04500	Average Reward: 0.00800


Episode 316	Reward: -0.00500	Average Reward: 0.00895
Episode 317	Reward: 0.04500	Average Reward: 0.00895
Episode 318	Reward: 0.04500	Average Reward: 0.00945
Episode 319	Reward: -0.00500	Average Reward: 0.00945
Episode 320	Reward: -0.00500	Average Reward: 0.00945
Episode 321	Reward: -0.00500	Average Reward: 0.00945
Episode 322	Reward: -0.00500	Average Reward: 0.00945
Episode 323	Reward: -0.00500	Average Reward: 0.00945
Episode 324	Reward: 0.04500	Average Reward: 0.00995
Episode 325	Reward: 0.04500	Average Reward: 0.01045
Episode 326	Reward: -0.00500	Average Reward: 0.01045
Episode 327	Reward: -0.00500	Average Reward: 0.00945
Episode 328	Reward: 0.04500	Average Reward: 0.00995
Episode 329	Reward: 0.04500	Average Reward: 0.01045
Episode 330	Reward: -0.00500	Average Reward: 0.01045
Episode 331	Reward: -0.00500	Average Reward: 0.01045
Episode 332	Reward: -0.00500	Average Reward: 0.01045
Episode 333	Reward: 0.04500	Average Reward: 0.01095
Episode 334	Reward: -0.00500	Average Reward: 0.01045


Episode 473	Reward: -0.00500	Average Reward: 0.01050
Episode 474	Reward: -0.00500	Average Reward: 0.01050
Episode 475	Reward: -0.00500	Average Reward: 0.01000
Episode 476	Reward: -0.00500	Average Reward: 0.01000
Episode 477	Reward: -0.00500	Average Reward: 0.00950
Episode 478	Reward: 0.09500	Average Reward: 0.01050
Episode 479	Reward: -0.00500	Average Reward: 0.01050
Episode 480	Reward: -0.00500	Average Reward: 0.01050
Episode 481	Reward: -0.00500	Average Reward: 0.01050
Episode 482	Reward: 0.09500	Average Reward: 0.01150
Episode 483	Reward: -0.00500	Average Reward: 0.01150
Episode 484	Reward: -0.00500	Average Reward: 0.01100
Episode 485	Reward: -0.00500	Average Reward: 0.01100
Episode 486	Reward: 0.04500	Average Reward: 0.01150
Episode 487	Reward: -0.00500	Average Reward: 0.01150
Episode 488	Reward: -0.00500	Average Reward: 0.01150
Episode 489	Reward: -0.00500	Average Reward: 0.01100
Episode 490	Reward: -0.00500	Average Reward: 0.01100
Episode 491	Reward: 0.04500	Average Reward: 0.011

Episode 629	Reward: 0.04500	Average Reward: 0.01250
Episode 630	Reward: 0.04500	Average Reward: 0.01300
Episode 631	Reward: -0.00500	Average Reward: 0.01300
Episode 632	Reward: -0.00500	Average Reward: 0.01300
Episode 633	Reward: 0.04500	Average Reward: 0.01250
Episode 634	Reward: -0.00500	Average Reward: 0.01200
Episode 635	Reward: -0.00500	Average Reward: 0.01200
Episode 636	Reward: -0.00500	Average Reward: 0.01200
Episode 637	Reward: -0.00500	Average Reward: 0.01200
Episode 638	Reward: -0.00500	Average Reward: 0.01150
Episode 639	Reward: -0.00500	Average Reward: 0.01100
Episode 640	Reward: -0.00500	Average Reward: 0.01100
Episode 641	Reward: 0.04500	Average Reward: 0.01050
Episode 642	Reward: 0.04500	Average Reward: 0.01050
Episode 643	Reward: -0.00500	Average Reward: 0.01000
Episode 644	Reward: -0.00500	Average Reward: 0.01000
Episode 645	Reward: -0.00500	Average Reward: 0.01000
Episode 646	Reward: -0.00500	Average Reward: 0.01000
Episode 647	Reward: -0.00500	Average Reward: 0.0100

Episode 786	Reward: 0.04500	Average Reward: 0.01400
Episode 787	Reward: -0.00500	Average Reward: 0.01400
Episode 788	Reward: -0.00500	Average Reward: 0.01400
Episode 789	Reward: 0.04500	Average Reward: 0.01450
Episode 790	Reward: 0.04500	Average Reward: 0.01450
Episode 791	Reward: 0.04500	Average Reward: 0.01500
Episode 792	Reward: 0.04500	Average Reward: 0.01550
Episode 793	Reward: -0.00500	Average Reward: 0.01550
Episode 794	Reward: -0.00500	Average Reward: 0.01500
Episode 795	Reward: -0.00500	Average Reward: 0.01500
Episode 796	Reward: -0.00500	Average Reward: 0.01500
Episode 797	Reward: -0.00500	Average Reward: 0.01500
Episode 798	Reward: 0.04500	Average Reward: 0.01550
Episode 799	Reward: -0.00500	Average Reward: 0.01550
Episode 800	Reward: 0.24500	Average Reward: 0.01800
Episode 801	Reward: -0.00500	Average Reward: 0.01750
Episode 802	Reward: -0.00500	Average Reward: 0.01650
Episode 803	Reward: -0.00500	Average Reward: 0.01650
Episode 804	Reward: -0.00500	Average Reward: 0.01650


Episode 943	Reward: -0.00500	Average Reward: 0.02100
Episode 944	Reward: 0.04500	Average Reward: 0.02150
Episode 945	Reward: 0.09500	Average Reward: 0.02250
Episode 946	Reward: -0.00500	Average Reward: 0.02200
Episode 947	Reward: 0.04500	Average Reward: 0.02150
Episode 948	Reward: -0.00500	Average Reward: 0.02150
Episode 949	Reward: -0.00500	Average Reward: 0.02150
Episode 950	Reward: -0.00500	Average Reward: 0.02150
Episode 951	Reward: -0.00500	Average Reward: 0.02150
Episode 952	Reward: -0.00500	Average Reward: 0.02100
Episode 953	Reward: -0.00500	Average Reward: 0.02100
Episode 954	Reward: 0.04500	Average Reward: 0.02150
Episode 955	Reward: 0.04500	Average Reward: 0.02200
Episode 956	Reward: 0.04500	Average Reward: 0.02250
Episode 957	Reward: -0.00500	Average Reward: 0.02200
Episode 958	Reward: -0.00500	Average Reward: 0.02200
Episode 959	Reward: 0.14500	Average Reward: 0.02350
Episode 960	Reward: 0.14500	Average Reward: 0.02500
Episode 961	Reward: 0.04500	Average Reward: 0.02500
Ep

Episode 1097	Reward: 0.09500	Average Reward: 0.01800
Episode 1098	Reward: -0.00500	Average Reward: 0.01800
Episode 1099	Reward: -0.00500	Average Reward: 0.01800
Episode 1100	Reward: -0.00500	Average Reward: 0.01800
Episode 1101	Reward: -0.00500	Average Reward: 0.01800
Episode 1102	Reward: 0.04500	Average Reward: 0.01800
Episode 1103	Reward: -0.00500	Average Reward: 0.01800
Episode 1104	Reward: -0.00500	Average Reward: 0.01800
Episode 1105	Reward: 0.04500	Average Reward: 0.01850
Episode 1106	Reward: 0.14500	Average Reward: 0.02000
Episode 1107	Reward: 0.04500	Average Reward: 0.02050
Episode 1108	Reward: -0.00500	Average Reward: 0.02050
Episode 1109	Reward: -0.00500	Average Reward: 0.02050
Episode 1110	Reward: -0.00500	Average Reward: 0.02050
Episode 1111	Reward: 0.04500	Average Reward: 0.02000
Episode 1112	Reward: 0.04500	Average Reward: 0.02050
Episode 1113	Reward: 0.04500	Average Reward: 0.02050
Episode 1114	Reward: 0.04500	Average Reward: 0.02100
Episode 1115	Reward: 0.04500	Average 

Episode 1251	Reward: 0.04500	Average Reward: 0.03000
Episode 1252	Reward: 0.04500	Average Reward: 0.03050
Episode 1253	Reward: -0.00500	Average Reward: 0.03050
Episode 1254	Reward: -0.00500	Average Reward: 0.03000
Episode 1255	Reward: -0.00500	Average Reward: 0.03000
Episode 1256	Reward: 0.04500	Average Reward: 0.02950
Episode 1257	Reward: -0.00500	Average Reward: 0.02900
Episode 1258	Reward: 0.04500	Average Reward: 0.02950
Episode 1259	Reward: 0.09500	Average Reward: 0.03000
Episode 1260	Reward: -0.00500	Average Reward: 0.03000
Episode 1261	Reward: 0.04500	Average Reward: 0.03000
Episode 1262	Reward: -0.00500	Average Reward: 0.03000
Episode 1263	Reward: 0.04500	Average Reward: 0.03050
Episode 1264	Reward: 0.04500	Average Reward: 0.03050
Episode 1265	Reward: 0.04500	Average Reward: 0.03100
Episode 1266	Reward: -0.00500	Average Reward: 0.03050
Episode 1267	Reward: 0.04500	Average Reward: 0.03050
Episode 1268	Reward: -0.00500	Average Reward: 0.03050
Episode 1269	Reward: -0.00500	Average 

Episode 1405	Reward: -0.00500	Average Reward: 0.03400
Episode 1406	Reward: -0.00500	Average Reward: 0.03400
Episode 1407	Reward: 0.04500	Average Reward: 0.03450
Episode 1408	Reward: 0.04500	Average Reward: 0.03450
Episode 1409	Reward: 0.04500	Average Reward: 0.03450
Episode 1410	Reward: 0.14500	Average Reward: 0.03550
Episode 1411	Reward: 0.04500	Average Reward: 0.03600
Episode 1412	Reward: -0.00500	Average Reward: 0.03600
Episode 1413	Reward: 0.04500	Average Reward: 0.03600
Episode 1414	Reward: 0.04500	Average Reward: 0.03600
Episode 1415	Reward: 0.04500	Average Reward: 0.03600
Episode 1416	Reward: 0.04500	Average Reward: 0.03600
Episode 1417	Reward: 0.04500	Average Reward: 0.03600
Episode 1418	Reward: 0.09500	Average Reward: 0.03650
Episode 1419	Reward: -0.00500	Average Reward: 0.03600
Episode 1420	Reward: 0.04500	Average Reward: 0.03600
Episode 1421	Reward: 0.04500	Average Reward: 0.03650
Episode 1422	Reward: 0.04500	Average Reward: 0.03700
Episode 1423	Reward: -0.00500	Average Rewa

Episode 1560	Reward: -0.00500	Average Reward: 0.02995
Episode 1561	Reward: 0.04500	Average Reward: 0.02995
Episode 1562	Reward: -0.00500	Average Reward: 0.02995
Episode 1563	Reward: 0.04500	Average Reward: 0.03045
Episode 1564	Reward: 0.04500	Average Reward: 0.03095
Episode 1565	Reward: 0.04500	Average Reward: 0.03145
Episode 1566	Reward: 0.04500	Average Reward: 0.03195
Episode 1567	Reward: 0.09500	Average Reward: 0.03295
Episode 1568	Reward: 0.04500	Average Reward: 0.03295
Episode 1569	Reward: -0.00500	Average Reward: 0.03245
Episode 1570	Reward: 0.04500	Average Reward: 0.03245
Episode 1571	Reward: 0.04500	Average Reward: 0.03245
Episode 1572	Reward: -0.00500	Average Reward: 0.03195
Episode 1573	Reward: -0.00500	Average Reward: 0.03145
Episode 1574	Reward: 0.04500	Average Reward: 0.03095
Episode 1575	Reward: 0.04500	Average Reward: 0.02995
Episode 1576	Reward: -0.00500	Average Reward: 0.02895
Episode 1577	Reward: -0.00500	Average Reward: 0.02845
Episode 1578	Reward: -0.00500	Average R

Episode 1714	Reward: 0.04500	Average Reward: 0.03350
Episode 1715	Reward: 0.04500	Average Reward: 0.03350
Episode 1716	Reward: -0.00500	Average Reward: 0.03300
Episode 1717	Reward: -0.00500	Average Reward: 0.03250
Episode 1718	Reward: 0.04500	Average Reward: 0.03250
Episode 1719	Reward: 0.04500	Average Reward: 0.03250
Episode 1720	Reward: -0.00500	Average Reward: 0.03250
Episode 1721	Reward: 0.09500	Average Reward: 0.03300
Episode 1722	Reward: -0.00500	Average Reward: 0.03300
Episode 1723	Reward: 0.04500	Average Reward: 0.03300
Episode 1724	Reward: -0.00500	Average Reward: 0.03250
Episode 1725	Reward: 0.14500	Average Reward: 0.03300
Episode 1726	Reward: -0.00500	Average Reward: 0.03200
Episode 1727	Reward: 0.04500	Average Reward: 0.03200
Episode 1728	Reward: 0.04500	Average Reward: 0.03250
Episode 1729	Reward: -0.00500	Average Reward: 0.03250
Episode 1730	Reward: -0.00500	Average Reward: 0.03200
Episode 1731	Reward: -0.00500	Average Reward: 0.03150
Episode 1732	Reward: 0.09500	Average 

Episode 1869	Reward: 0.19500	Average Reward: 0.03400
Episode 1870	Reward: 0.04500	Average Reward: 0.03400
Episode 1871	Reward: 0.14500	Average Reward: 0.03500
Episode 1872	Reward: -0.00500	Average Reward: 0.03500
Episode 1873	Reward: 0.04500	Average Reward: 0.03550
Episode 1874	Reward: 0.04500	Average Reward: 0.03550
Episode 1875	Reward: 0.09000	Average Reward: 0.03645
Episode 1876	Reward: 0.04500	Average Reward: 0.03695
Episode 1877	Reward: -0.00500	Average Reward: 0.03645
Episode 1878	Reward: 0.04500	Average Reward: 0.03645
Episode 1879	Reward: 0.04500	Average Reward: 0.03595
Episode 1880	Reward: -0.00500	Average Reward: 0.03545
Episode 1881	Reward: 0.04500	Average Reward: 0.03495
Episode 1882	Reward: 0.04500	Average Reward: 0.03545
Episode 1883	Reward: 0.09500	Average Reward: 0.03595
Episode 1884	Reward: -0.00500	Average Reward: 0.03595
Episode 1885	Reward: 0.09500	Average Reward: 0.03695
Episode 1886	Reward: 0.09500	Average Reward: 0.03745
Episode 1887	Reward: 0.04500	Average Rewar

Episode 2025	Reward: 0.04500	Average Reward: 0.03250
Episode 2026	Reward: 0.04500	Average Reward: 0.03250
Episode 2027	Reward: 0.04500	Average Reward: 0.03300
Episode 2028	Reward: 0.04500	Average Reward: 0.03350
Episode 2029	Reward: 0.04500	Average Reward: 0.03300
Episode 2030	Reward: -0.00500	Average Reward: 0.03250
Episode 2031	Reward: -0.00500	Average Reward: 0.03150
Episode 2032	Reward: 0.04500	Average Reward: 0.03150
Episode 2033	Reward: 0.04500	Average Reward: 0.03200
Episode 2034	Reward: -0.00500	Average Reward: 0.03200
Episode 2035	Reward: 0.04500	Average Reward: 0.03250
Episode 2036	Reward: 0.04500	Average Reward: 0.03250
Episode 2037	Reward: -0.00500	Average Reward: 0.03200
Episode 2038	Reward: 0.04500	Average Reward: 0.03250
Episode 2039	Reward: 0.04500	Average Reward: 0.03250
Episode 2040	Reward: 0.04500	Average Reward: 0.03300
Episode 2041	Reward: 0.04500	Average Reward: 0.03250
Episode 2042	Reward: -0.00500	Average Reward: 0.03250
Episode 2043	Reward: 0.04500	Average Rewa

Episode 2180	Reward: 0.04500	Average Reward: 0.03250
Episode 2181	Reward: -0.00500	Average Reward: 0.03150
Episode 2182	Reward: 0.04500	Average Reward: 0.03150
Episode 2183	Reward: 0.04500	Average Reward: 0.03200
Episode 2184	Reward: 0.04500	Average Reward: 0.03200
Episode 2185	Reward: -0.00500	Average Reward: 0.03150
Episode 2186	Reward: -0.00500	Average Reward: 0.03150
Episode 2187	Reward: 0.04500	Average Reward: 0.03200
Episode 2188	Reward: 0.09500	Average Reward: 0.03300
Episode 2189	Reward: 0.04500	Average Reward: 0.03350
Episode 2190	Reward: 0.04500	Average Reward: 0.03350
Episode 2191	Reward: 0.04500	Average Reward: 0.03350
Episode 2192	Reward: 0.04500	Average Reward: 0.03350
Episode 2193	Reward: 0.14500	Average Reward: 0.03450
Episode 2194	Reward: -0.00500	Average Reward: 0.03450
Episode 2195	Reward: -0.00500	Average Reward: 0.03400
Episode 2196	Reward: -0.00500	Average Reward: 0.03350
Episode 2197	Reward: 0.04500	Average Reward: 0.03350
Episode 2198	Reward: -0.00500	Average Re

Episode 2335	Reward: 0.09500	Average Reward: 0.03645
Episode 2336	Reward: 0.04500	Average Reward: 0.03645
Episode 2337	Reward: -0.00500	Average Reward: 0.03645
Episode 2338	Reward: 0.04500	Average Reward: 0.03595
Episode 2339	Reward: 0.09500	Average Reward: 0.03645
Episode 2340	Reward: 0.04500	Average Reward: 0.03595
Episode 2341	Reward: -0.00500	Average Reward: 0.03545
Episode 2342	Reward: 0.04500	Average Reward: 0.03595
Episode 2343	Reward: 0.09500	Average Reward: 0.03695
Episode 2344	Reward: 0.04500	Average Reward: 0.03745
Episode 2345	Reward: 0.04500	Average Reward: 0.03745
Episode 2346	Reward: -0.00500	Average Reward: 0.03645
Episode 2347	Reward: 0.04500	Average Reward: 0.03645
Episode 2348	Reward: -0.00500	Average Reward: 0.03645
Episode 2349	Reward: 0.04500	Average Reward: 0.03595
Episode 2350	Reward: 0.04500	Average Reward: 0.03645
Episode 2351	Reward: 0.04500	Average Reward: 0.03645
Episode 2352	Reward: 0.09500	Average Reward: 0.03695
Episode 2353	Reward: 0.04500	Average Rewar

Episode 2490	Reward: 0.04500	Average Reward: 0.04100
Episode 2491	Reward: -0.00500	Average Reward: 0.04100
Episode 2492	Reward: 0.09500	Average Reward: 0.04150
Episode 2493	Reward: -0.00500	Average Reward: 0.04150
Episode 2494	Reward: 0.04500	Average Reward: 0.04100
Episode 2495	Reward: 0.04500	Average Reward: 0.04050
Episode 2496	Reward: 0.04500	Average Reward: 0.04100
Episode 2497	Reward: 0.04500	Average Reward: 0.04150
Episode 2498	Reward: 0.09500	Average Reward: 0.04250
Episode 2499	Reward: 0.09500	Average Reward: 0.04300
Episode 2500	Reward: 0.09500	Average Reward: 0.04400
Episode 2501	Reward: 0.04500	Average Reward: 0.04400
Episode 2502	Reward: -0.00500	Average Reward: 0.04350
Episode 2503	Reward: -0.00500	Average Reward: 0.04300
Episode 2504	Reward: -0.00500	Average Reward: 0.04250
Episode 2505	Reward: 0.09500	Average Reward: 0.04300
Episode 2506	Reward: 0.09500	Average Reward: 0.04350
Episode 2507	Reward: 0.04500	Average Reward: 0.04350
Episode 2508	Reward: 0.04500	Average Rewa

Episode 2645	Reward: 0.04500	Average Reward: 0.04250
Episode 2646	Reward: -0.00500	Average Reward: 0.04200
Episode 2647	Reward: 0.09500	Average Reward: 0.04250
Episode 2648	Reward: 0.04500	Average Reward: 0.04250
Episode 2649	Reward: 0.04500	Average Reward: 0.04200
Episode 2650	Reward: 0.04500	Average Reward: 0.04250
Episode 2651	Reward: 0.04500	Average Reward: 0.04300
Episode 2652	Reward: 0.04500	Average Reward: 0.04300
Episode 2653	Reward: 0.04500	Average Reward: 0.04200
Episode 2654	Reward: 0.04500	Average Reward: 0.04250
Episode 2655	Reward: 0.04500	Average Reward: 0.04200
Episode 2656	Reward: 0.04500	Average Reward: 0.04100
Episode 2657	Reward: -0.00500	Average Reward: 0.04100
Episode 2658	Reward: 0.04500	Average Reward: 0.04050
Episode 2659	Reward: 0.04500	Average Reward: 0.04050
Episode 2660	Reward: 0.04500	Average Reward: 0.04100
Episode 2661	Reward: 0.04500	Average Reward: 0.04100
Episode 2662	Reward: 0.04500	Average Reward: 0.04150
Episode 2663	Reward: 0.04500	Average Reward:

Episode 2799	Reward: 0.14500	Average Reward: 0.05395
Episode 2800	Reward: 0.09500	Average Reward: 0.05445
Episode 2801	Reward: 0.04500	Average Reward: 0.05445
Episode 2802	Reward: 0.04500	Average Reward: 0.05395
Episode 2803	Reward: 0.09500	Average Reward: 0.05395
Episode 2804	Reward: -0.00500	Average Reward: 0.05345
Episode 2805	Reward: 0.04500	Average Reward: 0.05300
Episode 2806	Reward: 0.09500	Average Reward: 0.05400
Episode 2807	Reward: 0.04500	Average Reward: 0.05350
Episode 2808	Reward: 0.09500	Average Reward: 0.05450
Episode 2809	Reward: 0.09500	Average Reward: 0.05550
Episode 2810	Reward: 0.09500	Average Reward: 0.05600
Episode 2811	Reward: 0.09500	Average Reward: 0.05600
Episode 2812	Reward: 0.04500	Average Reward: 0.05550
Episode 2813	Reward: 0.04500	Average Reward: 0.05550
Episode 2814	Reward: 0.09500	Average Reward: 0.05650
Episode 2815	Reward: 0.04500	Average Reward: 0.05600
Episode 2816	Reward: -0.00500	Average Reward: 0.05550
Episode 2817	Reward: 0.09500	Average Reward:

Episode 2954	Reward: 0.04500	Average Reward: 0.04395
Episode 2955	Reward: 0.04500	Average Reward: 0.04350
Episode 2956	Reward: 0.04500	Average Reward: 0.04350
Episode 2957	Reward: 0.04500	Average Reward: 0.04350
Episode 2958	Reward: 0.04500	Average Reward: 0.04300
Episode 2959	Reward: -0.00500	Average Reward: 0.04300
Episode 2960	Reward: 0.04500	Average Reward: 0.04300
Episode 2961	Reward: -0.00500	Average Reward: 0.04250
Episode 2962	Reward: 0.04500	Average Reward: 0.04300
Episode 2963	Reward: 0.04500	Average Reward: 0.04350
Episode 2964	Reward: 0.04500	Average Reward: 0.04350
Episode 2965	Reward: -0.00500	Average Reward: 0.04250
Episode 2966	Reward: 0.04500	Average Reward: 0.04250
Episode 2967	Reward: 0.09500	Average Reward: 0.04350
Episode 2968	Reward: 0.04500	Average Reward: 0.04350
Episode 2969	Reward: 0.04500	Average Reward: 0.04400
Episode 2970	Reward: 0.04500	Average Reward: 0.04400
Episode 2971	Reward: 0.14500	Average Reward: 0.04500
Episode 2972	Reward: -0.00500	Average Rewar

Episode 3108	Reward: -0.00500	Average Reward: 0.04000
Episode 3109	Reward: 0.29500	Average Reward: 0.04250
Episode 3110	Reward: 0.04500	Average Reward: 0.04250
Episode 3111	Reward: -0.00500	Average Reward: 0.04200
Episode 3112	Reward: 0.09500	Average Reward: 0.04300
Episode 3113	Reward: 0.04500	Average Reward: 0.04300
Episode 3114	Reward: 0.04500	Average Reward: 0.04200
Episode 3115	Reward: 0.04500	Average Reward: 0.04200
Episode 3116	Reward: 0.04500	Average Reward: 0.04200
Episode 3117	Reward: 0.09500	Average Reward: 0.04250
Episode 3118	Reward: 0.09500	Average Reward: 0.04350
Episode 3119	Reward: 0.04500	Average Reward: 0.04350
Episode 3120	Reward: 0.04500	Average Reward: 0.04350
Episode 3121	Reward: -0.00500	Average Reward: 0.04300
Episode 3122	Reward: -0.00500	Average Reward: 0.04200
Episode 3123	Reward: 0.19500	Average Reward: 0.04350
Episode 3124	Reward: 0.09500	Average Reward: 0.04400
Episode 3125	Reward: 0.04500	Average Reward: 0.04400
Episode 3126	Reward: 0.04500	Average Rewar

Episode 3263	Reward: -0.00500	Average Reward: 0.04495
Episode 3264	Reward: 0.09500	Average Reward: 0.04595
Episode 3265	Reward: -0.00500	Average Reward: 0.04495
Episode 3266	Reward: 0.04500	Average Reward: 0.04495
Episode 3267	Reward: 0.04500	Average Reward: 0.04445
Episode 3268	Reward: 0.09500	Average Reward: 0.04545
Episode 3269	Reward: 0.04500	Average Reward: 0.04595
Episode 3270	Reward: 0.04500	Average Reward: 0.04595
Episode 3271	Reward: 0.04500	Average Reward: 0.04495
Episode 3272	Reward: 0.04500	Average Reward: 0.04495
Episode 3273	Reward: 0.04500	Average Reward: 0.04545
Episode 3274	Reward: 0.14500	Average Reward: 0.04645
Episode 3275	Reward: 0.04500	Average Reward: 0.04545
Episode 3276	Reward: 0.09500	Average Reward: 0.04595
Episode 3277	Reward: -0.00500	Average Reward: 0.04595
Episode 3278	Reward: 0.04500	Average Reward: 0.04595
Episode 3279	Reward: 0.09500	Average Reward: 0.04695
Episode 3280	Reward: -0.00500	Average Reward: 0.04695
Episode 3281	Reward: -0.00500	Average Rewa

Episode 3418	Reward: 0.09500	Average Reward: 0.04700
Episode 3419	Reward: 0.04500	Average Reward: 0.04700
Episode 3420	Reward: 0.04500	Average Reward: 0.04750
Episode 3421	Reward: 0.04500	Average Reward: 0.04750
Episode 3422	Reward: 0.04500	Average Reward: 0.04650
Episode 3423	Reward: 0.09500	Average Reward: 0.04700
Episode 3424	Reward: 0.04500	Average Reward: 0.04650
Episode 3425	Reward: 0.04500	Average Reward: 0.04700
Episode 3426	Reward: -0.00500	Average Reward: 0.04700
Episode 3427	Reward: 0.04500	Average Reward: 0.04700
Episode 3428	Reward: 0.04500	Average Reward: 0.04700
Episode 3429	Reward: 0.04500	Average Reward: 0.04700
Episode 3430	Reward: 0.04500	Average Reward: 0.04750
Episode 3431	Reward: 0.04500	Average Reward: 0.04750
Episode 3432	Reward: 0.04500	Average Reward: 0.04700
Episode 3433	Reward: 0.09500	Average Reward: 0.04800
Episode 3434	Reward: 0.04500	Average Reward: 0.04800
Episode 3435	Reward: 0.04500	Average Reward: 0.04800
Episode 3436	Reward: 0.04500	Average Reward: 

Episode 3572	Reward: -0.00500	Average Reward: 0.04595
Episode 3573	Reward: 0.04500	Average Reward: 0.04595
Episode 3574	Reward: 0.04500	Average Reward: 0.04595
Episode 3575	Reward: 0.04500	Average Reward: 0.04595
Episode 3576	Reward: 0.14500	Average Reward: 0.04695
Episode 3577	Reward: 0.09500	Average Reward: 0.04645
Episode 3578	Reward: -0.00500	Average Reward: 0.04545
Episode 3579	Reward: -0.00500	Average Reward: 0.04495
Episode 3580	Reward: 0.04500	Average Reward: 0.04495
Episode 3581	Reward: 0.04500	Average Reward: 0.04495
Episode 3582	Reward: -0.00500	Average Reward: 0.04445
Episode 3583	Reward: 0.04500	Average Reward: 0.04495
Episode 3584	Reward: 0.09500	Average Reward: 0.04595
Episode 3585	Reward: 0.04500	Average Reward: 0.04545
Episode 3586	Reward: 0.09500	Average Reward: 0.04645
Episode 3587	Reward: -0.00500	Average Reward: 0.04595
Episode 3588	Reward: 0.09500	Average Reward: 0.04645
Episode 3589	Reward: 0.04500	Average Reward: 0.04695
Episode 3590	Reward: 0.04500	Average Rewa

Episode 3727	Reward: 0.09500	Average Reward: 0.05190
Episode 3728	Reward: 0.09500	Average Reward: 0.05240
Episode 3729	Reward: 0.04500	Average Reward: 0.05240
Episode 3730	Reward: 0.14500	Average Reward: 0.05290
Episode 3731	Reward: 0.09500	Average Reward: 0.05290
Episode 3732	Reward: 0.09500	Average Reward: 0.05340
Episode 3733	Reward: 0.04500	Average Reward: 0.05390
Episode 3734	Reward: 0.04500	Average Reward: 0.05340
Episode 3735	Reward: 0.14500	Average Reward: 0.05390
Episode 3736	Reward: -0.00500	Average Reward: 0.05390
Episode 3737	Reward: 0.04500	Average Reward: 0.05440
Episode 3738	Reward: 0.09500	Average Reward: 0.05540
Episode 3739	Reward: 0.04500	Average Reward: 0.05540
Episode 3740	Reward: 0.04500	Average Reward: 0.05540
Episode 3741	Reward: 0.04500	Average Reward: 0.05540
Episode 3742	Reward: -0.00500	Average Reward: 0.05490
Episode 3743	Reward: 0.04500	Average Reward: 0.05490
Episode 3744	Reward: -0.00500	Average Reward: 0.05440
Episode 3745	Reward: 0.14500	Average Reward

In [None]:
# Close the Unity environment once training is finished.
env.close()