DEEP REINFORCEMENT LEARNING EXPLAINED - 06
# **Solving Frozen-Lake Environment With Cross-Entropy Method**
## Agent Creation Using Deep Neural Networks

 

## The Environment

In [1]:
import numpy as np

import torch
import torch.nn as nn

In [2]:
import gym
import gym.spaces

# Create the Frozen-Lake environment with slipping turned off (is_slippery=False).
env = gym.make('FrozenLake-v0', is_slippery=False)

In [3]:
class OneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a discrete state index into a one-hot vector.

    The wrapped environment must expose a discrete observation space with an
    ``n`` attribute; the wrapper advertises a float32 ``Box`` of that length.
    """

    def __init__(self, env):
        super().__init__(env)
        n_states = env.observation_space.n
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(n_states,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a fresh vector that is 1.0 at ``observation`` and 0.0 elsewhere."""
        # Box.low is the all-zeros lower bound; copy it so the space is not mutated.
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

# Wrap the environment so every observation is returned as a one-hot vector.
env = OneHotWrapper(env)

## The Agent
 ### The Model

In [4]:
# Layer sizes are read from the wrapped environment's observation/action spaces.
obs_size = env.observation_space.shape[0] # 16
n_actions = env.action_space.n  # 4
HIDDEN_SIZE = 32


# One hidden layer; the final Linear emits one raw score per action
# (softmax is applied separately where probabilities are needed).
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.Sigmoid(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )

### Get an Action

In [5]:
sm = nn.Softmax(dim=1)


def select_action(state, model=None):
    """Sample an action from the policy's softmax distribution for one state.

    Args:
        state: A single observation vector (1-D array-like of floats). It is
            turned into a batch of one before being fed to the model.
        model: Callable mapping a ``(1, obs_size)`` float tensor to raw action
            scores. Defaults to the module-level ``net``, looked up at call
            time so that re-assigning ``net`` takes effect.

    Returns:
        The index of the sampled action, as returned by ``np.random.choice``.
    """
    if model is None:
        model = net
    # Convert through one contiguous float32 array instead of a list of arrays,
    # which torch converts element by element.
    state_t = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    # Action selection is pure inference: no autograd graph is needed.
    with torch.no_grad():
        act_probs = sm(model(state_t)).numpy()[0]
    return np.random.choice(len(act_probs), p=act_probs)

### Optimizer and Loss function

In [6]:
import torch.optim as optim

# Cross-entropy loss over the network's raw action scores, minimised with Adam
# on the parameters of the current global `net`.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

## Training the Agent

In [7]:
from collections import namedtuple

# Number of finished episodes collected before each elite-selection step.
BATCH_SIZE = 100
# Per-step discount applied to an episode's reward when ranking episodes.
GAMMA = 0.9
# Episodes whose discounted reward is above this percentile are kept as "elite".
PERCENTILE = 30
# The training loops stop once a batch's mean reward reaches this value.
REWARD_GOAL = 0.8

# One finished episode: its total (undiscounted) reward and its list of steps.
Episode = namedtuple('Episode', ['reward', 'steps'])
# One environment step: the observation the agent saw and the action it took.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])


In [8]:
# Cross-entropy training, run inline at notebook level: play episodes with the
# current policy and, after every BATCH_SIZE episodes, fit the network to the
# "elite" ones. Stops once a batch's mean reward reaches REWARD_GOAL.
iter_no = 0
reward_mean = 0
full_batch = []
batch = []
episode_steps = []
episode_reward = 0.0
state = env.reset()

while reward_mean < REWARD_GOAL:
    action = select_action(state)
    next_state, reward, episode_is_done, _ = env.step(action)

    # Store the observation the action was chosen from, paired with that action.
    episode_steps.append(EpisodeStep(observation=state, action=action))
    episode_reward += reward

    if episode_is_done:
        # Episode finished: file it in the batch and start a new one.
        batch.append(Episode(reward=episode_reward, steps=episode_steps))
        next_state = env.reset()
        episode_steps = []
        episode_reward = 0.0

        if len(batch) == BATCH_SIZE:
            # Full batch collected --> select the "elite" episodes.
            reward_mean = float(np.mean([ep.reward for ep in batch]))
            elite_candidates = batch
            # Rank episodes by their reward discounted over the episode length.
            returnG = [ep.reward * (GAMMA ** len(ep.steps)) for ep in elite_candidates]
            reward_bound = np.percentile(returnG, PERCENTILE)

            train_obs = []
            train_act = []
            elite_batch = []
            for example, discounted_reward in zip(elite_candidates, returnG):
                if discounted_reward > reward_bound:
                    train_obs.extend(step.observation for step in example.steps)
                    train_act.extend(step.action for step in example.steps)
                    elite_batch.append(example)
            full_batch = elite_batch
            state = train_obs
            acts = train_act

            if full_batch:
                # Only train when at least one episode beat the bound.
                state_t = torch.FloatTensor(state)
                acts_t = torch.LongTensor(acts)
                optimizer.zero_grad()
                action_scores_t = net(state_t)
                loss_t = objective(action_scores_t, acts_t)
                loss_t.backward()
                optimizer.step()
                print("%d: loss=%.3f, reward_mean=%.3f" % (iter_no, loss_t.item(), reward_mean))
                iter_no += 1
            batch = []
    # `state` may have been reused for the training observations above;
    # point it back at the environment observation for the next step.
    state = next_state


0: loss=1.352, reward_mean=0.010
1: loss=1.416, reward_mean=0.010
2: loss=1.434, reward_mean=0.020
3: loss=1.361, reward_mean=0.010
4: loss=1.366, reward_mean=0.010
5: loss=1.318, reward_mean=0.010
6: loss=1.321, reward_mean=0.020
7: loss=1.373, reward_mean=0.020
8: loss=1.370, reward_mean=0.030
9: loss=1.338, reward_mean=0.030
10: loss=1.297, reward_mean=0.010
11: loss=1.366, reward_mean=0.020
12: loss=1.322, reward_mean=0.020
13: loss=1.367, reward_mean=0.010
14: loss=1.345, reward_mean=0.010
15: loss=1.275, reward_mean=0.030
16: loss=1.350, reward_mean=0.010
17: loss=1.252, reward_mean=0.010
18: loss=1.274, reward_mean=0.050
19: loss=1.295, reward_mean=0.040
20: loss=1.306, reward_mean=0.030
21: loss=1.294, reward_mean=0.020
22: loss=1.282, reward_mean=0.060
23: loss=1.304, reward_mean=0.030
24: loss=1.293, reward_mean=0.020
25: loss=1.293, reward_mean=0.060
26: loss=1.279, reward_mean=0.060
27: loss=1.204, reward_mean=0.030
28: loss=1.271, reward_mean=0.040
29: loss=1.316, reward_m

239: loss=0.792, reward_mean=0.150
240: loss=0.858, reward_mean=0.090
241: loss=0.715, reward_mean=0.170
242: loss=0.705, reward_mean=0.210
243: loss=0.715, reward_mean=0.130
244: loss=0.736, reward_mean=0.210
245: loss=0.758, reward_mean=0.220
246: loss=0.736, reward_mean=0.200
247: loss=0.730, reward_mean=0.240
248: loss=0.727, reward_mean=0.220
249: loss=0.763, reward_mean=0.190
250: loss=0.727, reward_mean=0.240
251: loss=0.754, reward_mean=0.140
252: loss=0.739, reward_mean=0.200
253: loss=0.753, reward_mean=0.150
254: loss=0.701, reward_mean=0.180
255: loss=0.775, reward_mean=0.170
256: loss=0.745, reward_mean=0.280
257: loss=0.740, reward_mean=0.200
258: loss=0.688, reward_mean=0.240
259: loss=0.728, reward_mean=0.240
260: loss=0.730, reward_mean=0.180
261: loss=0.786, reward_mean=0.180
262: loss=0.809, reward_mean=0.270
263: loss=0.806, reward_mean=0.270
264: loss=0.765, reward_mean=0.230
265: loss=0.748, reward_mean=0.180
266: loss=0.786, reward_mean=0.200
267: loss=0.746, rew

475: loss=0.604, reward_mean=0.430
476: loss=0.630, reward_mean=0.390
477: loss=0.640, reward_mean=0.480
478: loss=0.553, reward_mean=0.430
479: loss=0.612, reward_mean=0.470
480: loss=0.572, reward_mean=0.450
481: loss=0.576, reward_mean=0.470
482: loss=0.576, reward_mean=0.450
483: loss=0.574, reward_mean=0.460
484: loss=0.607, reward_mean=0.410
485: loss=0.559, reward_mean=0.500
486: loss=0.582, reward_mean=0.450
487: loss=0.630, reward_mean=0.490
488: loss=0.604, reward_mean=0.450
489: loss=0.597, reward_mean=0.530
490: loss=0.613, reward_mean=0.380
491: loss=0.616, reward_mean=0.460
492: loss=0.570, reward_mean=0.550
493: loss=0.530, reward_mean=0.490
494: loss=0.602, reward_mean=0.490
495: loss=0.598, reward_mean=0.500
496: loss=0.578, reward_mean=0.430
497: loss=0.578, reward_mean=0.560
498: loss=0.545, reward_mean=0.460
499: loss=0.587, reward_mean=0.520
500: loss=0.588, reward_mean=0.510
501: loss=0.545, reward_mean=0.510
502: loss=0.579, reward_mean=0.510
503: loss=0.603, rew

## Test the Agent

In [9]:
# Roll out one episode on a fresh non-slippery environment, rendering every step.
test_env = OneHotWrapper(gym.make('FrozenLake-v0', is_slippery=False))
state= test_env.reset()
test_env.render()

is_done = False

# Actions are sampled through select_action, so repeated runs can differ.
while not is_done:
    action = select_action(state)
    new_state, reward, is_done, _ = test_env.step(action)
    test_env.render()
    state = new_state

# Reward returned by the final step of the episode.
print("reward = ", reward)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
reward =  1.0


----

DEEP REINFORCEMENT LEARNING EXPLAINED - 07
# **Cross-Entropy Method Performance Analysis**
## Implementation of the Cross-Entropy Training Loop

In [10]:
%load_ext tensorboard

ModuleNotFoundError: No module named 'tensorboard'

In [None]:
from torch.utils.tensorboard import SummaryWriter

def train_loop(run_comment="-Frozen-Lake-nonslippery"):
    """Train the global ``net`` with the cross-entropy method, logging to TensorBoard.

    Episodes are played on the global ``env`` using ``select_action``. After
    every ``BATCH_SIZE`` finished episodes, the episodes whose discounted
    reward is strictly above the ``PERCENTILE``-th percentile of the batch are
    treated as "elite" and the network is fitted to their
    (observation, action) pairs with the global ``objective`` and
    ``optimizer``. The loop ends once the mean undiscounted reward of a batch
    reaches ``REWARD_GOAL``.

    Args:
        run_comment: Suffix passed to ``SummaryWriter(comment=...)``. The
            default is the value that used to be hard-coded.
    """
    writer = SummaryWriter(comment=run_comment)
    try:
        iter_no = 0
        reward_mean = 0
        batch = []
        episode_steps = []
        episode_reward = 0.0
        state = env.reset()

        while reward_mean < REWARD_GOAL:
            action = select_action(state)
            next_state, reward, episode_is_done, _ = env.step(action)

            # Pair the action with the observation it was chosen from.
            episode_steps.append(EpisodeStep(observation=state, action=action))
            episode_reward += reward

            if episode_is_done:
                batch.append(Episode(reward=episode_reward, steps=episode_steps))
                next_state = env.reset()
                episode_steps = []
                episode_reward = 0.0

                if len(batch) == BATCH_SIZE:
                    # Full batch collected --> select the "elite" episodes.
                    reward_mean = float(np.mean([ep.reward for ep in batch]))
                    # Discounting by length ranks shorter rewarded episodes higher.
                    returns = [ep.reward * (GAMMA ** len(ep.steps)) for ep in batch]
                    reward_bound = np.percentile(returns, PERCENTILE)

                    train_obs = []
                    train_act = []
                    for episode, discounted_reward in zip(batch, returns):
                        if discounted_reward > reward_bound:
                            train_obs.extend(step.observation for step in episode.steps)
                            train_act.extend(step.action for step in episode.steps)

                    # Every stored episode has at least one step, so an empty
                    # train_obs means no episode beat the bound this round.
                    if train_obs:
                        obs_t = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
                        acts_t = torch.LongTensor(train_act)

                        optimizer.zero_grad()
                        action_scores_t = net(obs_t)
                        loss_t = objective(action_scores_t, acts_t)
                        loss_t.backward()
                        optimizer.step()
                        writer.add_scalar("loss", loss_t.item(), iter_no)
                        writer.add_scalar("reward_mean", reward_mean, iter_no)
                        iter_no += 1
                    batch = []
            state = next_state
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()


### Baseline

In [None]:
# Baseline configuration: 32 hidden units, sigmoid activation.
# net, objective and optimizer are rebound as globals because train_loop()
# reads them from module scope.
HIDDEN_SIZE = 32
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.Sigmoid(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### More complex Neural Network



In [None]:
# Wider network: 128 hidden units, still with a sigmoid activation.
# A new net and optimizer are built so train_loop() starts from new parameters.
HIDDEN_SIZE = 128
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.Sigmoid(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### ReLU activation function

In [None]:
# 128 hidden units with ReLU in place of the sigmoid activation.
# A new net and optimizer are built so train_loop() starts from new parameters.
HIDDEN_SIZE = 128
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

train_loop()

In [None]:
tensorboard  --logdir=runs

### Improving Cross-Entropy Algorithm

In [None]:
def improved_train_loop(run_comment="-Frozen-Lake-nonslippery"):
    """Cross-entropy training that carries elite episodes over between batches.

    Episodes are played on the global ``env`` using ``select_action``. After
    every ``BATCH_SIZE`` finished episodes, the new batch is pooled with the
    elite episodes kept from the previous round; episodes whose discounted
    reward is strictly above the ``PERCENTILE``-th percentile of that pool
    become the new elite set, and the global ``net`` is fitted to their
    (observation, action) pairs with the global ``objective`` and
    ``optimizer``. The loop ends once the mean undiscounted reward of a new
    batch reaches ``REWARD_GOAL``.

    Args:
        run_comment: Suffix passed to ``SummaryWriter(comment=...)``. The
            default is the value that used to be hard-coded.
    """
    writer = SummaryWriter(comment=run_comment)
    try:
        iter_no = 0
        reward_mean = 0
        full_batch = []  # elite episodes carried over from the previous round
        batch = []
        episode_steps = []
        episode_reward = 0.0
        state = env.reset()

        while reward_mean < REWARD_GOAL:
            action = select_action(state)
            next_state, reward, episode_is_done, _ = env.step(action)

            # Pair the action with the observation it was chosen from.
            episode_steps.append(EpisodeStep(observation=state, action=action))
            episode_reward += reward

            if episode_is_done:
                batch.append(Episode(reward=episode_reward, steps=episode_steps))
                next_state = env.reset()
                episode_steps = []
                episode_reward = 0.0

                if len(batch) == BATCH_SIZE:
                    # The stopping criterion looks at the new episodes only.
                    reward_mean = float(np.mean([ep.reward for ep in batch]))

                    # Previously kept elites compete with the new batch.
                    elite_candidates = full_batch + batch
                    returns = [
                        ep.reward * (GAMMA ** len(ep.steps)) for ep in elite_candidates
                    ]
                    reward_bound = np.percentile(returns, PERCENTILE)

                    train_obs = []
                    train_act = []
                    elite_batch = []
                    for episode, discounted_reward in zip(elite_candidates, returns):
                        if discounted_reward > reward_bound:
                            train_obs.extend(step.observation for step in episode.steps)
                            train_act.extend(step.action for step in episode.steps)
                            elite_batch.append(episode)
                    full_batch = elite_batch

                    # Skip the update when no episode beat the bound this round.
                    if full_batch:
                        obs_t = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
                        acts_t = torch.LongTensor(train_act)

                        optimizer.zero_grad()
                        action_scores_t = net(obs_t)
                        loss_t = objective(action_scores_t, acts_t)
                        loss_t.backward()
                        optimizer.step()
                        writer.add_scalar("loss", loss_t.item(), iter_no)
                        writer.add_scalar("reward_mean", reward_mean, iter_no)
                        iter_no += 1
                    batch = []
            state = next_state
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

In [None]:
# Same 128-unit ReLU network as the previous experiment, this time trained
# with improved_train_loop(), which reads net/objective/optimizer from module scope.
HIDDEN_SIZE = 128
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

improved_train_loop()

In [None]:
tensorboard  --logdir=runs

In [None]:
# Slippery variant of the environment (is_slippery=True).
slippedy_env = gym.make('FrozenLake-v0', is_slippery=True)

class OneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that turns a discrete state index into a one-hot vector.

    The wrapped environment must expose a discrete observation space with an
    ``n`` attribute; the wrapper advertises a float32 ``Box`` of that length.
    """

    def __init__(self, env):
        super().__init__(env)
        n_states = env.observation_space.n
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(n_states,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a fresh vector that is 1.0 at ``observation`` and 0.0 elsewhere."""
        # Box.low is the all-zeros lower bound; copy it so the space is not mutated.
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

# Rebind the global env: improved_train_loop() steps whatever `env` currently is.
env = OneHotWrapper(slippedy_env)

# obs_size and n_actions are reused from the earlier model cell.
HIDDEN_SIZE = 128
net= nn.Sequential(
            nn.Linear(obs_size, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Linear(HIDDEN_SIZE, n_actions)
        )
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

# NOTE(review): improved_train_loop() only returns once a batch's mean reward
# reaches REWARD_GOAL -- confirm that target is reachable on the slippery map.
improved_train_loop()

In [None]:
tensorboard  --logdir=runs