In [1]:
import gym
import numpy as np
import torch.nn as nn

In [19]:
HIDDEN_SIZE = 128
BATCH_SIZE = 100
GAMMA = 0.9
PERCENTILE = 30

In [3]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net,self).__init__()
        self.net = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, n_actions)
        )
        
    def forward(self,x):
        return self.net(x)
    

In [4]:
from collections import namedtuple

In [5]:
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation','action'])

In [6]:
import torch

In [31]:
def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs=env.reset()
    sm = nn.Softmax(dim=1)
    
    while True:
        
#         if len(batch)==batch_size-1:
#             env.render()
        
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        
        action = np.random.choice(len(act_probs), p = act_probs)
        next_obs, rew, is_done, _ = env.step(action)
        
        episode_reward+=rew
        episode_steps.append(EpisodeStep(observation = obs, action = action))
        
        if is_done:
            
#             print(episode_reward)
            
            batch.append(Episode(reward = episode_reward, steps = episode_steps))
            episode_reward = 0.0
            episode_steps=[]
            next_obs = env.reset()
            if len(batch)==batch_size:
                yield batch
                batch = []
             
        obs = next_obs

In [32]:
def filter_batch(batch, percentile):
    disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))
    reward_bound = np.percentile(disc_rewards, percentile)
#     reward_mean = float(np.mean(rewards))
    train_obs = []
    train_act = []
    elite_batch=[]
    for example, discounted_reward in zip(batch, disc_rewards):
        if discounted_reward>reward_bound:
            train_obs.extend(map(lambda step: step.observation, example.steps))
            train_act.extend(map(lambda step: step.action, example.steps))
            elite_batch.append(example)
    
#     train_obs_v = torch.FloatTensor(train_obs)
#     train_act_v = torch.LongTensor(train_act)
#     return train_obs_v, train_act_v, reward_bound, reward_mean
    return elite_batch, train_obs, train_act, reward_bound

In [9]:
from tensorboardX import SummaryWriter
import torch.optim as optim

In [10]:
#explore env

env = gym.make("FrozenLake-v0")
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [15]:
env.observation_space

Discrete(16)

In [13]:
env.action_space

Discrete(4)

In [33]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)
    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

In [36]:
import random
# main run
random.seed(12345)
# # env = gym.make("FrozenLake-v0")
# env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))

#non slippery version
env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(is_slippery=False)
env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
env = DiscreteOneHotWrapper(env)

obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params = net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-nonslip-tweaked")

In [38]:
#loop through

full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    reward_m = float(np.mean(list(map(lambda s: s.reward, batch))))
    full_batch, obs, acts, reward_b = filter_batch(full_batch+batch, PERCENTILE)
    if not full_batch:
        continue
    obs_v = torch.FloatTensor(obs)
    act_v = torch.LongTensor(acts)
    
    full_batch = full_batch[-500:]
    
    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, act_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f ,batch=%d"% (
        iter_no, loss_v.item(), reward_m, reward_b,len(full_batch)))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)
    
    if reward_m > 0.8:
        print("Solved!")
        break
        
writer.close()
   

0: loss=1.273, reward_mean=0.0, reward_bound=0.0 ,batch=2
1: loss=1.298, reward_mean=0.1, reward_bound=0.0 ,batch=7
2: loss=1.305, reward_mean=0.0, reward_bound=0.0 ,batch=11
3: loss=1.301, reward_mean=0.0, reward_bound=0.0 ,batch=11
4: loss=1.296, reward_mean=0.0, reward_bound=0.0 ,batch=13
5: loss=1.275, reward_mean=0.1, reward_bound=0.0 ,batch=20
6: loss=1.282, reward_mean=0.0, reward_bound=0.0 ,batch=24
7: loss=1.275, reward_mean=0.1, reward_bound=0.0 ,batch=31
8: loss=1.266, reward_mean=0.1, reward_bound=0.0 ,batch=38
9: loss=1.261, reward_mean=0.0, reward_bound=0.0 ,batch=41
10: loss=1.260, reward_mean=0.1, reward_bound=0.0 ,batch=49
11: loss=1.255, reward_mean=0.0, reward_bound=0.0 ,batch=53
12: loss=1.247, reward_mean=0.0, reward_bound=0.0 ,batch=57
13: loss=1.239, reward_mean=0.1, reward_bound=0.0 ,batch=62
14: loss=1.234, reward_mean=0.0, reward_bound=0.0 ,batch=64
15: loss=1.225, reward_mean=0.0, reward_bound=0.0 ,batch=68
16: loss=1.218, reward_mean=0.1, reward_bound=0.0 ,b

In [39]:
def get_action(obs, net):
    
    obs_v = torch.FloatTensor([obs])
    act_probs_v = sm(net(obs_v))
    act_probs = act_probs_v.data.numpy()[0]

    action = np.random.choice(len(act_probs), p = act_probs)
    return action

net_trained = net

In [42]:


# playing with trained agent
total_reward_a = []
total_steps = 0
# obs = env.reset()
sm = nn.Softmax(dim=1)

for i in range(200):
    total_reward = 0.0
    total_steps = 0
    obs = env.reset()
    while True:
#         env.render()
        action = get_action(obs, net_trained)
#         action = env.action_space.sample()
        next_obs, rew, is_done, _ = env.step(action)
        total_reward+=rew
        total_steps+=1
        if is_done:
            print("iter",i," Total Reward = ",total_reward," Total Steps = ", total_steps)            
            total_reward_a.append(total_reward)
            break
        obs=next_obs

print('avg rew = ',sum(total_reward_a)/BATCH_SIZE)
env.close()

iter 0  Total Reward =  1.0  Total Steps =  6
iter 1  Total Reward =  1.0  Total Steps =  6
iter 2  Total Reward =  1.0  Total Steps =  6
iter 3  Total Reward =  1.0  Total Steps =  6
iter 4  Total Reward =  1.0  Total Steps =  8
iter 5  Total Reward =  1.0  Total Steps =  6
iter 6  Total Reward =  1.0  Total Steps =  6
iter 7  Total Reward =  0.0  Total Steps =  5
iter 8  Total Reward =  1.0  Total Steps =  6
iter 9  Total Reward =  1.0  Total Steps =  6
iter 10  Total Reward =  1.0  Total Steps =  6
iter 11  Total Reward =  1.0  Total Steps =  6
iter 12  Total Reward =  1.0  Total Steps =  12
iter 13  Total Reward =  0.0  Total Steps =  2
iter 14  Total Reward =  1.0  Total Steps =  9
iter 15  Total Reward =  0.0  Total Steps =  3
iter 16  Total Reward =  1.0  Total Steps =  6
iter 17  Total Reward =  1.0  Total Steps =  8
iter 18  Total Reward =  1.0  Total Steps =  6
iter 19  Total Reward =  1.0  Total Steps =  6
iter 20  Total Reward =  1.0  Total Steps =  6
iter 21  Total Reward 

iter 182  Total Reward =  1.0  Total Steps =  6
iter 183  Total Reward =  1.0  Total Steps =  8
iter 184  Total Reward =  1.0  Total Steps =  6
iter 185  Total Reward =  1.0  Total Steps =  6
iter 186  Total Reward =  1.0  Total Steps =  8
iter 187  Total Reward =  1.0  Total Steps =  8
iter 188  Total Reward =  0.0  Total Steps =  3
iter 189  Total Reward =  0.0  Total Steps =  4
iter 190  Total Reward =  1.0  Total Steps =  6
iter 191  Total Reward =  1.0  Total Steps =  8
iter 192  Total Reward =  0.0  Total Steps =  2
iter 193  Total Reward =  1.0  Total Steps =  6
iter 194  Total Reward =  1.0  Total Steps =  6
iter 195  Total Reward =  1.0  Total Steps =  6
iter 196  Total Reward =  0.0  Total Steps =  3
iter 197  Total Reward =  1.0  Total Steps =  6
iter 198  Total Reward =  0.0  Total Steps =  5
iter 199  Total Reward =  1.0  Total Steps =  6
avg rew =  1.56
