In [125]:
import math
import sys
print(sys.executable)

from collections import namedtuple
import numpy as np

import gym

from torch import FloatTensor, LongTensor
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

HIDDEN_SIZE = 128
BATCH_SIZE = 100
PERCENTILE = 30 # 70 for catpole
GAMMA = 0.95


C:\ProgramData\Anaconda3\envs\py36\python.exe


In [126]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    
    while True:
        #print(obs)
        # get current obervation
        obs_v = FloatTensor([obs])
        ## feed forward
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        #print(act_probs)
        action = np.random.choice(len(act_probs), p=act_probs)
        #print("action:",action)
        next_obs, reward, is_done, _ = env.step(action)
        #discounting
        episode_reward += (reward * (GAMMA ** len(episode_steps)))
        # accumulate episide steps till done
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            # save episode
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            # yield batches of requested size
            if len(batch) == batch_size:
                yield batch
                batch = []
        
        # update to next obs
        obs = next_obs

In [None]:
elite_batch=[]
"""
batch - new episodes
elite_batch - rare good episodes
"""
def filter_batch(batch, percentile, elite_batch):
    batch = elite_batch + batch
    rewards = list(map(lambda s: s.reward, batch))
    #print(rewards)
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    train_obs = []
    train_act = []
    for example in batch:
        if example.reward < reward_bound:
            continue
        #print("example.reward", example.reward)
        # accumulate observations    
        train_obs.extend(map(lambda step: step.observation, example.steps))
        # accumulate actiona
        train_act.extend(map(lambda step: step.action, example.steps))
        if example.reward > 0.01:
            elite_batch.append(example)
        
    #make tenors
    train_obs_v = FloatTensor(train_obs)
    train_act_v = LongTensor(train_act)
    # keep 500 items
    del(elite_batch[0:-500])
    print(len(elite_batch))
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [132]:
def train(env, target_reward, lr = 0.05):
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=lr)
    writer = SummaryWriter(env.name if "name" in dir(env) else "")

    elite_batch=[]
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        if iter_no > 2000:
            print("No convergence!")
            break
        
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE, elite_batch)
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()
        print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f" % (
            iter_no, loss_v.item(), reward_m, reward_b))
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
    
        if reward_m > target_reward:
            print("Solved!")
            break
    writer.close()

In [57]:
env = gym.make("CartPole-v0",199)
train(env)

[63.0, 10.0, 19.0, 20.0, 9.0, 20.0, 17.0, 19.0, 23.0, 15.0, 17.0, 20.0, 18.0, 17.0, 16.0, 30.0]
0: loss=0.687, reward_mean=20.8, reward_bound=20.0
[14.0, 13.0, 10.0, 11.0, 12.0, 11.0, 13.0, 10.0, 11.0, 12.0, 10.0, 10.0, 11.0, 11.0, 11.0, 12.0]
1: loss=0.554, reward_mean=11.4, reward_bound=12.0
[19.0, 29.0, 22.0, 13.0, 23.0, 29.0, 30.0, 20.0, 24.0, 31.0, 17.0, 40.0, 16.0, 14.0, 21.0, 16.0]
2: loss=0.610, reward_mean=22.8, reward_bound=26.5
[42.0, 35.0, 84.0, 54.0, 54.0, 47.0, 64.0, 72.0, 29.0, 55.0, 43.0, 36.0, 38.0, 32.0, 91.0, 38.0]
3: loss=0.539, reward_mean=50.9, reward_bound=54.5
[38.0, 49.0, 37.0, 33.0, 34.0, 44.0, 35.0, 44.0, 41.0, 30.0, 79.0, 35.0, 51.0, 44.0, 105.0, 26.0]
4: loss=0.527, reward_mean=45.3, reward_bound=44.0
[28.0, 27.0, 53.0, 49.0, 33.0, 35.0, 30.0, 42.0, 34.0, 35.0, 43.0, 35.0, 46.0, 37.0, 61.0, 31.0]
5: loss=0.448, reward_mean=38.7, reward_bound=42.5
[38.0, 33.0, 25.0, 32.0, 36.0, 38.0, 59.0, 35.0, 35.0, 48.0, 50.0, 37.0, 41.0, 42.0, 49.0, 109.0]
6: loss=0.390,

[155.0, 83.0, 51.0, 99.0, 71.0, 120.0, 100.0, 60.0, 67.0, 56.0, 59.0, 64.0, 55.0, 64.0, 71.0, 57.0]
55: loss=0.078, reward_mean=77.0, reward_bound=77.0
[47.0, 56.0, 103.0, 52.0, 63.0, 55.0, 46.0, 40.0, 73.0, 43.0, 79.0, 108.0, 62.0, 83.0, 56.0, 44.0]
56: loss=0.053, reward_mean=63.1, reward_bound=68.0
[95.0, 94.0, 91.0, 92.0, 46.0, 67.0, 74.0, 68.0, 69.0, 59.0, 70.0, 48.0, 95.0, 68.0, 42.0, 106.0]
57: loss=0.065, reward_mean=74.0, reward_bound=91.5
[46.0, 48.0, 54.0, 98.0, 50.0, 44.0, 177.0, 45.0, 48.0, 47.0, 68.0, 55.0, 44.0, 44.0, 104.0, 59.0]
58: loss=0.088, reward_mean=64.4, reward_bound=57.0
[47.0, 62.0, 76.0, 54.0, 78.0, 67.0, 61.0, 62.0, 49.0, 48.0, 48.0, 75.0, 86.0, 52.0, 57.0, 81.0]
59: loss=0.090, reward_mean=62.7, reward_bound=71.0
[132.0, 61.0, 55.0, 69.0, 78.0, 106.0, 70.0, 59.0, 58.0, 56.0, 85.0, 86.0, 91.0, 50.0, 74.0, 58.0]
60: loss=0.055, reward_mean=74.2, reward_bound=81.5
[74.0, 46.0, 82.0, 85.0, 50.0, 48.0, 57.0, 62.0, 112.0, 49.0, 82.0, 53.0, 113.0, 71.0, 42.0, 66.

[78.0, 65.0, 100.0, 69.0, 79.0, 131.0, 55.0, 61.0, 73.0, 83.0, 63.0, 145.0, 69.0, 63.0, 55.0, 54.0]
108: loss=0.065, reward_mean=77.7, reward_bound=78.5
[72.0, 90.0, 200.0, 53.0, 193.0, 79.0, 55.0, 53.0, 55.0, 200.0, 66.0, 145.0, 95.0, 79.0, 85.0, 53.0]
109: loss=0.092, reward_mean=98.3, reward_bound=92.5
[84.0, 136.0, 52.0, 73.0, 86.0, 81.0, 66.0, 78.0, 71.0, 186.0, 81.0, 76.0, 76.0, 83.0, 59.0, 61.0]
110: loss=0.072, reward_mean=84.3, reward_bound=82.0
[200.0, 59.0, 62.0, 200.0, 72.0, 161.0, 63.0, 80.0, 61.0, 70.0, 60.0, 70.0, 181.0, 70.0, 89.0, 123.0]
111: loss=0.092, reward_mean=101.3, reward_bound=106.0
[200.0, 68.0, 200.0, 200.0, 77.0, 74.0, 64.0, 71.0, 200.0, 64.0, 73.0, 162.0, 74.0, 98.0, 200.0, 105.0]
112: loss=0.156, reward_mean=120.6, reward_bound=181.0
[78.0, 97.0, 80.0, 93.0, 104.0, 90.0, 84.0, 87.0, 68.0, 81.0, 200.0, 80.0, 65.0, 83.0, 174.0, 96.0]
113: loss=0.094, reward_mean=97.5, reward_bound=94.5
[108.0, 67.0, 63.0, 61.0, 58.0, 62.0, 63.0, 77.0, 75.0, 77.0, 200.0, 74.

# Naive FrozenLake

In [112]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res
    


In [113]:
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
train(env,0.5)

500
0: loss=1.344, reward_mean=0.5, reward_bound=0.6
500
1: loss=1.039, reward_mean=0.5, reward_bound=0.6
500
2: loss=0.846, reward_mean=0.5, reward_bound=0.6
500
3: loss=0.625, reward_mean=0.5, reward_bound=0.6
500
4: loss=0.491, reward_mean=0.5, reward_bound=0.6
500
5: loss=0.443, reward_mean=0.5, reward_bound=0.6
500
6: loss=0.419, reward_mean=0.5, reward_bound=0.6
500
7: loss=0.403, reward_mean=0.5, reward_bound=0.6
500
8: loss=0.404, reward_mean=0.5, reward_bound=0.6
500
9: loss=0.400, reward_mean=0.5, reward_bound=0.6
500
10: loss=0.397, reward_mean=0.5, reward_bound=0.6
500
11: loss=0.400, reward_mean=0.5, reward_bound=0.6
500
12: loss=0.401, reward_mean=0.5, reward_bound=0.6
500
13: loss=0.393, reward_mean=0.5, reward_bound=0.6
500
14: loss=0.388, reward_mean=0.5, reward_bound=0.6
500
15: loss=0.379, reward_mean=0.5, reward_bound=0.6
500
16: loss=0.379, reward_mean=0.5, reward_bound=0.6
500
17: loss=0.378, reward_mean=0.5, reward_bound=0.6
500
18: loss=0.379, reward_mean=0.5, r

KeyboardInterrupt: 

# Non slippary FrozenLake

In [135]:
env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(is_slippery=False)
env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
env = DiscreteOneHotWrapper(env)

train(env, 0.79, 0.001)

100
0: loss=1.382, reward_mean=0.000, reward_bound=0.000
300
1: loss=1.382, reward_mean=0.000, reward_bound=0.000
500
2: loss=1.382, reward_mean=0.001, reward_bound=0.000
500
3: loss=1.382, reward_mean=0.004, reward_bound=0.000
500
4: loss=1.382, reward_mean=0.005, reward_bound=0.000
500
5: loss=1.383, reward_mean=0.007, reward_bound=0.000
500
6: loss=1.384, reward_mean=0.008, reward_bound=0.000
500
7: loss=1.384, reward_mean=0.010, reward_bound=0.000
500
8: loss=1.382, reward_mean=0.011, reward_bound=0.000
500
9: loss=1.382, reward_mean=0.010, reward_bound=0.000
500
10: loss=1.382, reward_mean=0.008, reward_bound=0.000
500
11: loss=1.382, reward_mean=0.007, reward_bound=0.000
500
12: loss=1.380, reward_mean=0.005, reward_bound=0.000
500
13: loss=1.380, reward_mean=0.005, reward_bound=0.000
500
14: loss=1.380, reward_mean=0.006, reward_bound=0.000
500
15: loss=1.380, reward_mean=0.005, reward_bound=0.000
500
16: loss=1.380, reward_mean=0.007, reward_bound=0.000
500
17: loss=1.379, rewa

500
141: loss=1.350, reward_mean=0.003, reward_bound=0.000
500
142: loss=1.352, reward_mean=0.003, reward_bound=0.000
500
143: loss=1.351, reward_mean=0.002, reward_bound=0.000
500
144: loss=1.351, reward_mean=0.003, reward_bound=0.000
500
145: loss=1.350, reward_mean=0.002, reward_bound=0.000
500
146: loss=1.349, reward_mean=0.001, reward_bound=0.000
500
147: loss=1.349, reward_mean=0.002, reward_bound=0.000
500
148: loss=1.350, reward_mean=0.002, reward_bound=0.000
500
149: loss=1.347, reward_mean=0.002, reward_bound=0.000
500
150: loss=1.347, reward_mean=0.004, reward_bound=0.000
500
151: loss=1.345, reward_mean=0.004, reward_bound=0.000
500
152: loss=1.344, reward_mean=0.003, reward_bound=0.000
500
153: loss=1.343, reward_mean=0.004, reward_bound=0.000
500
154: loss=1.339, reward_mean=0.004, reward_bound=0.000
500
155: loss=1.340, reward_mean=0.004, reward_bound=0.000
500
156: loss=1.338, reward_mean=0.003, reward_bound=0.000
500
157: loss=1.338, reward_mean=0.003, reward_bound=0.0

500
280: loss=1.290, reward_mean=0.003, reward_bound=0.000
500
281: loss=1.284, reward_mean=0.002, reward_bound=0.000
500
282: loss=1.280, reward_mean=0.001, reward_bound=0.000
500
283: loss=1.281, reward_mean=0.001, reward_bound=0.000
500
284: loss=1.278, reward_mean=0.000, reward_bound=0.000
500
285: loss=1.277, reward_mean=0.001, reward_bound=0.000
500
286: loss=1.279, reward_mean=0.001, reward_bound=0.000
500
287: loss=1.278, reward_mean=0.001, reward_bound=0.000
500
288: loss=1.275, reward_mean=0.001, reward_bound=0.000
500
289: loss=1.277, reward_mean=0.001, reward_bound=0.000
500
290: loss=1.274, reward_mean=0.001, reward_bound=0.000
500
291: loss=1.274, reward_mean=0.000, reward_bound=0.000
500
292: loss=1.268, reward_mean=0.000, reward_bound=0.000
500
293: loss=1.270, reward_mean=0.000, reward_bound=0.000
500
294: loss=1.270, reward_mean=0.000, reward_bound=0.000
500
295: loss=1.266, reward_mean=0.000, reward_bound=0.000
500
296: loss=1.268, reward_mean=0.000, reward_bound=0.0

KeyboardInterrupt: 