In [1]:
import gym
import numpy as np
import torch.nn as nn

In [2]:
HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

In [3]:
class Net(nn.Module):
    """Two-layer fully connected policy network.

    Maps an observation vector of length ``obs_size`` through one hidden
    ReLU layer of width ``hidden_size`` to ``n_actions`` raw scores.
    No softmax is applied here; callers normalise the scores themselves.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        # Attribute name `net` is kept so state-dict keys stay `net.0.*`, `net.2.*`.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised action scores for the input batch ``x``."""
        scores = self.net(x)
        return scores
    

In [4]:
from collections import namedtuple

In [5]:
# One step of an episode: the observation the agent saw and the action it took.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])
# One finished episode: its accumulated reward and its list of EpisodeStep records.
Episode = namedtuple('Episode', ['reward', 'steps'])

In [6]:
import torch

In [7]:
# Small 1-D demo tensor used to try out softmax in the following cells.
x = torch.Tensor([1.0, 2.0, 3.0, 4.0])
x

tensor([1., 2., 3., 4.])

In [8]:
# Softmax module that normalises along dim 0 (the only axis of the 1-D `x`).
smm = nn.Softmax(dim=0)
# First element of `x`, read through its NumPy view.
xn = x.data.numpy()[0]

In [9]:
# Apply the dim-0 softmax to `x`; the entries of the result sum to 1.
xs = smm(x)
xs

tensor([0.0321, 0.0871, 0.2369, 0.6439])

In [10]:
# Show the softmax result as a NumPy array.
xs.data.numpy()

array([0.0320586 , 0.08714432, 0.23688284, 0.6439143 ], dtype=float32)

In [11]:
def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    This is an infinite generator: it keeps stepping ``env`` with actions
    sampled from ``softmax(net(obs))`` and yields a list every time
    ``batch_size`` episodes have finished.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, is_done, info)``.
        net: callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of raw action scores.
        batch_size: number of finished episodes per yielded batch.

    Yields:
        list of ``Episode``; each holds the episode's total reward and its
        ``EpisodeStep`` records (observation seen, action taken).
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # Stack into one ndarray first: building a tensor straight from a
        # Python list of ndarrays is slow.
        obs_v = torch.as_tensor(np.asarray([obs]), dtype=torch.float32)
        # Rollouts only sample actions, so skip autograd bookkeeping here;
        # gradients are computed later on the filtered batch.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]

        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, rew, is_done, _ = env.step(action)

        episode_reward += rew
        # Record the observation the action was chosen for, not its result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs

In [12]:
def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and flatten them into training tensors.

    Args:
        batch: sequence of episodes, each with a ``reward`` and a ``steps``
            list whose items expose ``observation`` and ``action``.
        percentile: reward percentile (0-100) used as the cut-off.

    Returns:
        Tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float tensor of observations and a long tensor of actions taken
        from the retained episodes, the reward cut-off, and the mean reward
        of the whole batch as a Python float.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    # Retain every episode whose reward is not below the cut-off.
    elite = [episode for episode in batch if not episode.reward < reward_bound]
    train_obs = [step.observation for episode in elite for step in episode.steps]
    train_act = [step.action for episode in elite for step in episode.steps]

    return (
        torch.FloatTensor(train_obs),
        torch.LongTensor(train_act),
        reward_bound,
        reward_mean,
    )

In [13]:
from tensorboardX import SummaryWriter
import torch.optim as optim

In [14]:
# main run

# Environment, plus the input/output sizes the policy network needs.
env = gym.make("CartPole-v0")
n_actions = env.action_space.n
obs_size = env.observation_space.shape[0]

# Policy network, cross-entropy loss on its raw scores, Adam optimiser,
# and a TensorBoard writer for the training curves.
net = Net(obs_size, HIDDEN_SIZE, n_actions)
optimizer = optim.Adam(net.parameters(), lr=0.01)
objective = nn.CrossEntropyLoss()
writer = SummaryWriter()

In [15]:
#loop through
# Train on the elite episodes of each batch until the batch's mean reward
# exceeds 199.
try:
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        obs_v, act_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)

        # One supervised step: push the policy towards the actions taken in
        # the retained episodes.
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, act_v)
        loss_v.backward()
        optimizer.step()

        loss = loss_v.item()
        print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (iter_no, loss, reward_m, reward_b))
        writer.add_scalar("loss", loss, iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)

        if reward_m > 199:
            print("Solved!")
            break
finally:
    # Close the writer however the loop ends (solved, error or interrupt).
    # A close() placed after `break` inside the loop would never run.
    writer.close()
        

0: loss=0.691, reward_mean=20.3, reward_bound=23.5
1: loss=0.676, reward_mean=25.3, reward_bound=33.0
2: loss=0.678, reward_mean=33.6, reward_bound=41.5
3: loss=0.666, reward_mean=32.4, reward_bound=36.0
4: loss=0.659, reward_mean=27.1, reward_bound=29.0
5: loss=0.658, reward_mean=39.9, reward_bound=38.5
6: loss=0.639, reward_mean=43.5, reward_bound=59.5
7: loss=0.645, reward_mean=49.1, reward_bound=62.0
8: loss=0.639, reward_mean=57.2, reward_bound=76.5
9: loss=0.634, reward_mean=62.5, reward_bound=64.0
10: loss=0.631, reward_mean=62.4, reward_bound=69.5
11: loss=0.611, reward_mean=57.4, reward_bound=69.5
12: loss=0.625, reward_mean=69.8, reward_bound=85.5
13: loss=0.608, reward_mean=76.4, reward_bound=91.5
14: loss=0.606, reward_mean=78.9, reward_bound=86.0
15: loss=0.608, reward_mean=103.5, reward_bound=117.0
16: loss=0.598, reward_mean=84.7, reward_bound=107.0
17: loss=0.587, reward_mean=83.2, reward_bound=94.5
18: loss=0.599, reward_mean=101.4, reward_bound=130.5
19: loss=0.584, r

In [16]:
# Alias, not a copy: `net_trained` and `net` refer to the same network object,
# so any further training of `net` is visible through `net_trained` as well.
net_trained = net

In [17]:
# Draw a random point from the environment's observation space and run it
# through the policy as a batch of one.
sample_obs = env.observation_space.sample()
sample_obs_v = torch.FloatTensor([sample_obs])
sm = nn.Softmax(dim=1)
sample_action_prob = sm(net_trained(sample_obs_v))
# NOTE(review): `sample_action_probx` is not read anywhere else in the
# visible notebook.
sample_action_probx = sm(net(sample_obs_v))
# NOTE(review): the recorded output is nan. A later cell shows sampled
# observation components around 1e37-1e38, so the network output presumably
# overflows float32 for such inputs -- confirm.
sample_action_prob

tensor([[nan, nan]], grad_fn=<SoftmaxBackward>)

In [18]:
# Module-level softmax that normalises along dim 1.
sm=nn.Softmax(dim=1)

In [19]:
# Draw another random point from the observation space and wrap it as a
# batch of one.
# NOTE(review): the recorded sample contains components near 1e37-1e38;
# presumably two of the space's bounds are effectively unbounded -- confirm
# before using such samples as test inputs.
sample_obs = env.observation_space.sample()
sample_obs_v = torch.FloatTensor([sample_obs])
sample_obs_v

tensor([[-2.9729e+00, -7.5835e+37,  2.5635e-02,  2.3618e+38]])

In [149]:
# Raw action scores for the sampled observation (no softmax applied).
net_trained(sample_obs_v)

tensor([[ 129666012503614031267066647994536296448.,
         -128649620393555838758781803338091462656.]],
       grad_fn=<ThAddmmBackward>)

In [161]:
# Same query through `net`.
# NOTE(review): the recorded outputs of `net` and `net_trained` differ, and
# the execution counts of these cells (149, 161) are out of sequence, so
# these outputs presumably come from an older kernel state -- re-run from a
# fresh kernel to confirm.
net(sample_obs_v)

tensor([[-293982736963021820106449867453117759488.,
          293938845828639517891652445601334820864.]],
       grad_fn=<ThAddmmBackward>)

In [162]:
# Action probabilities from `net_trained` for the sampled observation.
sm(net_trained(sample_obs_v))

tensor([[1., 0.]], grad_fn=<SoftmaxBackward>)

In [163]:
# Action probabilities from `net` for the sampled observation.
a = sm(net(sample_obs_v))

In [165]:
# Display the probability tensor `a`.
a

tensor([[0., 1.]], grad_fn=<SoftmaxBackward>)

In [164]:
# First row of `a` as a NumPy array -- the form passed to np.random.choice.
a.data.numpy()[0]

array([0., 1.], dtype=float32)

In [20]:
def get_action(obs, net):
    """Sample an action for ``obs`` from the policy defined by ``net``.

    Args:
        obs: a single observation (sequence or array of numbers).
        net: callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of raw action scores.

    Returns:
        Index of the action drawn by ``np.random.choice`` with the softmax
        of the scores as probabilities.
    """
    # Build the softmax locally so the function does not depend on a
    # module-level `sm` having been defined by some other cell.
    softmax = nn.Softmax(dim=1)
    obs_v = torch.as_tensor(np.asarray([obs]), dtype=torch.float32)
    # Picking an action needs no gradients.
    with torch.no_grad():
        act_probs = softmax(net(obs_v)).numpy()[0]

    return np.random.choice(len(act_probs), p=act_probs)

In [21]:
# Dump the shape and the values of every parameter tensor of the network.
for param in net.parameters():
    values = param.data
    print(values.shape)
    print(values)


torch.Size([128, 4])
tensor([[ 0.4520, -0.1001, -0.2828,  0.4591],
        [-0.1144, -0.2206, -0.0995,  0.4077],
        [ 0.0682, -0.2545,  0.0181, -0.2152],
        [ 0.2548, -0.0763, -0.0492, -0.6855],
        [-0.1500,  0.3953, -0.2284,  0.5169],
        [-0.1900,  0.1336,  0.1711,  0.5318],
        [ 0.2807,  0.3126, -0.1798, -0.6473],
        [-0.3601, -0.5242, -0.1519, -0.1650],
        [-0.0447, -0.2645,  0.1157, -0.1465],
        [ 0.1283, -0.0916,  0.0520,  0.2495],
        [ 0.0947,  0.0769, -0.5461,  0.4953],
        [ 0.0835, -0.4172, -0.4592, -0.3743],
        [ 0.0475,  0.1448, -0.2128,  0.5441],
        [-0.4228, -0.1739,  0.5728,  0.6046],
        [-0.4087,  0.3578,  0.1232,  0.5382],
        [-0.4521,  0.3350,  0.2618,  0.4170],
        [-0.1213, -0.3892,  0.0413,  0.6677],
        [-0.1138,  0.0048,  0.2613, -0.1409],
        [-0.1712, -0.2080,  0.0489,  0.5267],
        [-0.3367, -0.1048,  0.3936,  0.5652],
        [ 0.4768,  0.2750,  0.2368,  0.5588],
        [ 0.2

In [22]:
# playing with trained agent
# Run BATCH_SIZE rendered episodes with actions sampled from the trained
# policy and report the reward of each episode plus the average.
total_reward_a = []
total_steps = 0
sm = nn.Softmax(dim=1)

try:
    for i in range(BATCH_SIZE):
        total_reward = 0.0
        total_steps = 0
        obs = env.reset()
        is_done = False
        while not is_done:
            env.render()
            action = get_action(obs, net_trained)
            next_obs, rew, is_done, _ = env.step(action)
            total_reward += rew
            total_steps += 1
            obs = next_obs
        print("iter",i," Total Reward = ",total_reward," Total Steps = ", total_steps)
        total_reward_a.append(total_reward)
finally:
    # Release the environment (and its render window) even when an episode
    # raises or the cell is interrupted.
    env.close()

print('avg rew = ',sum(total_reward_a)/len(total_reward_a))

iter 0  Total Reward =  200.0  Total Steps =  200
iter 1  Total Reward =  177.0  Total Steps =  177
iter 2  Total Reward =  190.0  Total Steps =  190
iter 3  Total Reward =  200.0  Total Steps =  200
iter 4  Total Reward =  200.0  Total Steps =  200
iter 5  Total Reward =  118.0  Total Steps =  118
iter 6  Total Reward =  200.0  Total Steps =  200
iter 7  Total Reward =  200.0  Total Steps =  200
iter 8  Total Reward =  200.0  Total Steps =  200
iter 9  Total Reward =  200.0  Total Steps =  200
iter 10  Total Reward =  200.0  Total Steps =  200
iter 11  Total Reward =  200.0  Total Steps =  200
iter 12  Total Reward =  200.0  Total Steps =  200
iter 13  Total Reward =  200.0  Total Steps =  200
iter 14  Total Reward =  200.0  Total Steps =  200
iter 15  Total Reward =  200.0  Total Steps =  200
avg rew =  192.8125
