In [2]:
import gym
import numpy as np
import torch.nn as nn

In [3]:
HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

In [4]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net,self).__init__()
        self.net = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, n_actions)
        )
        
    def forward(self,x):
        return self.net(x)
    

In [5]:
from collections import namedtuple

In [6]:
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation','action'])

In [7]:
import torch

In [8]:
x = torch.Tensor([1.0,2.0,3.0,4])
x

tensor([1., 2., 3., 4.])

In [9]:
xn = x.data.numpy()[0]
smm = nn.Softmax(dim=0)

In [10]:
xs = smm(x)
xs

tensor([0.0321, 0.0871, 0.2369, 0.6439])

In [11]:
xs.data.numpy()

array([0.0320586 , 0.08714432, 0.23688284, 0.6439143 ], dtype=float32)

In [12]:
def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs=env.reset()
    sm = nn.Softmax(dim=1)
    
    while True:
        
#         env.render()
        
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        
        action = np.random.choice(len(act_probs), p = act_probs)
        next_obs, rew, is_done, _ = env.step(action)
        
        episode_reward+=rew
        episode_steps.append(EpisodeStep(observation = obs, action = action))
        
        if is_done:
            
#             print(episode_reward)
            
            batch.append(Episode(reward = episode_reward, steps = episode_steps))
            episode_reward = 0.0
            episode_steps=[]
            next_obs = env.reset()
            if len(batch)==batch_size:
                yield batch
                batch = []
             
        obs = next_obs

In [13]:
def filter_batch(batch, percentile):
    rewards = list(map(lambda s:s.reward, batch))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    train_obs = []
    train_act = []
    for example in batch:
        if example.reward<reward_bound:
            continue
        train_obs.extend(map(lambda step: step.observation, example.steps))
        train_act.extend(map(lambda step: step.action, example.steps))
    
    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [14]:
from tensorboardX import SummaryWriter
import torch.optim as optim

In [15]:
# main run

env = gym.make("CartPole-v0")
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params = net.parameters(), lr=0.01)
writer = SummaryWriter

In [16]:
#loop through
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    obs_v, act_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, act_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (iter_no, loss_v.item(), reward_m, reward_b))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)
    
    if reward_m > 199:
        print("Solved!")
        break
        writer.close()
        

0: loss=0.685, reward_mean=21.6, reward_bound=25.0


AttributeError: 'str' object has no attribute '_check_caffe2_blob'

In [170]:
net_trained = net

In [68]:
sample_obs = env.observation_space.sample()
sample_obs_v = torch.FloatTensor([sample_obs])
sm = nn.Softmax(dim=1)
sample_action_prob = sm(net_trained(sample_obs_v))
sample_action_probx = sm(net(sample_obs_v))
sample_action_prob

tensor([[0., 1.]], grad_fn=<SoftmaxBackward>)

In [124]:
sm=nn.Softmax(dim=1)

In [148]:
sample_obs = env.observation_space.sample()
sample_obs_v = torch.FloatTensor([sample_obs])
sample_obs_v

tensor([[-1.7086e+00, -2.5159e+38, -1.3974e-01,  1.7519e+38]])

In [149]:
net_trained(sample_obs_v)

tensor([[ 129666012503614031267066647994536296448.,
         -128649620393555838758781803338091462656.]],
       grad_fn=<ThAddmmBackward>)

In [161]:
net(sample_obs_v)

tensor([[-293982736963021820106449867453117759488.,
          293938845828639517891652445601334820864.]],
       grad_fn=<ThAddmmBackward>)

In [162]:
sm(net_trained(sample_obs_v))

tensor([[1., 0.]], grad_fn=<SoftmaxBackward>)

In [163]:
a = sm(net(sample_obs_v))

In [165]:
a

tensor([[0., 1.]], grad_fn=<SoftmaxBackward>)

In [164]:
a.data.numpy()[0]

array([0., 1.], dtype=float32)

In [172]:
def get_action(obs, net):
    
    obs_v = torch.FloatTensor([obs])
    act_probs_v = sm(net(obs_v))
    act_probs = act_probs_v.data.numpy()[0]

    action = np.random.choice(len(act_probs), p = act_probs)
    return action

In [34]:
for param in net.parameters():
    print(param.data.shape)
    print(param.data)


torch.Size([128, 4])
tensor([[-0.4165, -0.3100, -0.3523,  0.2744],
        [ 0.1138,  0.4158, -0.1633,  0.0167],
        [ 0.3345,  0.1123, -0.0536, -0.3350],
        [ 0.3959, -0.0379, -0.0983, -0.5723],
        [ 0.2356, -0.2427,  0.7050,  0.5805],
        [ 0.3378,  0.5010,  0.2678,  0.3262],
        [-0.1047, -0.1973,  0.0860,  0.6086],
        [ 0.1179,  0.4989,  0.0466, -0.1225],
        [-0.3666,  0.5163, -0.1876,  0.2111],
        [ 0.3868,  0.1244, -0.2217, -0.4546],
        [-0.0985, -0.2412,  0.6035,  0.5090],
        [ 0.3549,  0.4618, -0.6207, -0.2671],
        [ 0.1531, -0.0989, -0.5064,  0.3381],
        [ 0.3141,  0.1692,  0.0310, -0.3657],
        [-0.0933,  0.4379, -0.1776, -0.3629],
        [-0.4082,  0.0807, -0.1323, -0.3077],
        [ 0.0256, -0.5769, -0.5382, -0.2351],
        [-0.4523, -0.2651,  0.2338,  0.4431],
        [ 0.0078, -0.3460, -0.4165, -0.5049],
        [ 0.2957, -0.3941, -0.5956, -0.5030],
        [-0.2394, -0.1065, -0.7125, -0.5746],
        [ 0.2

In [173]:
# playing with trained agent
total_reward_a = []
total_steps = 0
# obs = env.reset()
sm = nn.Softmax(dim=1)

for i in range(BATCH_SIZE):
    total_reward = 0.0
    total_steps = 0
    obs = env.reset()
    while True:
        env.render()
        action = get_action(obs, net_trained)
#         action = env.action_space.sample()
        next_obs, rew, is_done, _ = env.step(action)
        total_reward+=rew
        total_steps+=1
        if is_done:
            print("iter",i," Total Reward = ",total_reward," Total Steps = ", total_steps)            
            total_reward_a.append(total_reward)
            break
        obs=next_obs

print('avg rew = ',sum(total_reward_a)/BATCH_SIZE)
env.close()

iter 0  Total Reward =  200.0  Total Steps =  200
iter 1  Total Reward =  200.0  Total Steps =  200
iter 2  Total Reward =  200.0  Total Steps =  200
iter 3  Total Reward =  198.0  Total Steps =  198
iter 4  Total Reward =  200.0  Total Steps =  200
iter 5  Total Reward =  200.0  Total Steps =  200
iter 6  Total Reward =  200.0  Total Steps =  200
iter 7  Total Reward =  200.0  Total Steps =  200
iter 8  Total Reward =  200.0  Total Steps =  200
iter 9  Total Reward =  200.0  Total Steps =  200
iter 10  Total Reward =  200.0  Total Steps =  200
iter 11  Total Reward =  200.0  Total Steps =  200
iter 12  Total Reward =  200.0  Total Steps =  200
iter 13  Total Reward =  200.0  Total Steps =  200
iter 14  Total Reward =  200.0  Total Steps =  200
iter 15  Total Reward =  200.0  Total Steps =  200
avg rew =  199.875
