In [1]:
import gym
import numpy as np
import torch.nn as nn

In [2]:
import torch
print(torch.__version__)

0.4.1


In [40]:
class Net(nn.Module):
    def __init__(self,obs_size, hidden_size,n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(nn.Linear(obs_size,hidden_size),
                                nn.Tanh(),
                                nn.Linear(hidden_size,n_actions),
                                nn.Tanh())
    
    def forward(self,x):
        return self.net(x)*2

In [4]:
from torch.distributions.normal import Normal
import torch

In [38]:
def get_policy(net, obs):
    mu = net(obs)
    return Normal(loc=mu, scale = STD)

def get_action(net, obs):
    policy = get_policy(net, obs)
    act = policy.sample().item()
#     if act<0:
#         print ("yes negative")
    return np.array([act])

def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs

def reward_to_go_avg(rews, avg):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) - avg/n
    return rtgs

In [12]:
def compute_loss(obs, acts, wts, net):
#     obs_v = torch.FloatTensor(obs)
    policy = get_policy(net, obs)
    log_p = policy.log_prob(acts)
    return -(log_p*wts).mean()

In [57]:
def train_one_epoch(env, net, lr=1e-2, batch_size=5000, render=False):
    batch_obs = []
    batch_wts = []
    batch_acts = []
    batch_rets = []
    batch_len = []
    eps_rew = []
    obs = env.reset()
    done=False
    epoch_finished_rendering = False
    
    while True:
        if not epoch_finished_rendering and render:
            env.render()
        
        act = get_action(net = net, obs=torch.as_tensor(obs, dtype=torch.float32))
        batch_obs.append(obs.copy())
        batch_acts.append(act)
        
        obs,rew,done,_ = env.step(act)
        
        eps_rew.append(rew)
        
#         obs= next_obs
        
        if done:
            eps_ret = sum(eps_rew)
            eps_len = len(eps_rew)
            batch_rets.append(eps_ret)
            batch_len.append(eps_len)
            
#             batch_wts = batch_wts + [eps_ret-avg_rew]*eps_len
            
            #rtg
            
            batch_wts = batch_wts + list(reward_to_go(eps_rew))
    
            eps_rew = []
            done = False
            
            obs = env.reset()
            epoch_finished_rendering = True
            
            if len(batch_obs)>batch_size:
                break
    
    optimizer.zero_grad()
    batch_loss = compute_loss(obs = torch.as_tensor(batch_obs, dtype=torch.float32),
                              acts = torch.as_tensor(batch_acts, dtype = torch.float32),
                              wts = torch.as_tensor(batch_wts, dtype = torch.float32),
                             net = net)
    batch_loss.backward()
    optimizer.step()
    return batch_loss,batch_rets, batch_len, batch_obs, batch_acts

In [58]:
env = gym.make('Pendulum-v0')
env.reset()
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

obs_size, n_actions



(3, 1)

In [59]:
env.observation_space

Box(3,)

In [60]:
env.action_space

Box(1,)

In [61]:
HIDDEN_SIZE = 32
BATCH_SIZE = 500
STD = 0.1

In [62]:
net = Net(obs_size = obs_size, hidden_size = HIDDEN_SIZE, n_actions= n_actions)

In [63]:
obs = env.observation_space.sample()
obs

array([ 0.8791483 , -0.52392477, -0.528469  ], dtype=float32)

In [64]:
act = get_action(net, obs=torch.as_tensor(obs, dtype=torch.float32))
act

array([0.66151482])

In [65]:
# testing net for giving -ve actions
count = 0
acts = []
for i in range(0,10000):
    obs = env.observation_space.sample()
    act = get_action(net, obs=torch.as_tensor(obs, dtype=torch.float32))
    acts.append(act[0])
    if act[0]<0:
        count+=1
count, max(acts),min(acts), np.mean(acts)

(4070, 1.154033899307251, -0.9292287230491638, 0.21791781996460632)

In [25]:
a = env.action_space.sample()
a

array([-1.8395157], dtype=float32)

In [79]:
env.step(np.array([act]))

(array([[-0.97085151],
        [ 0.23968176],
        [ 0.3240764 ]]),
 array([-8.31567152]),
 False,
 {})

In [66]:
from torch.optim import Adam
lr = 1e-2
optimizer = Adam(net.parameters(), lr=lr)

In [67]:
from tensorboardX import SummaryWriter
writer = SummaryWriter(comment="-vanilla_policy_grad_pendulum")

In [68]:
# train

rew_req = 200
i=0
mean_rew = 0
mean_rew_sum = 0
avg_rew=0

while i<500:
    i+=1
    render = True if i%50==0 else False
#     render = False
    batch_loss,batch_ret, batch_len, obs, acts = train_one_epoch(env, net,batch_size=BATCH_SIZE, render=render)
    mean_rew = np.mean(batch_ret)
    mean_rew_sum += mean_rew
    avg_rew = mean_rew_sum/i
    
    if render:
        env.close()
    print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
                (i, batch_loss, np.mean(batch_ret), np.mean(batch_len)))
    writer.add_scalar("loss", batch_loss, i)
    writer.add_scalar("reward_mean", mean_rew, i)

epoch:   1 	 loss: 794.376 	 return: -1591.808 	 ep_len: 200.000
epoch:   2 	 loss: 791.970 	 return: -1815.895 	 ep_len: 200.000
epoch:   3 	 loss: 822.486 	 return: -1596.419 	 ep_len: 200.000
epoch:   4 	 loss: 830.146 	 return: -1775.162 	 ep_len: 200.000
epoch:   5 	 loss: 772.236 	 return: -1688.951 	 ep_len: 200.000
epoch:   6 	 loss: 781.073 	 return: -1700.107 	 ep_len: 200.000
epoch:   7 	 loss: 809.624 	 return: -1730.106 	 ep_len: 200.000
epoch:   8 	 loss: 807.312 	 return: -1703.212 	 ep_len: 200.000
epoch:   9 	 loss: 773.556 	 return: -1647.466 	 ep_len: 200.000
epoch:  10 	 loss: 755.512 	 return: -1581.642 	 ep_len: 200.000
epoch:  11 	 loss: 736.659 	 return: -1596.090 	 ep_len: 200.000
epoch:  12 	 loss: 747.811 	 return: -1554.769 	 ep_len: 200.000
epoch:  13 	 loss: 741.158 	 return: -1622.474 	 ep_len: 200.000
epoch:  14 	 loss: 722.535 	 return: -1574.599 	 ep_len: 200.000
epoch:  15 	 loss: 743.212 	 return: -1513.065 	 ep_len: 200.000
epoch:  16 	 loss: 755.68

epoch: 128 	 loss: 775.789 	 return: -1622.908 	 ep_len: 200.000
epoch: 129 	 loss: 715.902 	 return: -1596.653 	 ep_len: 200.000
epoch: 130 	 loss: 751.007 	 return: -1647.389 	 ep_len: 200.000
epoch: 131 	 loss: 750.694 	 return: -1621.438 	 ep_len: 200.000
epoch: 132 	 loss: 780.184 	 return: -1585.266 	 ep_len: 200.000
epoch: 133 	 loss: 786.993 	 return: -1614.500 	 ep_len: 200.000
epoch: 134 	 loss: 722.932 	 return: -1556.709 	 ep_len: 200.000
epoch: 135 	 loss: 724.506 	 return: -1613.115 	 ep_len: 200.000
epoch: 136 	 loss: 707.734 	 return: -1600.408 	 ep_len: 200.000
epoch: 137 	 loss: 735.292 	 return: -1581.160 	 ep_len: 200.000
epoch: 138 	 loss: 739.357 	 return: -1585.667 	 ep_len: 200.000
epoch: 139 	 loss: 757.552 	 return: -1607.242 	 ep_len: 200.000
epoch: 140 	 loss: 735.749 	 return: -1633.336 	 ep_len: 200.000
epoch: 141 	 loss: 776.574 	 return: -1625.076 	 ep_len: 200.000
epoch: 142 	 loss: 745.590 	 return: -1630.139 	 ep_len: 200.000
epoch: 143 	 loss: 756.16

epoch: 255 	 loss: 715.229 	 return: -1569.614 	 ep_len: 200.000
epoch: 256 	 loss: 749.753 	 return: -1631.697 	 ep_len: 200.000
epoch: 257 	 loss: 732.895 	 return: -1585.496 	 ep_len: 200.000
epoch: 258 	 loss: 717.158 	 return: -1550.646 	 ep_len: 200.000
epoch: 259 	 loss: 735.212 	 return: -1643.889 	 ep_len: 200.000
epoch: 260 	 loss: 790.661 	 return: -1643.904 	 ep_len: 200.000
epoch: 261 	 loss: 756.890 	 return: -1589.211 	 ep_len: 200.000
epoch: 262 	 loss: 753.881 	 return: -1620.524 	 ep_len: 200.000
epoch: 263 	 loss: 753.553 	 return: -1614.737 	 ep_len: 200.000
epoch: 264 	 loss: 750.758 	 return: -1648.153 	 ep_len: 200.000
epoch: 265 	 loss: 682.165 	 return: -1586.430 	 ep_len: 200.000
epoch: 266 	 loss: 704.101 	 return: -1585.534 	 ep_len: 200.000
epoch: 267 	 loss: 782.972 	 return: -1606.782 	 ep_len: 200.000
epoch: 268 	 loss: 783.081 	 return: -1604.844 	 ep_len: 200.000
epoch: 269 	 loss: 709.406 	 return: -1601.346 	 ep_len: 200.000
epoch: 270 	 loss: 751.87

KeyboardInterrupt: 

In [None]:
env.close()

# ScratchPad

In [5]:
c = Normal(loc = torch.FloatTensor([1.0]), scale = torch.Tensor([1.0]))
c

Normal()

In [6]:
c.sample()

tensor([1.7945])

In [7]:
count = [0,0,0]
for i in range(0,100):
    x = c.sample().item()
    count[x]+=1
count

TypeError: list indices must be integers or slices, not float

In [8]:
c.log_prob(torch.Tensor(np.array([0.58])))

tensor([-1.0071])

In [105]:
x =([2]*8)
x

[2, 2, 2, 2, 2, 2, 2, 2]

In [106]:
y = ([1]*3 + [2]*5)
y

[1, 1, 1, 2, 2, 2, 2, 2]

In [9]:
(torch.FloatTensor(x)*torch.FloatTensor(y)).mean()

TypeError: new(): data must be a sequence (got float)

In [112]:
c=0
for x in acts:
    if x<0:
        c+=1
c

0

In [254]:
wts

[13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 13.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 23.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 15.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 21.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 18.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 25.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,
 20.0,

In [202]:
x = [1,2,3]
y = []
y.append(x)
y

[[1, 2, 3]]

In [203]:
x=[4,2,3]
y

[[1, 2, 3]]

In [204]:
y.append(x.copy())

In [205]:
y

[[1, 2, 3], [4, 2, 3]]

In [206]:
y.append(x)

In [207]:
y

[[1, 2, 3], [4, 2, 3], [4, 2, 3]]

In [237]:
obs = env.observation_space.sample()
obs

array([-5.3292841e-01,  1.3936392e+38,  2.2844117e-02,  8.5078421e+37],
      dtype=float32)

In [148]:
net(torch.FloatTensor(obs))

tensor([79045296309277814133001641918405279744.,
         8074543887068950992878338433657864192.], grad_fn=<ThAddBackward>)

In [169]:
c2 = Categorical(net(torch.FloatTensor(obs)))
c2.sample()

tensor(0)

In [173]:
c2.log_prob(torch.as_tensor(1, dtype = torch.int32))

tensor(-2.3786, grad_fn=<SqueezeBackward1>)