In [1]:
import torch
import gym
from gym import envs
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.autograd import Variable
from torch.distributions import Categorical
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Length of the flattened observation vector fed to the network.
# NOTE(review): presumably 210 * 160 * 3 (a flattened RGB frame) - confirm
# against `observation.shape`.
lenobs = 100800
class ActorCritic(nn.Module):
    """Two-headed network: a policy head (actor) and a state-value head (critic).

    Args:
        obs_size: number of features in one flattened observation.
            Defaults to the module-level ``lenobs``.
        n_actions: number of discrete actions the policy head scores.
            Defaults to 2.

    All normalisations and the log-softmax operate on the last dimension, so
    the network accepts either a single 1-D observation of shape
    ``(obs_size,)`` or a batch of shape ``(batch, obs_size)``.
    """
    def __init__(self, obs_size=lenobs, n_actions=2):
        super(ActorCritic, self).__init__()
        # Shared trunk.
        self.l1 = nn.Linear(obs_size,25)
        self.l2 = nn.Linear(25,50)
        # Actor head: one score per action.
        self.actor_lin1 = nn.Linear(50,n_actions)
        # Critic head: scalar state value.
        self.l3 = nn.Linear(50,25)
        self.critic_lin1 = nn.Linear(25,1)

    def forward(self,x):
        """Return ``(actor, critic)`` for observation(s) ``x``.

        Returns:
            actor: log-probabilities over actions (``log_softmax`` output),
                shape ``(..., n_actions)``. These are *log*-probabilities, so
                build a distribution with ``Categorical(logits=actor)``.
            critic: ``tanh``-squashed value estimate in ``[-1, 1]``,
                shape ``(..., 1)``.
        """
        # L2-normalise the features of each sample (last dim), not across samples.
        x = F.normalize(x,dim=-1)
        y = F.relu(self.l1(x))
        y = F.normalize(y,dim=-1)
        y = F.relu(self.l2(y))
        y = F.normalize(y,dim=-1)
        actor = F.log_softmax(self.actor_lin1(y),dim=-1)
        # detach(): the critic loss must not back-propagate into the shared trunk.
        c = F.relu(self.l3(y.detach()))
        critic = torch.tanh(self.critic_lin1(c))
        return actor, critic

In [None]:
# Actor-critic training loop: roll out a short episode, then take one
# gradient step on the combined actor + critic loss.
tb = SummaryWriter()
env = gym.make('PongNoFrameskip-v0')
# Maps the policy's output index to the action id handed to env.step().
moveMapping = {
    0:2,
    1:3
}

model = ActorCritic()
optimizer = optim.Adam(lr=1e-4,params=model.parameters())

model.train()

gamma = 0.90  # discount factor applied to future rewards
clc = 0.1     # weight of the critic loss in the total loss

for i_episode in range(20000):
    print('Epoch {}'.format(i_episode))
    # Per-step tensors are kept (not converted to Python floats) so that the
    # loss stays connected to the model parameters through autograd.
    values = []
    rewards = []
    logprobs = []
    observation = env.reset()
    print('---------------')
    done = False
    N = 0
    # NOTE(review): the rollout is capped at 10 steps per episode - confirm
    # this is intended and not a leftover debugging limit.
    while not done and N < 10:
        N += 1
        pobservation = torch.from_numpy(observation)
        flattened_pobservation = pobservation.view(-1).float()
        policy, value = model(flattened_pobservation)
        values.append(value.view(-1))
        # `policy` holds log-probabilities, so it must be passed as logits;
        # passing it as `probs` feeds negative numbers to Categorical.
        sampler = Categorical(logits=policy)
        action = sampler.sample()
        logprobs.append(sampler.log_prob(action))
        observation, reward, done, log = env.step(moveMapping[action.item()])
        # NOTE(review): the environment's own `reward` is ignored; a survival
        # signal is used instead (+1 per step, -1 on the terminal step).
        rewards.append(-1.0 if done else 1.0)

    torch_values = torch.cat(values)         # shape (T,), carries grad
    torch_logprobs = torch.stack(logprobs)   # shape (T,), carries grad

    # Discounted returns, accumulated from the last step backwards and then
    # put back into chronological order to line up with values/logprobs.
    returns = []
    ret = 0.0
    for r in reversed(rewards):
        ret = r + gamma*ret
        returns.append(ret)
    returns.reverse()
    # Returns are targets only: a plain tensor with no gradient.
    returns = F.normalize(torch.tensor(returns),dim=0)

    # Advantage uses a detached value so the actor loss does not train the critic.
    actor_loss = -1*torch_logprobs * (returns - torch_values.detach())
    critic_loss = torch.pow(torch_values - returns,2)
    loss = actor_loss.sum() + clc*critic_loss.sum()
    tb.add_scalar('Loss',loss.item(),i_episode)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Make sure buffered scalars reach disk.
tb.flush()
print('Completed Training')

In [None]:
import time
# Play one rendered episode with the trained `model` and print the elapsed
# wall-clock time in seconds.
observation = env.reset()
model.eval()
done = False
a = time.time()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    while not done:
        pobservation = torch.from_numpy(observation)
        flattened_pobservation = pobservation.view(-1).float()
        policy, value = model(flattened_pobservation)
        # The model returns log-probabilities, so pass them as logits.
        sampler = Categorical(logits=policy)
        action = sampler.sample()
        observation, reward, done, log = env.step(moveMapping[action.item()])
        env.render()

# The loop only exits once the episode is done.
env.close()

b = time.time()-a
print(b)

In [None]:
# Play one rendered episode with a freshly initialised (untrained) network
# `m1` and print the elapsed wall-clock time in seconds.
m1 = ActorCritic()
observation = env.reset()
import time
m1.eval()
done = False
a = time.time()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    while not done:
        pobservation = torch.from_numpy(observation)
        flattened_pobservation = pobservation.view(-1).float()
        policy, value = m1(flattened_pobservation)
        # The model returns log-probabilities, so pass them as logits.
        sampler = Categorical(logits=policy)
        action = sampler.sample()
        observation, reward, done, log = env.step(moveMapping[action.item()])
        env.render()

# The loop only exits once the episode is done.
env.close()

b = time.time()-a
print(b)

Parallelizing training

In [7]:
import concurrent.futures
import torch
import torch.nn as nn
import torch.optim as optimizer
from torch.distributions import Categorical

class mymodel(nn.Module):
    """Single linear layer (3 inputs -> 2 outputs) followed by a softmax."""
    def __init__(self):
        super(mymodel,self).__init__()
        self.weight = nn.Linear(3,2)
        
    def forward(self, X):
        """Return a probability distribution over the 2 outputs for each sample.

        ``X`` may be a single sample of shape ``(3,)`` or a batch of shape
        ``(N, 3)``. The softmax runs over the last dimension so that every
        row of the result sums to 1.
        """
        out = self.weight(X)
        # dim=-1, not dim=0: normalise across the outputs of each sample,
        # not across the samples of a batch.
        out = torch.softmax(out, dim=-1)
        return out

    
def doTrain(model, X):
    """Instantiate ``model`` and return its parameters as a list.

    Args:
        model: a zero-argument callable (e.g. an ``nn.Module`` subclass)
            that builds the network.
        X: accepted for the worker signature; not used by this function.

    Returns:
        list of the freshly built network's parameters.
    """
    return [param for param in model().parameters()]

# 20 random samples with 3 features each, handed out in 5 slices of 4 rows.
X = torch.randn(20,3)

results = []        # futures, in submission order
updatedParams = []  # per-worker parameter lists, in completion order

with concurrent.futures.ProcessPoolExecutor() as executor:
    # One doTrain job per 4-row slice of X.
    results.extend(
        executor.submit(doTrain, mymodel, X[lo:lo + 4]) for lo in range(0, 20, 4)
    )
    # Collect each worker's result as soon as it finishes.
    updatedParams.extend(
        future.result() for future in concurrent.futures.as_completed(results)
    )





In [None]:
 optim = optimizer.Adam(lr=1e-3, params = a1.parameters())
    for i in range(15):
        optim.zero_grad()
        out = a1(X)
        sampler = Categorical(out)
        action = sampler.sample()
        loss = 0.
        for ind in range(action.shape[0]):
            loss+=out[ind][action[ind]].item()
        loss = torch.tensor(loss, requires_grad= True)
        loss.backward()
        optim.step()

In [8]:
# Display the parameter lists collected from the worker processes.
updatedParams

[[Parameter containing:
  tensor([[ 0.1995,  0.1247, -0.0875],
          [-0.2669, -0.3425,  0.3404]], requires_grad=True),
  Parameter containing:
  tensor([ 0.3416, -0.4114], requires_grad=True)],
 [Parameter containing:
  tensor([[ 0.1995,  0.1247, -0.0875],
          [-0.2669, -0.3425,  0.3404]], requires_grad=True),
  Parameter containing:
  tensor([ 0.3416, -0.4114], requires_grad=True)],
 [Parameter containing:
  tensor([[ 0.1995,  0.1247, -0.0875],
          [-0.2669, -0.3425,  0.3404]], requires_grad=True),
  Parameter containing:
  tensor([ 0.3416, -0.4114], requires_grad=True)],
 [Parameter containing:
  tensor([[ 0.1995,  0.1247, -0.0875],
          [-0.2669, -0.3425,  0.3404]], requires_grad=True),
  Parameter containing:
  tensor([ 0.3416, -0.4114], requires_grad=True)],
 [Parameter containing:
  tensor([[ 0.1995,  0.1247, -0.0875],
          [-0.2669, -0.3425,  0.3404]], requires_grad=True),
  Parameter containing:
  tensor([ 0.3416, -0.4114], requires_grad=True)]]

In [None]:
# Average each parameter across however many workers reported back, rather
# than assuming exactly five. In each worker's list, index 0 is the linear
# layer's weight and index 1 is its bias.
updatedWts = sum(params[0] for params in updatedParams) / len(updatedParams)
updatedBias = sum(params[1] for params in updatedParams) / len(updatedParams)


In [None]:
# Display the averaged weight and bias tensors.
updatedWts, updatedBias

In [None]:
############################ Experiments ###############################################

In [None]:
# from torch.distributions import Categorical
# a = torch.tensor([20,20,20,20,20]).float()
# # Note that this is equivalent to what used to be called multinomial
# m = Categorical(a)

# li = [0,0,0,0,0]


# for i in range(100):
#     li[m.sample().item()] +=1
    
# li

In [None]:
# a = torch.tensor(3)
# a.item()


In [None]:
#############################################################################################

In [None]:
# Shape of the most recent observation returned by the environment.
observation.shape