In [1]:
import torch
import gym
from gym import envs
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
# from torch.autograd import Variable
from torch.distributions import Categorical
# from torchvision import transforms
# from torch.utils.tensorboard import SummaryWriter
import concurrent.futures


In [7]:
lenobs = 100800
class ActorCritic(nn.Module):
    def __init__(self, ran):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(ran)
        self.l1 = nn.Linear(lenobs,25)
        self.l2 = nn.Linear(25,50)
        self.actor_lin1 = nn.Linear(50,6)
        self.l3 = nn.Linear(50,25)
        self.critic_lin1 = nn.Linear(25,1)

    def forward(self,x):
        x = F.normalize(x,dim=0)
        y = F.relu(self.l1(x))
        y = F.normalize(y,dim=0)
        y = F.relu(self.l2(y))
        y = F.normalize(y,dim=0)
#         actor = F.softmax(self.actor_lin1(y),dim=0)
        actor = F.log_softmax(self.actor_lin1(y),dim=0)
        c = F.relu(self.l3(y.detach()))
        critic = F.hardtanh(self.critic_lin1(c))
        return actor, critic

In [23]:
def doTrain(model, ran):  
    
    env = gym.make('Pong-v0')
    mi = model(ran)
    optimizer = optim.Adam(lr=1e-4,params=mi.parameters())
    
    values = []
    rewards = []
    logprobs = []
    observation = env.reset()
    done = False
    N = 0
    while done == False and N<10:
        N+=1
        pobservation = torch.from_numpy(observation)
        flattened_pobservation = pobservation.view(-1).float()
        policy, value = mi(flattened_pobservation)
        values.append(value.item())
        sampler = Categorical(policy)
        action = sampler.sample()
        logprobs.append(policy[action.item()].item())
        observation, reward, done, log = env.step(action.item())
        if done:
            rewards.append(1.0)
        else:
            rewards.append(reward)
        
    torch_values = torch.tensor(values, requires_grad = True).view(-1).flip(0)
    torch_rewards = torch.tensor(rewards, requires_grad = True).flip(0)
    torch_logprobs = torch.tensor(logprobs, requires_grad = True).flip(0)
    
    returns = []
    gamma = 0.90
    clc = 0.1
    ret = 0.0
    for r in torch_rewards:
        ret = r + gamma*ret
        returns.append(ret)
    returns = torch.tensor(returns, requires_grad = True)
    returns = F.normalize(returns,dim=0)
    actor_loss = -1*torch_logprobs * (returns - torch_values.detach())
    critic_loss = torch.pow(torch_values - returns,2)
    loss = actor_loss.sum() + clc*critic_loss.sum()
    optimizer.zero_grad()
    loss.backward()
    gradients = []
    
    f = open('grads.txt','w')
    for i in mi.named_parameters():
#         gradients.append(i.grad)
        try:
            f.write(str(i.grad))
        except:
            f.write('\n***NoGrad***')
    
    
    f.close()
    
    optimizer.step()
    return gradients


In [24]:
updatedParams = []
results = []


with concurrent.futures.ProcessPoolExecutor() as executor:
    for i in range(5):
        results.append(executor.submit(doTrain, ActorCritic, int((torch.randn(1)**2)*200)))
    

    for f in concurrent.futures.as_completed(results):
        updatedParams.append(f.result())


In [25]:
updatedParams

[[], [], [], [], []]

In [None]:
class mymodel(nn.Module):
    def __init__(self):
        super(mymodel, self).__init__()
        self.w1 = nn.Linear(3,2)
        
    def forward(self, X):
        out = self.w1(X)
        return out
    
    
a1 = mymodel()
op = optim.Adam(lr=1e-3, params = a1.parameters())

    
X = torch.randn(3)
out = a1(X)
op.zero_grad()
loss = out.sum()
loss.backward()
print(loss)
op.step()

In [None]:
g = []

In [None]:
for i in a1.parameters():
    g.append(i.grad)

In [2]:
def dTrain(model, X):
    mi = model()
    mi.train()
    out = mi(X)
    optimizer = optim.Adam(lr=1e-4,params=mi.parameters())
    optimizer.zero_grad()
    loss = out.sum()
    loss.backward()
    optimizer.step()
    
    gradients = []
    for i in mi.parameters():
        gradients.append(i.grad)
    
    return gradients

In [3]:
class mymodel(nn.Module):
    def __init__(self):
        super(mymodel, self).__init__()
        self.w1 = nn.Linear(3,2)
        
    def forward(self, X):
        out = self.w1(X)
        return out


    
updatedParams = []
results = []


with concurrent.futures.ProcessPoolExecutor() as executor:
    for i in range(5):
        X = torch.randn(3)
        results.append(executor.submit(dTrain, mymodel, X))
    

    for f in concurrent.futures.as_completed(results):
        updatedParams.append(f.result())


In [4]:
updatedParams[0], updatedParams[1], updatedParams[2]

([tensor([[-1.5693, -0.8370,  0.2175],
          [-1.5693, -0.8370,  0.2175]]), tensor([1., 1.])],
 [tensor([[ 1.3216, -0.2651, -0.7811],
          [ 1.3216, -0.2651, -0.7811]]), tensor([1., 1.])],
 [tensor([[-0.0011, -0.5416, -0.1722],
          [-0.0011, -0.5416, -0.1722]]), tensor([1., 1.])])