In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as op
import gym
import torch.nn.utils as nnutils

In [0]:
!pip install ptan



In [0]:
import ptan

In [0]:
# Hyperparameters shared by the cells below.
gamma = 0.99         # discount factor (experience source and n-step bootstrap in unpackbatch)
lr = 0.001           # NOTE(review): the optimizer cell hard-codes lr=0.0001 and does not read this value
entropybeta = 0.01   # weight of the entropy term in the training loss
batchsize = 128      # transitions collected before each optimisation step
numenvs = 50         # number of environments created for the experience source
rewardsteps = 4      # steps_count of the experience source; also the power of gamma when bootstrapping
clip = 0.1           # gradient-clipping threshold used in the training loop

In [0]:
# Single raw Pong environment; later cells read its action-space size and wrap it with wrap_dqn.
env = gym.make("PongNoFrameskip-v4")

In [0]:
class scaleobs(gym.ObservationWrapper):
  """Observation wrapper that returns each observation as float32 divided by 255.0."""

  def __init__(self,env):
    super().__init__(env)
    # Kept from the original: store the wrapped environment on the instance.
    self.env=env

  def observation(self,x):
    """Convert observation `x` to a float32 array and divide it by 255.0."""
    frame=np.array(x)
    return frame.astype(np.float32)/255.0

In [0]:
# Network input shape; model uses obsdim[0] as the in_channels of its first convolution.
# Presumably the stacked 84x84 frames produced by wrap_dqn -- TODO confirm against the wrapper.
obsdim = [4, 84, 84]
# Number of discrete actions, used as the width of the policy head.
acdim = env.action_space.n

In [0]:
class model(nn.Module):
  """Convolutional actor-critic network: shared trunk, policy head and value head.

  Args:
    obsdim: observation shape as (channels, height, width); obsdim[0] is the
      number of input channels of the first convolution.
    acdim: number of outputs of the policy head.
  """

  def __init__(self,obsdim,acdim):
    super().__init__()
    self.obsdim=obsdim
    self.cnn1=nn.Conv2d(self.obsdim[0],32,8,stride=4)
    self.cnn2=nn.Conv2d(32,64,4,stride=2)
    self.cnn3=nn.Conv2d(64,128,4,stride=2)
    # Size of the flattened conv output, probed with a dummy forward pass.
    outdim=self.getdim()
    self.fc1=nn.Linear(outdim,128)
    self.fc2=nn.Linear(128,acdim)   # policy head (raw logits; softmax is applied by the callers)
    self.fc3=nn.Linear(128,1)       # value head

  def forward(self,x):
    """Return (policy logits of shape (N, acdim), state values of shape (N, 1))."""
    x=x.float()   # accept integer frames as well
    out=f.leaky_relu(self.cnn1(x))
    out=f.leaky_relu(self.cnn2(out))
    out=f.leaky_relu(self.cnn3(out))
    # Functional flatten: no need to build an nn.Flatten module on every call.
    out=torch.flatten(out,1)
    out=f.leaky_relu(self.fc1(out))
    return self.fc2(out),self.fc3(out)

  def getdim(self):
    """Return the number of features produced by the conv stack for one observation."""
    # Shape probe only, so skip autograd bookkeeping.
    with torch.no_grad():
      zer=torch.zeros(1,*self.obsdim)
      out=self.cnn3(self.cnn2(self.cnn1(zer)))
    return out.shape[1]*out.shape[2]*out.shape[3]

In [0]:
# Wrap the single environment with ptan's wrap_dqn preprocessing.
# NOTE(review): no later visible cell references `env`; the training environments are built by makeenv.
env = ptan.common.wrappers.wrap_dqn(env)



In [0]:
# Instantiate the actor-critic network for the configured observation shape and action count.
net = model(obsdim, acdim)

In [0]:
# Smoke test: push a random batch of two observations through the network.
a = torch.randn((2, 4, 84, 84))
t = net(a)

In [0]:
# Shape of the value output (second element of the pair) after selecting column 0.
t[1][:, 0].shape

torch.Size([2])

In [0]:
def unpackbatch(batch,net,device,discount=None,nsteps=None):
  """Convert a batch of first/last experience entries into training tensors.

  Args:
    batch: iterable of entries exposing `.state`, `.action`, `.reward` and
      `.last_state`. `last_state` is expected to be None for entries whose
      episode ended (ptan ExperienceSourceFirstLast convention -- confirm).
    net: callable returning a `(policy_output, state_values)` pair; only the
      values, indexed as `[:, 0]`, are used to bootstrap non-terminal entries.
    device: accepted for interface compatibility but not used; all tensors are
      created on the CPU, as before. NOTE(review): honour it once the network
      itself is moved to `device`.
    discount: discount factor; defaults to the notebook-level `gamma`.
    nsteps: exponent applied to the discount; defaults to the notebook-level
      `rewardsteps`.

  Returns:
    (states, actions, ref_vals): float tensor of states, long tensor of
    actions, and float tensor of targets
    `reward + discount**nsteps * V(last_state)` (just `reward` for entries
    without a last state).
  """
  states=[]
  actions=[]
  rewards=[]
  laststates=[]
  notdoneidx=[]
  for idx,e in enumerate(batch):
    # np.asarray instead of np.array(copy=False): the latter raises on NumPy 2 when a copy is required.
    states.append(np.asarray(e.state))
    actions.append(e.action)
    rewards.append(e.reward)

    # Test the entry's own last_state; the accumulator list is never None,
    # so testing it bootstrapped terminal entries as well.
    if e.last_state is not None:
      notdoneidx.append(idx)
      laststates.append(np.asarray(e.last_state))

  # Stack into one ndarray first; building a tensor from a list of arrays is slow.
  batchst=torch.FloatTensor(np.asarray(states,dtype=np.float32))
  batchac=torch.LongTensor(actions)
  rewardarr=np.array(rewards,dtype=np.float32)

  if notdoneidx:
    # Resolve the defaults lazily so an all-terminal batch never touches the globals.
    if discount is None:
      discount=gamma
    if nsteps is None:
      nsteps=rewardsteps
    batchls=torch.FloatTensor(np.asarray(laststates,dtype=np.float32))
    # Bootstrap values are constants for the loss: no gradient needed.
    with torch.no_grad():
      laststate_v=net(batchls)[1]
    laststate_np=laststate_v.detach().cpu().numpy()[:,0]
    # Bellman backup: target = n-step reward + discount**nsteps * V(last_state).
    rewardarr[notdoneidx]+=(discount**nsteps)*laststate_np

  ref_vals_v=torch.FloatTensor(rewardarr)
  return batchst,batchac,ref_vals_v

In [0]:
def makeenv():
  """Build one Pong environment: gym.make, then wrap_dqn, then scaleobs."""
  return scaleobs(ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4")))

# One separate environment instance per entry, numenvs in total.
envs=[makeenv() for _ in range(numenvs)]



In [0]:
# The agent only sees the policy output (first element of net's pair).
# apply_softmax=True presumably makes ptan convert those logits to probabilities -- confirm in ptan docs.
agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], apply_softmax=True)

In [0]:
# PolicyAgent takes a callable as its model argument, which is why net's policy output is exposed through a lambda.

In [0]:
# Experience source over all environments, driven by the agent, with steps_count=rewardsteps.
expsource = ptan.experience.ExperienceSourceFirstLast(
  envs,
  agent,
  gamma,
  steps_count=rewardsteps,
)

In [0]:
# Adam over all network parameters.
# NOTE(review): the rate is hard-coded here; the `lr` hyperparameter defined earlier (0.001) is not used.
optimizer = op.Adam(net.parameters(), lr=1e-4)

In [0]:
# batchsize, entropybeta and clip are already set in the hyperparameter cell near the top;
# re-assigning them here silently overrode any edit made there, so only the device is defined.
# NOTE(review): no visible cell moves the network or the tensors to this device.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
# (Stray undefined name removed: it raised NameError and aborted "Run All" before the training loop.)

In [0]:
# A2C training loop: collect `batchsize` transitions, then take one optimisation step.
# NOTE(review): there is no stopping criterion; the loop runs until interrupted.
batch=[]
for e in expsource:
  batch.append(e)
  # Still called on every step, as before; the result is not used in this cell.
  newrewards=expsource.pop_total_rewards()
  if len(batch)<batchsize:
    continue

  states,actionsv,refv=unpackbatch(batch,net,device)
  batch.clear()

  optimizer.zero_grad()
  logits,val=net(states)
  # (N, 1) -> (N,): keeps val aligned with refv instead of broadcasting to (N, N).
  val=val.squeeze(-1)

  # Critic: regress V(s) onto the bootstrapped targets.
  # nn.MSELoss(a, b) would construct a module rather than compute a loss.
  lossvalues=f.mse_loss(val,refv)

  # Actor: log-probability of the taken action weighted by the advantage,
  # negated because the optimiser minimises.
  logprobv=f.log_softmax(logits,dim=1)
  adv_v=refv-val.detach()
  logprobactionsv=adv_v*logprobv[range(len(actionsv)),actionsv]
  losspolicyv=-logprobactionsv.mean()

  # sum(p * log p) is the negative entropy, so adding it to the loss favours higher entropy.
  prob=f.softmax(logits,dim=1)
  entropyv=entropybeta*((prob*logprobv).sum(dim=1).mean())

  # One backward pass over the summed loss; gradients add up exactly as two
  # separate passes with retain_graph would.
  lossv=losspolicyv+entropyv+lossvalues
  lossv.backward()

  # clip_grad_norm_ is the torch.nn.utils function; `clip_grad_` does not exist.
  nnutils.clip_grad_norm_(net.parameters(),clip)
  optimizer.step()
