In [0]:
import torch
import torch.nn as n
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as op
import torch.nn.functional as f
import torch.nn.utils as nutils
import gym

In [0]:
!pip install ptan

Collecting ptan
  Downloading https://files.pythonhosted.org/packages/91/cb/57f6d86625f2b24c008b0524ca29559683aa75d00afa38b6b44d7fcad25b/ptan-0.6.tar.gz
Collecting torch==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/50a05de5337f7a924bb8bd70c6936230642233e424d6a9747ef1cfbde353/torch-1.3.0-cp36-cp36m-manylinux1_x86_64.whl (773.1MB)
[K     |████████████████████████████████| 773.1MB 22kB/s 
Building wheels for collected packages: ptan
  Building wheel for ptan (setup.py) ... [?25l[?25hdone
  Created wheel for ptan: filename=ptan-0.6-cp36-none-any.whl size=23502 sha256=3b43308e465a88d9168c437d776cfcb7d22a88fccf9eb16b7797b8d6721f6897
  Stored in directory: /root/.cache/pip/wheels/f0/4b/2f/9a45fd39b0a614a2716bc6128a7f1adb4647f323a2d90783f2
Successfully built ptan
[31mERROR: torchvision 0.5.0 has requirement torch==1.4.0, but you'll have torch 1.3.0 which is incompatible.[0m
Installing collected packages: torch, ptan
  Found existing installation: torch 1.4.0
  

In [0]:
import ptan

In [0]:
GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
NUM_ENVS = 50

REWARD_STEPS = 4
CLIP_GRAD = 0.1

In [0]:
class net(n.Module):
  def __init__(self,obsshape,naction):
    super().__init__()
    self.cnn1=n.Conv2d(obsshape[0],32,kernel_size=8,stride=4)
    self.cnn2=n.Conv2d(32,64,kernel_size=8,stride=4)
    self.cnn3=n.Conv2d(64,128,kernel_size=4,stride=2)
    dim1=self.outshape(obsshape)
    self.fc1=n.Linear(dim1,128)
    self.fc2=n.Linear(128,naction)
    self.fc3=n.Linear(dim1,128)
    self.fc4=n.Linear(128,1)

  def forward(self,x):
    out=f.relu(self.cnn1(x))
    out=f.relu(self.cnn2(out))
    out=f.relu(self.cnn3(out))
    out1=n.Flatten(start_dim=1,end_dim=-1)(out)
    out2=f.relu(self.fc1(out1))
    out2=self.fc2(out2)

    out3=f.relu(self.fc3(out1))
    out3=self.fc3(out3)
    return out2,out3
  
  def outshape(self,obsshape):
    z=torch.zeros(1,*obsshape)
    out=self.cnn3(self.cnn2(self.cnn1(z)))
    return out.shape[1]*out.shape[2]*out.shape[3]

In [0]:
def unpackbatch(batch,net,skipsteps):
  batchstates=[]
  batchactions=[]
  batchrewards=[]
  batchlaststates=[]
  notidx=[]
  count=0
  for e in batch:
    batchstates.append(np.array(e.state,copy=False))
    batchactions.append(e.action)
    batchrewards.append(e.reward)
    if e.last_state is not None:
      notidx.append(count)
      batchlaststates.append(np.array(e.last_state,copy=False)) # copy=False doesnt copy array into new memory location use same

    count+=1
  
  statesv=torch.FloatTensor(batchstates)
  actionsv=torch.LongTensor(batchactions)
  rewardsv=np.array(batchrewards,copy=False)
  laststate=torch.FloatTensor(batchlaststates)

  if notidx:
    batchlstv=net.forward(laststate)[1]
    batchlstv=batchlstv.data.cpu().numpy()[:,0]
    rewardsv[notidx]+=(gamma**skipsteps)*batchlstv
  
  rewardstv=torch.FloatTensor(rewardsv)

  return statesv,actionsv,rewardstv

In [0]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
class scaledframe(gym.ObservationWrapper):
  def __init_(self,env):
    super().__init__(env)
    self.env=env
  
  def observation(self,x):
    return np.array(x).astype(np.float32)/255

In [0]:
makeenv=lambda : scaledframe(ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4")))

In [0]:
gamma=0.99
skipsteps=10
batchsize=128


In [0]:
numenvs=10
model=net([4,84,84],6)

In [0]:
envs=[makeenv() for _ in range(numenvs)]



In [0]:
agent=ptan.agent.PolicyAgent(lambda x:model.forward(x)[0],apply_softmax=True,preprocessor=ptan.agent.float32_preprocessor)

In [0]:
expsource=ptan.experience.ExperienceSourceFirstLast(envs,agent,gamma=gamma,steps_count=skipsteps)

In [0]:
opt=op.Adam(model.parameters(),lr=0.001)

In [0]:
batchsize=128
entropybeta=0.01
clipgrad=0.1

In [0]:
batch=[]
for e in expsource:
  batch.append(e)

  if len(batch)<batchsize:
    continue
  
  batchs,batcha,batchr=unpackbatch(batch,model,skipsteps)
  batch.clear()

  opt.zero_grad()
  logits,vals=model.forward(batchs)
  lossvaluesv=f.mse_loss(vals.squeeze(-1),batchr)

  logprobv=f.log_softmax(logits,dim=1)
  adv=batchr-vals     # Q=r+gamma*V   # very important batchr represent actionvalue , vals represent value of state
  logprobaction_v=adv*logprobv[range(batchsize),batcha]
  losspolicyv=logprobaction_v.mean()

  prob=f.softmax(logits,dim=1)
  entropyloss=-entropybeta*((prob*logprobv).sum(dim=1)).mean()

  losspolicyv.backward(retain_graph=True)

  totalloss=lossvaluesv+entropyloss
  totalloss.backward()

  nutils.clip_grad_norm_(model.parameters(),clipgrad)
  opt.step()
  print(totalloss.item())


  del sys.path[0]


0.4140785336494446
0.02603548765182495
0.3954366147518158
0.4624864459037781
0.3866102397441864
1.4562320709228516
1.9308923482894897
4.245213985443115
8.219496726989746
12.11728572845459
20.38503646850586
26.304241180419922
33.56495666503906
43.41205596923828
48.45710754394531
51.72475051879883
51.77461624145508
48.644840240478516
45.16239929199219
44.94396209716797
53.738765716552734
60.512046813964844
73.09741973876953
79.88333129882812
71.29640197753906
63.684146881103516
47.45795822143555
29.648761749267578
21.83549690246582
12.172743797302246
5.974066257476807
3.566744804382324
1.1588586568832397
0.587730884552002
0.30282923579216003
0.41609227657318115
0.24525751173496246
0.1498563587665558
0.34076976776123047
0.09283381700515747
0.3043673038482666
0.37143757939338684
0.07975877821445465
0.5815410017967224
0.4675080180168152
0.29553452134132385
1.1308178901672363
1.0875436067581177
1.8001641035079956
3.5593910217285156
4.7494330406188965
8.958335876464844
13.179726600646973
19.0

KeyboardInterrupt: ignored