Utils.py containing ReplayBuffer

In [None]:
import torch

import collections
import random

class ReplayBuffer():
    """Bounded FIFO store of transitions with uniform random mini-batch sampling.

    Each stored transition is a 5-tuple
    ``(state, actions, reward, next_state, done_mask)`` where ``actions`` is
    indexable by action dimension (``0 .. action_space - 1``).
    """

    def __init__(self,buffer_limit,action_space,device):
        """Create an empty buffer.

        Args:
            buffer_limit: Maximum number of transitions kept; once full, the
                oldest transition is dropped on every new ``put``.
            action_space: Number of action dimensions read from each
                transition's ``actions`` entry.
            device: Device the sampled tensors are moved to.
        """
        self.device = device
        self.action_space = action_space
        self.buffer = collections.deque(maxlen=buffer_limit)

    def _to_device(self, data, **tensor_kwargs):
        """Build a tensor from ``data`` and move it to ``self.device``."""
        return torch.tensor(data, **tensor_kwargs).to(self.device)

    def put(self, transition):
        """Append one transition to the buffer."""
        self.buffer.append(transition)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, done_masks)``:
            ``states`` and ``next_states`` are float tensors with one row per
            transition; ``actions`` is a list with one float tensor of length
            ``n`` per action dimension; ``rewards`` and ``done_masks`` are
            ``(n, 1)`` tensors whose dtype is inferred from the stored values.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``n`` exceeds
                the number of stored transitions.
        """
        batch = random.sample(self.buffer, n)

        states, rewards, next_states, done_masks = [], [], [], []
        # One column per action dimension, filled row by row below.
        per_dim_actions = [[] for _ in range(self.action_space)]

        for state, actions, reward, next_state, done_mask in batch:
            states.append(state)
            for dim, column in enumerate(per_dim_actions):
                column.append(actions[dim])
            # Wrapped in a list so the resulting tensors are column vectors.
            rewards.append([reward])
            next_states.append(next_state)
            done_masks.append([done_mask])

        action_tensors = [
            self._to_device(column, dtype=torch.float)
            for column in per_dim_actions
        ]
        return (
            self._to_device(states, dtype=torch.float),
            action_tensors,
            self._to_device(rewards),
            self._to_device(next_states, dtype=torch.float),
            self._to_device(done_masks),
        )

network.py containing QNetwork


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Shared encoder feeding one Q-value head per action dimension plus a value head.

    ``forward`` returns a list with one tensor per action dimension; each
    tensor holds ``action_scale`` values per input row.
    """

    def __init__(self,state_space : int, action_num : int,action_scale : int):
        """Build the encoder, the per-dimension action heads and the value head.

        Args:
            state_space: Size of the input feature vector; all hidden widths
                are multiples of it.
            action_num: Number of action heads.
            action_scale: Number of outputs produced by each action head.
        """
        super().__init__()
        wide = state_space * 20
        narrow = state_space * 10
        head_hidden = state_space * 5

        def make_head(out_features):
            # Two-layer MLP used by both the action heads and the value head.
            return nn.Sequential(
                nn.Linear(narrow, head_hidden),
                nn.ReLU(),
                nn.Linear(head_hidden, out_features),
            )

        # Layers are created in the order encoder -> action heads -> value head.
        self.linear_1 = nn.Linear(state_space, wide)
        self.linear_2 = nn.Linear(wide, narrow)
        self.actions = nn.ModuleList([make_head(action_scale) for _ in range(action_num)])
        self.value = make_head(1)

    def forward(self,x):
        """Return the list of per-dimension Q-value tensors for ``x``.

        For every head the row-wise maximum is subtracted from the head's
        output and the shared value estimate is added, so the largest entry
        of each row equals the value head's output for that row.
        The ``reshape(-1, 1)`` below presumes ``x`` is 2-D (batch, features).
        """
        hidden = F.relu(self.linear_1(x))
        encoded = F.relu(self.linear_2(hidden))
        value = self.value(encoded)

        q_values = []
        for head in self.actions:
            advantage = head(encoded)
            row_best = advantage.max(-1)[0].reshape(-1, 1)
            q_values.append(advantage - row_best + value)
        return q_values

agent.py containing BQN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class BQN(nn.Module):
    """Agent holding an online ``QNetwork``, a target copy and their optimizer.

    The online network ``q`` is trained with Adam; ``target_q`` is a copy
    that is re-synchronised with ``q`` every ``update_freq`` training steps.
    """

    def __init__(self,state_space : int, action_num : int,action_scale : int, learning_rate, device : str):
        """Create both networks on ``device`` and set up the optimizer.

        Args:
            state_space: Size of the observation vector.
            action_num: Number of action dimensions (one head each).
            action_scale: Number of discrete choices per action dimension.
            learning_rate: Learning rate of the action heads; the shared
                encoder and the value head use ``learning_rate / (action_num + 2)``.
            device: Device the networks are moved to.
        """
        super(BQN,self).__init__()

        self.q = QNetwork(state_space, action_num,action_scale).to(device)
        self.target_q = QNetwork(state_space, action_num,action_scale).to(device)
        self.target_q.load_state_dict(self.q.state_dict())

        # Layers shared by all heads receive gradients from every head, so
        # they are given a proportionally smaller step size.
        shared_lr = learning_rate / (action_num + 2)
        self.optimizer = optim.Adam([
            {'params': self.q.linear_1.parameters(), 'lr': shared_lr},
            {'params': self.q.linear_2.parameters(), 'lr': shared_lr},
            {'params': self.q.value.parameters(), 'lr': shared_lr},
            {'params': self.q.actions.parameters(), 'lr': learning_rate},
        ])
        self.update_freq = 1000
        self.update_count = 0

    def action(self,x):
        """Return the online network's per-dimension Q-values for ``x``."""
        return self.q(x)

    def train_mode(self,n_epi,memory,batch_size,gamma,use_tensorboard,writer):
        """Run one gradient step on a mini-batch sampled from ``memory``.

        Args:
            n_epi: Episode index, used only as the TensorBoard step.
            memory: Object whose ``sample(batch_size)`` returns
                ``(state, actions, reward, next_state, done_mask)`` with
                ``actions`` a list of per-dimension index tensors.
            batch_size: Number of transitions to sample.
            gamma: Discount factor.
            use_tensorboard: When true, the loss is logged through ``writer``.
            writer: Object with ``add_scalar``; ignored unless
                ``use_tensorboard`` is true.

        Returns:
            The scalar MSE loss tensor of this step.
        """
        state, actions, reward, next_state, done_mask = memory.sample(batch_size)

        state = state.float()
        reward = reward.float()
        next_state = next_state.float()
        done_mask = done_mask.float()

        # (action_num, batch) -> (batch, action_num, 1) so it can index the
        # last axis of the stacked Q-values.
        actions = torch.stack(actions).transpose(0,1).unsqueeze(-1)
        # Flip the mask: 1 for transitions that continue, 0 for terminal ones.
        not_done = torch.abs(done_mask-1)

        cur_actions = torch.stack(self.q(state)).transpose(0,1)
        cur_actions = cur_actions.gather(2,actions.long()).squeeze(-1)

        # The target is a constant for this step: skip autograd so no graph is
        # built and no gradients accumulate on the target network's parameters.
        with torch.no_grad():
            next_q = torch.stack(self.target_q(next_state)).transpose(0,1)
            next_q = next_q.max(-1,keepdim = True)[0]
            # One shared target per transition: mean over action dimensions
            # of each dimension's best next-state value.
            target_action = not_done * gamma * next_q.mean(1) + reward

        # Broadcast the shared target to every action dimension (the width is
        # taken from the prediction instead of being hard-coded).
        loss = F.mse_loss(cur_actions,target_action.expand_as(cur_actions))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_count += 1
        if (self.update_count % self.update_freq == 0) and (self.update_count > 0):
            self.update_count = 0
            self.target_q.load_state_dict(self.q.state_dict())

        if use_tensorboard:
            writer.add_scalar("Loss/loss", loss, n_epi)
        return loss

Main.py

In [None]:
!pip install swig
!pip install gym[box2d]

^C
Collecting box2d-py==2.3.5 (from gym[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pygame==2.1.0 (from gym[box2d])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swig==4.* (from gym[box2d])
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit cod

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import os
import time
import random
import argparse

import gym

# Run options are defined directly in code (instead of being parsed from the
# command line) so the script can run inside a notebook cell.
class Args:
    """Configuration for the BipedalWalker training run below."""
    train = True          # run gradient updates once the replay buffer is warm
    render = False        # call env.render() on every step
    epochs = 2000         # number of training episodes
    tensorboard = False   # log loss and episode reward to TensorBoard
    lr_rate = 0.0001      # learning rate handed to the agent
    batch_size = 64       # transitions per gradient step
    gamma = 0.99          # discount factor
    action_scale = 6      # discrete choices per action dimension
    load = 'no'           # checkpoint name under ./model_weights, or 'no'
    save_interval = 100   # save a checkpoint every this many episodes
    print_interval = 1    # print the score every this many episodes

args = Args()

use_tensorboard = args.tensorboard
action_scale = args.action_scale
learning_rate = args.lr_rate
batch_size = args.batch_size
gamma = args.gamma

if use_tensorboard:
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()
else:
    writer = None
os.makedirs('./model_weights', exist_ok=True)


env = gym.make("BipedalWalker-v3")
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
print('observation space : ', env.observation_space)
print('action space : ', env.action_space)
print(env.action_space.low, env.action_space.high)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# BQN moves its networks to `device` itself, so no extra .cuda() call is needed.
agent = BQN(state_space, action_space, action_scale, learning_rate, device)
if args.load != 'no':
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    agent.load_state_dict(torch.load('./model_weights/'+args.load, map_location=device))
memory = ReplayBuffer(100000, action_space, device)
# Continuous value associated with each discrete action index.
real_action = np.linspace(-1., 1., action_scale)

for n_epi in range(args.epochs):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False
    score = 0.0
    # Exploration rate decays linearly with the episode index, floored at 0.01.
    epsilon = max(0.01, 0.9 - 0.01*(n_epi/10))
    while not done:
        if args.render:
            env.render()
        if epsilon > random.random():
            # Independent uniform choice per action dimension (with
            # replacement, so several dimensions may pick the same index).
            action = [random.randrange(action_scale) for _ in range(action_space)]
        else:
            # Inference only: no autograd graph is needed to pick an action.
            with torch.no_grad():
                action_prob = agent.action(torch.tensor(state).float().reshape(1,-1).to(device))
            action = [int(x.max(1)[1]) for x in action_prob]

        step_out = env.step(np.array([real_action[x] for x in action]))
        if len(step_out) == 5:
            # gym >= 0.26: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, info = step_out
            done = bool(terminated or truncated)
        else:
            # Older gym: (obs, reward, done, info).
            next_state, reward, done, info = step_out
            terminated = done
            done = bool(done)

        score += reward
        # Only a true terminal state is stored as "done"; a time-limit
        # truncation ends the episode but should still bootstrap.
        memory.put((state, action, reward, next_state, int(bool(terminated))))
        if (memory.size()>5000) and (args.train):
            agent.train_mode(n_epi, memory, batch_size, gamma, use_tensorboard, writer)
        state = next_state
    if use_tensorboard:
        writer.add_scalar("reward", score, n_epi)
    if (n_epi % args.save_interval == 0) and (n_epi > 0):
        torch.save(agent.state_dict(),'./model_weights/agent_'+str(n_epi))
    if (n_epi % args.print_interval == 0):
        print("epi : ",n_epi,", score : ",score)

if use_tensorboard:
    # Make sure the last logged scalars reach disk.
    writer.flush()


DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`