In [None]:
import gymnasium as gym
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
from torch.utils.tensorboard import SummaryWriter

In [None]:

class ReplayBuffer(object):
	def __init__(self, state_dim, action_dim, max_size=int(1e6)):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0
		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.not_done = np.zeros((max_size, 1))
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	def add(self, state, action, next_state, reward, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.not_done[self.ptr] = 1. - done
		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)


	def sample(self, batch_size):
		ind = np.random.randint(0, self.size, size=batch_size)
		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.not_done[ind]).to(self.device)
		)
	

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
print(torch.version.cuda)
print(device)

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 256).to(device)
        self.l2 = nn.Linear(256, 256).to(device)
        self.mean = nn.Linear(256, action_dim).to(device)
        self.log_std = nn.Linear(256, action_dim).to(device)
        self.max_action = max_action

    def forward(self, state):
        x = F.relu(self.l1(state)).to(device)
        x = F.relu(self.l2(x)).to(device)
        mean = self.mean(x)
        log_std = torch.clamp(self.log_std(x), -20, 2).to(device)  
        return mean, log_std

    def sample_action(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp() 
        normal = torch.distributions.Normal(mean, std)
        action = normal.rsample()  
        log_prob = normal.log_prob(action).sum(dim=-1)  
        action = torch.tanh(action).to(device)
        log_prob -= torch.log(1 - action.pow(2) + 1e-6).sum(dim=-1).to(device)  
        action = action * self.max_action
        return action, log_prob

class Critic(nn.Module):
	def __init__(self, state_dim, action_dim):
		super(Critic, self).__init__()
		self.l1 = nn.Linear(state_dim + action_dim, 256).to(device)
		self.l2 = nn.Linear(256, 256).to(device)
		self.l3 = nn.Linear(256, 1).to(device)
		self.l4 = nn.Linear(state_dim + action_dim, 256).to(device)
		self.l5 = nn.Linear(256, 256).to(device)
		self.l6 = nn.Linear(256, 1).to(device)

	def forward(self, state, action):
		sa = torch.cat([state, action], 1).to(device)
		q1 = F.relu(self.l1(sa)).to(device)
		q1 = F.relu(self.l2(q1)).to(device)
		q1 = self.l3(q1)
		q2 = F.relu(self.l4(sa)).to(device)
		q2 = F.relu(self.l5(q2)).to(device)
		q2 = self.l6(q2)
		return q1, q2


	def Q1(self, state, action):
		sa = torch.cat([state, action], 1).to(device)
		q1 = F.relu(self.l1(sa)).to(device)
		q1 = F.relu(self.l2(q1)).to(device)
		q1 = self.l3(q1)
		return q1


class SAC(object):
    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        discount=0.99,
        tau=0.005,
        alpha=0.2,
        target_entropy=None,
        actor_lr=3e-4,
        critic_lr=3e-4,
        alpha_lr=3e-4,
    ):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.log_alpha = torch.tensor(np.log(alpha), requires_grad=True, device=device)
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr)
        self.target_entropy = target_entropy if target_entropy is not None else -action_dim
        self.max_action = max_action
        self.discount = discount
        self.tau = tau

    @property
    def alpha(self):
        return self.log_alpha.exp()
    def select_action(self, state, deterministic=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        if deterministic:
            action, _ = self.actor(state) 
            return action.cpu().data.numpy().flatten()
        else:
            action, _ = self.actor.sample_action(state)
            return action.cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)
        with torch.no_grad():
            next_action, log_prob = self.actor.sample_action(next_state)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2) - self.alpha * log_prob
            target_Q = reward + not_done * self.discount * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        action, log_prob = self.actor.sample_action(state)
        q1, q2 = self.critic(state, action)
        actor_loss = (self.alpha * log_prob - torch.min(q1, q2)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)



True
11.8
cuda


In [2]:
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
if __name__ == "__main__":
    
    env_name = "BipedalWalker-v3"
    seed = 0
    start_timesteps = int(10e3)
    eval_freq = int(5e3)
    max_timesteps = int(1e6)
    batch_size = 256
    discount = 0.99
    tau = 0.005
    alpha = 0.2  
    save_model = True
    env = gym.make(env_name)
    env.reset(seed=seed)  
    env.action_space.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if not os.path.exists("./runs"):
        os.makedirs("./runs")
    
    writer = SummaryWriter(f"runs/{env_name}_SAC")

    policy = SAC(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        discount=discount,
        tau=tau,
        alpha=alpha
    )
    avg_rewards = []

    state, done, trunc = env.reset()[0], False, False  
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    for t in range(max_timesteps):
        episode_timesteps += 1

        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(state))  
        next_state, reward, done, trunc, _ = env.step(action)
        done_bool = float(done or trunc) if episode_timesteps < env.spec.max_episode_steps else 0

        replay_buffer.add(state, action, next_state, reward, done_bool)
        state = next_state
        episode_reward += reward

        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        writer.add_scalar("Reward/Timestep", episode_reward, t)    

        if done or trunc:
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            avg_rewards.append(episode_reward)
            state, done, trunc = env.reset()[0], False, False 
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            if episode_num %5 == 0:
                writer.add_scalar("Reward/Episode", sum(avg_rewards)/len(avg_rewards), t)
                avg_rewards = []
    writer.close()            


Total T: 59 Episode Num: 1 Episode T: 59 Reward: -108.465
Total T: 124 Episode Num: 2 Episode T: 65 Reward: -113.457
Total T: 1724 Episode Num: 3 Episode T: 1600 Reward: -82.656
Total T: 1837 Episode Num: 4 Episode T: 113 Reward: -106.184
Total T: 1927 Episode Num: 5 Episode T: 90 Reward: -116.472
Total T: 3527 Episode Num: 6 Episode T: 1600 Reward: -80.395
Total T: 3581 Episode Num: 7 Episode T: 54 Reward: -106.627
Total T: 3696 Episode Num: 8 Episode T: 115 Reward: -99.927
Total T: 3787 Episode Num: 9 Episode T: 91 Reward: -96.982
Total T: 3846 Episode Num: 10 Episode T: 59 Reward: -103.279
Total T: 3894 Episode Num: 11 Episode T: 48 Reward: -110.402
Total T: 5494 Episode Num: 12 Episode T: 1600 Reward: -73.905
Total T: 5579 Episode Num: 13 Episode T: 85 Reward: -99.266
Total T: 5655 Episode Num: 14 Episode T: 76 Reward: -105.410
Total T: 5705 Episode Num: 15 Episode T: 50 Reward: -111.717
Total T: 5776 Episode Num: 16 Episode T: 71 Reward: -102.234
Total T: 7376 Episode Num: 17 Epis

  critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)


Total T: 10772 Episode Num: 22 Episode T: 1600 Reward: -90.976
Total T: 11853 Episode Num: 23 Episode T: 1081 Reward: -183.671
Total T: 11900 Episode Num: 24 Episode T: 47 Reward: -106.839
Total T: 11950 Episode Num: 25 Episode T: 50 Reward: -106.344
Total T: 12018 Episode Num: 26 Episode T: 68 Reward: -114.177
Total T: 12069 Episode Num: 27 Episode T: 51 Reward: -104.321
Total T: 12123 Episode Num: 28 Episode T: 54 Reward: -108.301
Total T: 12229 Episode Num: 29 Episode T: 106 Reward: -117.027
Total T: 12303 Episode Num: 30 Episode T: 74 Reward: -109.014
Total T: 13903 Episode Num: 31 Episode T: 1600 Reward: -115.134
Total T: 13973 Episode Num: 32 Episode T: 70 Reward: -103.772
Total T: 14093 Episode Num: 33 Episode T: 120 Reward: -106.546
Total T: 14174 Episode Num: 34 Episode T: 81 Reward: -101.685
Total T: 14270 Episode Num: 35 Episode T: 96 Reward: -102.822
Total T: 14458 Episode Num: 36 Episode T: 188 Reward: -106.738
Total T: 14523 Episode Num: 37 Episode T: 65 Reward: -101.924
