In [1]:
import numpy as np
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F

class ReplayBuffer(object):
	def __init__(self, state_dim, action_dim, max_size=int(1e6)):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0

		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.not_done = np.zeros((max_size, 1))

		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	def add(self, state, action, next_state, reward, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.not_done[self.ptr] = 1. - done

		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)


	def sample(self, batch_size):
		ind = np.random.randint(0, self.size, size=batch_size)

		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.not_done[ind]).to(self.device)
		)
	

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
print(torch.version.cuda)
print(device)

#Defining the Critic Network
class CriticNetwork(nn.Module):
	def __init__(self, state_dim, action_dim,hidden_layer_dim1=400,hidden_layer_dim2=300,learning_rate=1e-3):
		super(CriticNetwork, self).__init__()

		#Setting the dimensions of hidden layers. Paper default is 400 for layer 1 and 300 for layer 2
		self.hidden_layer1 = hidden_layer_dim1
		self.hidden_layer2 = hidden_layer_dim2

		#Depends on the environment
		self.state_dim = state_dim
		self.action_dim = action_dim

		#Q1 architecture
		self.Q1_fc1 = nn.Linear(self.state_dim + self.action_dim, self.hidden_layer1).to(device)
		self.Q1_fc2 = nn.Linear(self.hidden_layer1, self.hidden_layer2).to(device)
		self.Q1_out = nn.Linear(self.hidden_layer2, 1).to(device)

		# Q2 architecture
		self.Q2_fc1 = nn.Linear(self.state_dim + self.action_dim, self.hidden_layer1).to(device)
		self.Q2_fc2 = nn.Linear(self.hidden_layer1, self.hidden_layer2).to(device)
		self.Q2_out = nn.Linear(self.hidden_layer2, 1).to(device)

		self.critic_optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)


	def forward(self, state, action):
		sa = torch.cat([state, action], 1).to(device)

		q1 = F.relu(self.Q1_fc1(sa)).to(device)
		q1 = F.relu(self.Q1_fc2(q1)).to(device)
		q1 = self.Q1_out(q1).to(device)

		q2 = F.relu(self.Q2_fc1(sa)).to(device)
		q2 = F.relu(self.Q2_fc2(q2)).to(device)
		q2 = self.Q2_out(q2).to(device)
		return q1, q2


	def Q1(self, state, action):
		q1 = F.relu(self.Q1_fc1(torch.cat([state, action], 1)))
		q1 = F.relu(self.Q1_fc2(q1))
		q1 = self.Q1_out(q1)
		return q1

#Defining the Actor Network
class ActorNetwork(nn.Module):
	def __init__(self, state_dim, action_dim, max_action, hidden_layer_dim1=400,hidden_layer_dim2=300,learning_rate=1e-3):
		super(ActorNetwork, self).__init__()

		#Setting the dimensions of hidden layers. Paper default is 400 for layer 1 and 300 for layer 2
		self.hidden_layer1 = hidden_layer_dim1
		self.hidden_layer2 = hidden_layer_dim2
		self.max_action = max_action

		#Depends on the environment
		self.state_dim = state_dim
		self.action_dim = action_dim

		#Defining the network
		self.fc1 = nn.Linear(self.state_dim, self.hidden_layer1).to(device)
		self.fc2 = nn.Linear(self.hidden_layer1, self.hidden_layer2).to(device)
		self.act_out = nn.Linear(self.hidden_layer2, self.action_dim).to(device)

		#Defining the optimizer
		self.actor_optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
		

	def forward(self, state):
		x = F.relu(self.fc1(state)).to(device)
		x = F.relu(self.fc2(x)).to(device)
		x = torch.tanh(self.act_out(x)).to(device)
		x = (self.max_action * x).to(device)
		return x
	

class TD3_Agent(object):
	def __init__(
		self,
		state_dim,
		action_dim,
		max_action,
		hidden_layer_dim1 = 400,
		hidden_layer_dim2 = 300,
		learning_rate = 1e-3,
		discount=0.99,
		tau=0.005,
		policy_noise=0.2,
		noise_clip=0.5,
		policy_freq=2
	):

		self.actor = ActorNetwork(state_dim, action_dim, max_action,hidden_layer_dim1,hidden_layer_dim2,learning_rate).to(device)
		self.actor_target = copy.deepcopy(self.actor)

		self.critic = CriticNetwork(state_dim, action_dim,hidden_layer_dim1,hidden_layer_dim2,learning_rate).to(device)
		self.critic_target = copy.deepcopy(self.critic)

		self.max_action = max_action
		self.discount = discount
		self.tau = tau
		self.policy_noise = policy_noise
		self.noise_clip = noise_clip
		self.policy_freq = policy_freq

		self.total_it = 0


	def select_action(self, state):
		state = torch.FloatTensor(state.reshape(1, -1)).to(device)
		return self.actor(state).cpu().data.numpy().flatten()


	def train(self, replay_buffer, batch_size=256):
		self.total_it += 1

		# Sample replay buffer 
		state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

		with torch.no_grad():
			# Select action according to policy and add clipped noise
			noise = (
				torch.randn_like(action) * self.policy_noise
			).clamp(-self.noise_clip, self.noise_clip)
			
			next_action = (
				self.actor_target(next_state) + noise
			).clamp(-self.max_action, self.max_action)

			# Compute the target Q value
			target_Q1, target_Q2 = self.critic_target(next_state, next_action)
			target_Q = torch.min(target_Q1, target_Q2)
			target_Q = reward + not_done * self.discount * target_Q

		# Get current Q estimates
		current_Q1, current_Q2 = self.critic(state, action)

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

		# Optimize the critic
		self.critic.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic.critic_optimizer.step()

		# Delayed policy updates
		if self.total_it % self.policy_freq == 0:

			# Compute actor losse
			actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
			
			# Optimize the actor 
			self.actor.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor.actor_optimizer.step()

			# Update the frozen target models
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


True
11.8
cuda


In [None]:
import gymnasium as gym
import numpy as np
import torch
import os
from torch.utils.tensorboard import SummaryWriter

if __name__ == "__main__":
    # Define environment name
    env_name = "HumanoidStandup-v5"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Default values
    seed = 0
    start_timesteps = int(10e3)
    eval_freq = int(5e3)
    max_timesteps = int(1e6)
    expl_noise = 0.1
    batch_size = 256
    discount = 0.99
    tau = 0.005
    policy_noise = 0.2
    noise_clip = 0.5
    policy_freq = 2
    save_model = False

    # Initialize environment
    env = gym.make(env_name)
    env.reset(seed=seed)  # Set the seed during environment reset
    env.action_space.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if not os.path.exists("./runs"):
        os.makedirs("./runs")
    
    # Initialize TensorBoard writer
    writer = SummaryWriter(f"runs/{env_name}_TD3")
    # Initialize TD3 policy
    policy = TD3_Agent(state_dim=state_dim,action_dim=action_dim,max_action=max_action,hidden_layer_dim1 = 400,hidden_layer_dim2 = 300,
		learning_rate = 1e-3,
        discount=discount,
        tau=tau,
        policy_noise=policy_noise * max_action,
        noise_clip=noise_clip * max_action,
        policy_freq=policy_freq
    )

    avg_rewards = []

    # Initialize variables for training
    state, done,trunc = env.reset()[0], False,False  # Adjust reset output for gymnasium
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # Training loop
    for t in range(max_timesteps):
        episode_timesteps += 1

        # Select action
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(state))
                + np.random.normal(0, max_action * expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, trunc, _ = env.step(action)
        #print(next_state)
        done_bool = float(done or trunc) if episode_timesteps < env.spec.max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train policy
        if t >= start_timesteps:
            #torch.cuda.synchronize()
            policy.train(replay_buffer, batch_size)
            #torch.cuda.synchronize()
        
        writer.add_scalar("Reward/Timestep", episode_reward, t)
        # print("Allocated:",torch.cuda.memory_allocated())
        # print("Reserved",torch.cuda.memory_reserved())
        
        if done or trunc:
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            avg_rewards.append(episode_reward)
            state, done, trunc = env.reset()[0], False, False  # Adjust reset output for gymnasium
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            if episode_num %5 == 0:
                writer.add_scalar("Reward/Episode", sum(avg_rewards)/len(avg_rewards), t)
                avg_rewards = []
    writer.close()

Total T: 1000 Episode Num: 1 Episode T: 1000 Reward: 32647.174
Total T: 2000 Episode Num: 2 Episode T: 1000 Reward: 30947.641
Total T: 3000 Episode Num: 3 Episode T: 1000 Reward: 33592.958
Total T: 4000 Episode Num: 4 Episode T: 1000 Reward: 33440.756
Total T: 5000 Episode Num: 5 Episode T: 1000 Reward: 32164.892
Total T: 6000 Episode Num: 6 Episode T: 1000 Reward: 30659.192
Total T: 7000 Episode Num: 7 Episode T: 1000 Reward: 32128.532
Total T: 8000 Episode Num: 8 Episode T: 1000 Reward: 37270.897
Total T: 9000 Episode Num: 9 Episode T: 1000 Reward: 32652.963
Total T: 10000 Episode Num: 10 Episode T: 1000 Reward: 32275.133
Total T: 11000 Episode Num: 11 Episode T: 1000 Reward: 70304.228
Total T: 12000 Episode Num: 12 Episode T: 1000 Reward: 66152.244
Total T: 13000 Episode Num: 13 Episode T: 1000 Reward: 63352.792
Total T: 14000 Episode Num: 14 Episode T: 1000 Reward: 49723.965
Total T: 15000 Episode Num: 15 Episode T: 1000 Reward: 51157.640
Total T: 16000 Episode Num: 16 Episode T: 1

In [None]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import os

def view_policy_and_save_video(policy, env_name, seed, video_dir="./videos/humanoid", max_steps=1000):
    # Ensure the video directory exists
    os.makedirs(video_dir, exist_ok=True)

    # Create the environment with video recording
    env = gym.make(env_name, render_mode="rgb_array")  # Use "rgb_array" mode for video
    env = RecordVideo(env, video_folder=video_dir, episode_trigger=lambda ep: True)  # Record every episode
    env.reset(seed=seed)  # Set the seed for reproducibility

    state, done = env.reset()[0], False
    total_reward = 0
    step = 0

    while not done and step < max_steps:
        # Select action using the trained policy
        action = policy.select_action(np.array(state))

        # Take the action in the environment
        state, reward, done, _, _ = env.step(action)
        total_reward += reward

        step += 1

    print(f"Total reward: {total_reward:.3f} over {step} steps")
    env.close()

    print(f"Video saved to {video_dir}")

# Call this function after training the policy
view_policy_and_save_video(policy, env_name="Hopper-v5", seed=0)
