In [1]:
# !zip -r ./td3_original td3_original
# !pwd

In [2]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import time
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action, action_scale, action_add):
        super(Actor, self).__init__()
        self.action_scale = torch.FloatTensor(action_scale).to(device)
        self.action_add = torch.FloatTensor(action_add).to(device)

        self.l1 = nn.Linear(state_dim, 16)
        self.l2 = nn.Linear(16, 32)
        self.l3 = nn.Linear(32, 16)
        self.l4 = nn.Linear(16, action_dim)

        self.max_action = max_action


    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        a = F.relu(self.l3(a))
        a = self.l4(a)
        a = self.action_scale * torch.tanh(a) + self.action_add

        return a


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 16)
        self.l2 = nn.Linear(16, 32)
        self.l3 = nn.Linear(32, 16)
        self.l4 = nn.Linear(16, 1)

        # Q2 architecture
        self.l5 = nn.Linear(state_dim + action_dim, 16)
        self.l6 = nn.Linear(16, 32)
        self.l7 = nn.Linear(32, 16)
        self.l8 = nn.Linear(16, 1)


    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        # print(sa.shape)
        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = F.relu(self.l3(q1))
        q1 = self.l4(q1)


        q2 = F.relu(self.l5(sa))
        q2 = F.relu(self.l6(q2))
        q2 = F.relu(self.l7(q2))
        q2 = self.l8(q2)
        return q1, q2


    def Q1(self, state, action):
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = F.relu(self.l3(q1))
        q1 = self.l4(q1)
        return q1

class TD3(object):
    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        action_scale, action_add,
        discount=0.99,
        tau=0.001,
        policy_noise=0.0,
        noise_clip=0.1,
        policy_freq=1
    ):
        self.actor = Actor(state_dim, action_dim, max_action, action_scale, action_add).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0


    def select_action(self, state):
        state = torch.FloatTensor(state).to(device)
        action = self.actor(state).cpu().data.numpy()
        return action


    def train(self, replay_buffer, batch_size=256):
        self.total_it += 1

        # Sample replay buffer
        # tic = time.time()
        batch = random.sample(replay_buffer, batch_size)
        state, action, reward, next_state, not_done = zip(*batch)
        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).to(device)
        not_done = torch.FloatTensor(not_done).to(device)
        not_done = not_done.reshape([not_done.shape[0],1])
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward.reshape(target_Q.shape) + not_done * self.discount * target_Q
        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor losse
            actions = self.actor(state)
            actor_loss = -self.critic.Q1(state, actions).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            return actor_loss.item(), critic_loss.item()/len(batch)
        return 0, critic_loss.item()/len(batch)

    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic.pt")
        torch.save(self.actor.state_dict(), filename + "_actor.pt")


    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic.pt",map_location=torch.device('cpu')))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor.pt", map_location=torch.device('cpu')))
        self.actor_target = copy.deepcopy(self.actor)


cpu


In [3]:
import numpy as np
import torch
from collections import deque
from copy import deepcopy
# from TD3 import *

# device = torch.device('cuda')
device = torch.device('cpu')

class Agent():
    def __init__(self, state_dim, action_dim, max_action = 2, action_scale = [2], action_add=[0], batch_size = 4096) -> None:
        self.TD = TD3(state_dim, action_dim, max_action, action_scale, action_add)
        self.steps_done = 0
        self.memory = deque(maxlen=100000)
        self.batch_size = batch_size

    def memorize(self, state, action, reward, next_state, not_done):
        self.memory.append([state, action, reward, next_state, not_done])

    def learn(self):
        if len(self.memory) < self.batch_size:
            return 0, 0
        err_actor, err_critic = self.TD.train(self.memory, self.batch_size)
        return err_actor, err_critic

    def act(self, state):
        action = self.TD.select_action(state)
        return action

In [4]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# from Agent import Agent
import gym
import numpy as np
import matplotlib.pyplot as plt
import wandb
import os

class SimpleRunner:
    def __init__(self, num_episodes=10000, learn_every=50, noise_variance=0.1, viz=False, test_every=10) -> None:
        self.env = gym.make('Pendulum-v1', new_step_api=True)
        self.num_episodes = num_episodes
        self.agent = Agent(self.env.observation_space.shape[0], self.env.action_space.shape[0])
        self.learn_every = learn_every
        self.learn_every=learn_every
        self.noise_variance = noise_variance
        self.viz = viz
        self.test_every = test_every
        self.train_target_state = np.array([np.cos(0.0 * np.pi/180), np.sin(0.0 * np.pi/180), 0.0])
        self.test_target_state = np.array([np.cos(170.0 * np.pi/180), np.sin(170.0 * np.pi/180), 0.0])
        self.test_init_state = np.array([np.cos(0.0 * np.pi/180), np.sin(0.0 * np.pi/180), 0.0])

        # self.theta_desired = np.arctan2(self.target_state[1], self.target_state[0])  # Desired angle
        # self.theta_dot_desired = self.target_state[2]  # Desired angular velocity

        wandb.config = {
            "num_episodes": num_episodes,
            "learn_every": learn_every,
            "noise_variance": noise_variance,
            "test_every": test_every,
        }
        wandb.init(project="Pendulum-TD3", entity="shivanichepuri", config = wandb.config)

    def run(self):
        steps=0
        K = 1
        w1 = 0.5
        w2 = 0.5
        dt = 0.05   # env dt
        for episode in range(self.num_episodes):
            state = self.env.reset()
            done = False
            episode_reward = 0
            episode_e = 0
            episode_env_reward = 0
            e_init = state - self.target_state
            while not done:
                action = self.agent.act(state) + np.random.normal(0, self.noise_variance, self.env.action_space.shape)
                next_state, reward_env, done, done_, _ = self.env.step(action)

                e = next_state - self.target_state
                e_dot = (e - e_init)/dt
                reward = np.linalg.norm(e_dot + K*e)**2 
                # - w2*(float(action))**2

                done = done or done_
                self.agent.memorize(state, action, reward, next_state, not done)
                state = next_state
                if steps % self.learn_every == 0:
                    err_actor, err_critic = self.agent.learn()
                steps = (steps + 1) % self.learn_every
                if self.viz:
                    self.env.render()

                episode_reward += reward
                episode_e += np.linalg.norm(e)
                episode_env_reward += reward_env
                e_init = e
            print(f"Episode: {episode}, Reward: {episode_reward}, EpisodeError: {episode_e}, EnvReward: {episode_env_reward}")

            wandb.log({"Episode": episode, "Reward": episode_reward,
                       "Actor Loss": err_actor, "Critic Loss": err_critic,
                       "Episode Error": episode_e, "Env Reward": episode_env_reward})
            if episode % 1000 == 0:
                self.agent.TD.save(f"checkpoints/{episode}")

            if episode % self.test_every == 0:
                self.test(episode)

        self.env.close()

    def test(self, episode):
        state = self.env.reset()
        done = False
        episode_reward = 0
        K = 1
        w1 = 0.5
        w2 = 0.5
        dt = 0.05   # env dt
        episode_e = 0
        episode_env_reward = 0
        e_init = state - self.target_state

        while not done:
            action = self.agent.act(state)
            next_state, reward_env, done, done_, _ = self.env.step(action)

            e = next_state - self.target_state
            e_dot = (e - e_init)/dt
            reward = -w1*np.linalg.norm(e_dot + K*e)**2 - w2*(float(action))**2

            done = done or done_
            state = next_state
            if self.viz:
                self.env.render()
            episode_reward += reward
            episode_e += np.linalg.norm(e)
            episode_env_reward += reward_env
            e_init = e

        self.env.close()
        print(f"Test Episode: {episode}, Reward: {episode_reward}, Episode Error: {episode_e}, Env Reward: {episode_env_reward}")
        wandb.log({"Test Episode": episode, "Test Reward": episode_reward,
                   "Test Episode Error": episode_e, "Test Env Reward": episode_env_reward})

if __name__ == "__main__":
    runner = SimpleRunner(viz = False)
    runner.run()

  return LooseVersion(v) >= LooseVersion(check)
[34m[1mwandb[0m: Currently logged in as: [33mshivanichepuri[0m. Use [1m`wandb login --relogin`[0m to force relogin


Episode: 0, Reward: -17856.899041218887, EpisodeError: 1057.423503354811, EnvReward: -1123.3669941591024
1.4292553709862976
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
1.981047050345311
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
2.6066695192282516
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
3.3014025755926983
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
4.060501856145138
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
4.869805944383908
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
5.698716107099801
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
6.495138602329777
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
7.185891892619935
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
7.68795230608216
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
7.930295765191263
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
7.879569860383765
norm_e

  state = torch.FloatTensor(state).to(device)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
4.675263198392752
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
4.500735470230305
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
4.188161089891439
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
3.8230812784696386
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
3.4589237323495046
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
3.1467192257213226
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
2.867919861534274
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
2.601149358419264
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
2.345387973923201
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
2.110773692936915
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
1.9198945603004185
norm_e, ep_e <class 'numpy.float64'> <class 'numpy.float64'>
1