In [3]:
!pip3 uninstall gym --yes
!pip3 install swig
!pip3 install 'gym[box2d]'

Found existing installation: gym 0.26.2
Uninstalling gym-0.26.2:
  Successfully uninstalled gym-0.26.2
Collecting gym[box2d]
  Using cached gym-0.26.2-py3-none-any.whl
Installing collected packages: gym
Successfully installed gym-0.26.2


In [4]:
import numpy as np
import gym
from collections import deque
import random

# Ornstein-Ulhenbeck Process
# Taken from #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py
class OUNoise(object):
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)

class NormalNoiseProcess:
    def __init__(self, action_space, var, decay, min_sigma):
        action_shape = action_space.shape

        self.mean   = np.zeros(action_shape)
        self.sigma = var
        self.sigma_decay = decay
        self.min_sigma = min_sigma

    def sample(self):
        return np.random.normal(loc = self.mean, scale=self.sigma, size=self.mean.shape)

    def decay(self):
        self.sigma = max(self.min_sigma, self.sigma - self.sigma_decay)

# https://github.com/openai/gym/blob/master/gym/core.py
class NormalizedEnv(gym.ActionWrapper):
    """ Wrap action """

    def action(self, action):
        act_k = (self.action_space.high - self.action_space.low)/ 2.
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k * action + act_b



class Memory:
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        experience = (state, action, np.array([reward]), next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        state_batch = []
        action_batch = []
        reward_batch = []
        next_state_batch = []
        done_batch = []

        batch = random.sample(self.buffer, batch_size)

        for experience in batch:
            state, action, reward, next_state, done = experience
            state_batch.append(state)
            action_batch.append(action)
            reward_batch.append(reward)
            next_state_batch.append(next_state)
            done_batch.append(done)

        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        return len(self.buffer)

In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd
from torch.autograd import Variable

class Critic(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Critic, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        f1 = 1./np.sqrt(self.linear1.weight.data.size()[0])
        nn.init.uniform_(self.linear1.weight.data, -f1, f1)
        nn.init.uniform_(self.linear1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        f2 = 1./np.sqrt(self.linear2.weight.data.size()[0])
        nn.init.uniform_(self.linear2.weight.data, -f2, f2)
        nn.init.uniform_(self.linear2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        f3 = 1./np.sqrt(self.linear3.weight.data.size()[0])
        nn.init.uniform_(self.linear3.weight.data, -f3, f3)
        nn.init.uniform_(self.linear3.bias.data, -f3, f3)
        self.bn3 = nn.LayerNorm(hidden_size)
        self.linear4 = nn.Linear(hidden_size, output_size)
        f4 = 1./np.sqrt(self.linear4.weight.data.size()[0])
        nn.init.uniform_(self.linear4.weight.data, -f4, f4)
        nn.init.uniform_(self.linear4.bias.data, -f4, f4)

    def forward(self, state, action):
        """
        Params state and actions are torch tensors
        """
        x = torch.cat([state, action], 1)
        x = self.bn1(F.relu(self.linear1(x)))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)

        return x

class Actor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        f1 = 1./np.sqrt(self.linear1.weight.data.size()[0])
        nn.init.uniform_(self.linear1.weight.data, -f1, f1)
        nn.init.uniform_(self.linear1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        f2 = 1./np.sqrt(self.linear2.weight.data.size()[0])
        nn.init.uniform_(self.linear2.weight.data, -f2, f2)
        nn.init.uniform_(self.linear2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        f3 = 1./np.sqrt(self.linear3.weight.data.size()[0])
        nn.init.uniform_(self.linear3.weight.data, -f3, f3)
        nn.init.uniform_(self.linear3.bias.data, -f3, f3)

    def forward(self, state):
        """
        Param state is a torch tensor
        """
        x = self.bn1(F.relu(self.linear1(state)))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))
        return x

In [64]:
import torch
import torch.autograd
import torch.optim as optim
import torch.nn as nn
# from model import *
# from utils import *
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device",device)

class DDPGagent:
    def __init__(self, env, hidden_size=256, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, max_memory_size=50000):
        # Params
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau

        # Networks
        self.actor = Actor(self.num_states, hidden_size, self.num_actions).to(device)
        self.actor_target = Actor(self.num_states, hidden_size, self.num_actions).to(device)
        self.critic = Critic(self.num_states + self.num_actions, hidden_size, self.num_actions).to(device)
        self.critic_target = Critic(self.num_states + self.num_actions, hidden_size, self.num_actions).to(device)


        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)
            target_param.requires_grad = False

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
            target_param.requires_grad = False

        # Training
        self.memory = Memory(max_memory_size)
        self.critic_criterion  = nn.MSELoss()
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=actor_learning_rate, step_size=1000, gamma=0.9, lr=1e-4, weight_decay=1e-5)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate, step_size=1000, gamma=0.9, lr=1e-3, weight_decay=1e-5)

    def get_action(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0)).to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()[0,0]
        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.memory.sample(batch_size)
        states = torch.FloatTensor(states).to(device)
        actions = torch.FloatTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).to(device)

        # Implement critic loss and update critic
        with torch.no_grad():
          Q_target = rewards + self.gamma * self.critic_target(next_states,self.actor_target(next_states))

        Q_current = self.critic(states,actions)
        critic_loss = self.critic_criterion(Q_target,Q_current)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()


        # Implement actor loss and update actor

        # We don't want to update critic in actor update, hence saving some computation
        for parameters in self.critic.parameters():
            parameters.requires_grad = False

        actor_loss = -self.critic(states,self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for parameters in self.critic.parameters():
            parameters.requires_grad = True

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)

device cpu


In [65]:
import sys
import gym
from gym.envs import box2d
import numpy as np
import matplotlib.pyplot as plt

env = NormalizedEnv(gym.make("LunarLander-v2", continuous= True))
#  hidden_size=256, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, max_memory_size=50000
# hidden_size=512, actor_learning_rate=5e-8, critic_learning_rate=5e-6, gamma=0.98, tau=5e-2
agent = DDPGagent(env, hidden_size=512, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-1, max_memory_size=50000)
noise = OUNoise(env.action_space)
batch_size = 64
rewards = []
avg_rewards = []

for episode in range(10000):
    state, _ = env.reset()
    noise.reset()
    episode_reward = 0

    while True:
        action = agent.get_action(state)
        #Add noise to action

        action = noise.get_action(action)
        new_state, reward, done, info, other = env.step(action)
        agent.memory.push(state, action, reward, new_state, done)

        if len(agent.memory) > batch_size:
            agent.update(batch_size)

        state = new_state
        episode_reward += reward

        if done:
            sys.stdout.write("episode: {}, reward: {}, average _reward: {} \n".format(episode, np.round(episode_reward, decimals=2), np.mean(rewards[-10:])))
            torch.save(agent.actor.state_dict(), 'weights/ddpg.pth')
            break

    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

episode: 0, reward: -273.76, average _reward: nan 
episode: 1, reward: -612.91, average _reward: -273.7580365963993 
episode: 2, reward: -383.87, average _reward: -443.33211943995286 
episode: 3, reward: -100.52, average _reward: -423.5120641800716 
episode: 4, reward: -178.24, average _reward: -342.76425663592363 
episode: 5, reward: -176.42, average _reward: -309.85884041766656 
episode: 6, reward: -207.82, average _reward: -287.6188316533285 
episode: 7, reward: -99.57, average _reward: -276.2184686594823 
episode: 8, reward: -268.38, average _reward: -254.13775280738338 
episode: 9, reward: -226.93, average _reward: -255.7198905648151 
episode: 10, reward: -272.49, average _reward: -252.8407901782738 
episode: 11, reward: -249.28, average _reward: -252.71430575924904 
episode: 12, reward: -169.09, average _reward: -216.35208328580688 
episode: 13, reward: -420.44, average _reward: -194.87369066675615 
episode: 14, reward: -322.61, average _reward: -226.86541432091076 
episode: 15, 

Your Infrence

In [15]:
env = NormalizedEnv(gym.make("LunarLander-v2", continuous= True,render_mode='human'))
# agent = DDPGagent(env)
# agent.actor.load_state_dict(torch.load('weights/ddpg.pth', map_location=torch.device(device)))
agent.actor.eval()

rewards = []
avg_rewards = []

for episode in range(10):
    state, _ = env.reset()
    noise.reset()
    episode_reward = 0
    while True:
        action = env.action_space.sample()
        new_state, reward, done, info, other = env.step(action)
        state = new_state
        episode_reward += reward
        env.render()
        if done:
            sys.stdout.write("episode: {}, reward: {}, average _reward: {} \n".format(episode, np.round(episode_reward, decimals=2), np.mean(rewards[-100:])))
            break

    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))
env.close()
plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode: 0, reward: -166.07, average _reward: nan 
episode: 1, reward: -165.76, average _reward: -166.06975625048614 
episode: 2, reward: -187.01, average _reward: -165.91600317439983 
episode: 3, reward: -280.34, average _reward: -172.94886991819658 
episode: 4, reward: -508.02, average _reward: -199.7970432075702 
episode: 5, reward: -98.85, average _reward: -261.4412221522504 
episode: 6, reward: -309.97, average _reward: -234.34259216181795 
episode: 7, reward: -122.83, average _reward: -245.14645033970524 


KeyboardInterrupt: 

In [None]:
torch.save(agent.actor.state_dict(), 'ddpg.pth')