In [1]:
import random
import torch
import numpy as np
import gym
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from torch import nn
from collections import deque

In [2]:
# Seed all three random number generators used in this notebook (PyTorch,
# NumPy and Python's `random`) with the same value for repeatable runs.
# `random.seed` is kept last: it returns None, so the cell shows no output.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

In [3]:
import os
# Ask CUDA to run kernel launches synchronously (debugging aid).
os.environ.update(CUDA_LAUNCH_BLOCKING='1')
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The training cell is guarded by `if not trained`, so True skips training.
trained = True

### Replay Memory

In [4]:
class ReplayMemory:
    """Bounded first-in/first-out store of transitions with random sampling.

    At most ``capacity`` entries are kept; once full, adding a new entry
    discards the oldest one.
    """

    def __init__(self, capacity):
        # A deque with `maxlen` drops items from the opposite end on overflow.
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward):
        """Append one transition as a 4-tuple, in the order the arguments are given.

        NOTE(review): ``Agent.store`` calls this positionally as
        ``push(state, action, reward, next_state)``, so the parameter names
        ``next_state``/``reward`` here are swapped relative to what that
        caller passes. Only the positional order ends up in the tuple.
        """
        transition = (state, action, next_state, reward)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return a list of distinct stored entries picked at random.

        If fewer than ``batch_size`` entries are stored, all of them are
        returned (in random order).
        """
        available = len(self.memory)
        n_draws = batch_size if batch_size <= available else available
        return random.sample(self.memory, n_draws)

    def __len__(self):
        """Number of entries currently stored."""
        return len(self.memory)

### Network

In [5]:
class DQN(nn.Module):
    """Fully connected Q-network: a state vector in, one value per action out.

    Architecture: Linear(state_space_dim, 512) -> ReLU -> Linear(512, 128)
    -> ReLU -> Linear(128, action_space_dim).
    """

    def __init__(self, state_space_dim, action_space_dim):
        super().__init__()
        hidden_1, hidden_2 = 512, 128
        layers = [
            nn.Linear(state_space_dim, hidden_1),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_2, action_space_dim),
        ]
        # The attribute name and layer order define the state_dict keys
        # ("linear.0.weight", ...), so they must stay as they are for saved
        # checkpoints to keep loading.
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        q_values = self.linear(x)
        return q_values

### Params

In [6]:
# Discount factor applied to the next-state value in the TD target.
GAMMA = 0.999

# Optimizer: class name looked up on `torch.optim`, and its learning rate.
OPTIM_NAME = 'Adam'
LR = 1e-3

# Number of transitions requested from the replay memory per update.
BATCH_SIZE = 64

# Loss between the predicted Q(s, a) and the TD target.
loss_function = nn.SmoothL1Loss()

# Learning starts only once the replay memory holds more than this many entries.
min_samples_for_training = 1000

# The training loop compares this against the episode index, so the target
# network is refreshed every this-many episodes.
update_target_every_steps = 10

### Agent

In [7]:
class Agent():
    """DQN agent bundling a Gym environment, a replay memory and two Q-networks.

    ``policy_net`` is the network being optimised. ``target_net`` starts as a
    copy of it and is only used in ``learn`` to evaluate next-state values;
    keeping it in sync is left to the caller.
    """

    def __init__(self, env_name):
        """Create the environment ``env_name`` and all learning components.

        Hyper-parameters come from the module-level ``GAMMA``, ``BATCH_SIZE``,
        ``LR`` and ``OPTIM_NAME``.
        """
        self.env = gym.make(env_name)
        self.env.seed(0)
        self.n_actions = self.env.action_space.n
        self.state_space_dim = self.env.observation_space.shape[0]
        self.buffer = ReplayMemory(capacity=10000)
        self.target_net = DQN(self.state_space_dim, self.n_actions)
        self.policy_net = DQN(self.state_space_dim, self.n_actions)
        # Both networks start from identical weights.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.gamma = GAMMA
        self.batch_size = BATCH_SIZE
        self.lr = LR
        self.optim_name = OPTIM_NAME
        # Resolve the optimizer class by name, e.g. torch.optim.Adam.
        optimizer_cls = getattr(torch.optim, self.optim_name)
        self.optimizer = optimizer_cls(self.policy_net.parameters(), lr=self.lr)

    def reset(self):
        """Reset the environment and return whatever ``env.reset()`` returns."""
        state = self.env.reset()
        return state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        ``env.step`` is unpacked into four values; the fourth is discarded.
        """
        next_state, reward, done, _ = self.env.step(action)
        return next_state, reward, done

    def store(self, state, action, reward, next_state):
        """Save one transition in the replay memory.

        The arguments are passed positionally so the stored tuple is
        ``(state, action, reward, next_state)`` -- the layout ``learn`` indexes.
        A terminal transition is marked by ``next_state is None``.
        """
        self.buffer.push(state, action, reward, next_state)

    def choose_action(self, state, epsilon):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``epsilon`` a random action *other than* the greedy
        one is returned; otherwise the greedy action (argmax of the policy
        network output) is returned. If the greedy action is the only action
        available, it is always returned.

        Raises:
            ValueError: if ``epsilon`` is outside ``[0, 1]``.
        """
        if epsilon > 1 or epsilon < 0:
            raise ValueError('The epsilon value must be between 0 and 1')

        # Evaluate the network output from the current state
        with torch.no_grad():
            self.policy_net.eval()
            state = torch.tensor(state, dtype=torch.float32)
            net_out = self.policy_net(state)

        # Greedy action: argmax of the network output
        best_action = int(net_out.argmax())

        # Explore with probability epsilon
        if random.random() < epsilon:
            non_optimal_actions = [a for a in range(self.n_actions) if a != best_action]
            # With a single action there is nothing else to explore.
            if non_optimal_actions:
                return random.choice(non_optimal_actions)

        return best_action

    def learn(self, loss_fn):
        """Run one optimisation step of the policy network on a sampled batch.

        The TD target is ``reward + gamma * max_a Q_target(next_state, a)``,
        where the second term is zero for terminal transitions
        (``next_state is None``). ``loss_fn`` is applied to the predicted
        ``Q(state, action)`` and that target. Does nothing if the replay
        memory is empty.
        """
        # Sample the data from the replay memory
        batch = self.buffer.sample(self.batch_size)
        if not batch:
            return
        # The memory may return fewer than `self.batch_size` transitions, so
        # every per-sample tensor below is sized from the actual batch.
        n_samples = len(batch)

        # Create tensors for each element of the batch
        states = torch.tensor(np.array([s[0] for s in batch]), dtype=torch.float32)
        actions = torch.tensor(np.array([s[1] for s in batch]), dtype=torch.int64)
        rewards = torch.tensor(np.array([s[2] for s in batch]), dtype=torch.float32)

        # True where the transition did not end the episode
        non_final_states_mask = torch.tensor([s[3] is not None for s in batch], dtype=torch.bool)

        # Q values predicted by the policy network
        self.policy_net.train()
        q_values = self.policy_net(states)

        # Q(s_t, a) for the action actually taken
        state_action_values = q_values.gather(1, actions.unsqueeze(1))

        # Greedy next-state value from the target network; stays zero for
        # terminal transitions.
        q_next_max = torch.zeros(n_samples)
        if non_final_states_mask.any():
            non_final_next_states = torch.tensor(
                np.array([s[3] for s in batch if s[3] is not None]), dtype=torch.float32)
            with torch.no_grad():
                self.target_net.eval()
                q_next = self.target_net(non_final_next_states)
            q_next_max[non_final_states_mask] = q_next.max(dim=1)[0]

        # TD target, shaped like `state_action_values`
        target = rewards + self.gamma * q_next_max
        target = target.unsqueeze(1)

        # TD error
        error = loss_fn(state_action_values, target)

        # Optimize the model, clipping the gradient norm to 2
        self.optimizer.zero_grad()
        error.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), 2)
        self.optimizer.step()

    def render(self):
        """Render the environment."""
        self.env.render()

In [8]:
# Build the agent (environment, replay memory and networks) for LunarLander.
env_name = "LunarLander-v2"
lander = Agent(env_name=env_name)

### Exploration profile

In [None]:
### Define exploration profile
# Epsilon starts at `max_value`, decays exponentially with the iteration index
# at rate `exp_decay`, and never goes below the 0.01 floor.
max_value = 1
num_iterations = 2000
exp_decay = 0.01
exploration_profile = [
    max(max_value * np.exp(-exp_decay * i), 0.01)  # floor = minimum exploring rate
    for i in range(num_iterations)
]


### Plot exploration profile
# Epsilon as a function of the iteration index, on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(exploration_profile)
ax.grid()
ax.set_xlabel('Iteration')
ax.set_ylabel('Exploration profile (Epsilon)')

### Train

In [None]:
# Train the agent, one episode per entry of `exploration_profile`. The task
# counts as solved once the mean return over the last (up to) 100 episodes has
# been >= 200 for 100 consecutive episodes; training then stops early.
if not trained:
    returns = []    # total reward of every finished episode
    mean_ret = []   # mean of the last (up to) 100 returns, one entry per episode
    patience = 100  # consecutive "solved" episodes still required before stopping
    for num_eps, epsilon in enumerate(tqdm(exploration_profile)):
        state = lander.reset()
        score = 0
        done = False
        while not done:
            action = lander.choose_action(state, epsilon)
            next_state, reward, done = lander.step(action)
            score += reward
            if done:
                # A terminal transition is stored with next_state = None
                next_state = None
                returns.append(score)
                recent_returns = returns[-100:]
                mean_ret.append(sum(recent_returns) / len(recent_returns))
                if mean_ret[-1] >= 200:
                    patience -= 1
                else:
                    patience = 100
            lander.store(state, action, reward, next_state)
            # Update the network only once the replay memory holds enough
            # samples, otherwise training would reuse the same samples too often
            if len(lander.buffer) > min_samples_for_training:
                lander.learn(loss_function)
            state = next_state
        if patience == 0:
            # num_eps is zero-based, so the number of episodes played is num_eps + 1
            print(f"The game has been solved after {num_eps + 1} episodes")
            break
        # Update the target network every `update_target_every_steps` episodes
        if num_eps % update_target_every_steps == 0:
            print('Updating target network...')
            lander.target_net.load_state_dict(lander.policy_net.state_dict())
            #torch.save(lander.target_net.state_dict(), 'LunarLander-DQL.torch')

        # Print the final score
        print(f"EPISODE: {num_eps + 1} - FINAL SCORE: {score} - Epsilon: {epsilon}")

    # Plot the results: running mean in red, per-episode returns in blue
    plt.figure(figsize=(12,8))
    plt.plot(mean_ret, 'r')
    plt.plot(returns, 'b', alpha = 0.3)
    plt.ylabel('Score', fontsize=18)
    plt.xlabel('Episode', fontsize=18)
    plt.savefig("Score_lunar.pdf", format='pdf')

### Test

In [9]:
# Build a fresh agent and load the saved weights into its policy network.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
env_name = 'LunarLander-v2'
lander = Agent(env_name)
lander.policy_net.load_state_dict(torch.load('LunarLander-DQL.torch', map_location=device))
returns = []
num_test_episodes = 100

try:
    for num_episode in range(num_test_episodes):
        # Reset the environment and get the initial state
        state = lander.reset()
        # The score is the sum of the rewards collected during the episode
        score = 0
        done = False
        # Go on until the environment reports the end of the episode
        while not done:
            # Choose the best action (epsilon 0)
            action = lander.choose_action(state, epsilon=0)
            # Apply the action and get the next state, the reward and a flag
            # "done" that is True if the episode has ended
            next_state, reward, done = lander.step(action)
            # Visually render the environment
            lander.render()
            # Accumulate the reward
            score += reward
            # Set the current state for the next iteration
            state = next_state
        # Record and print the final score
        returns.append(score)
        print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score}")
finally:
    # Release the render window even if the run is interrupted
    lander.env.close()
print(f"MEAN SCORE OVER {len(returns)} EPISODES IS: {np.mean(returns)}")



EPISODE 1 - FINAL SCORE: 248.9517496563277
EPISODE 2 - FINAL SCORE: 282.9224762847525
EPISODE 3 - FINAL SCORE: 124.70022897603037


KeyboardInterrupt: 