# Lunar Lander with Cross-Entropy Method

In this notebook we look at the lunar lander environment and solve it with the cross-entropy method.

In [1]:
#!pip3 install 'gymnasium[box2d]'

In [2]:
import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
%matplotlib inline
from collections import deque

torch.manual_seed(1)
np.random.seed(1)

# Neural Network

We define a simple neural network that generates the action scores based on a given state.

In [5]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Generate Episodes

We generate a batch of episodes and remember the traversed states, actions and rewards. To select the next action we use the output of the network. For this we first pass the scores through a softmax to get probabilites. In the second step we sampel from this distribution to get the next action to execute.

In [14]:
def generate_batch(env, batch_size, t_max=5000):
    
    
    batch_actions,batch_states, batch_rewards = [],[],[]

    
    for b in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s, _ = env.reset(seed=0)
        for t in range(t_max):
            
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(net(s_v))
            act_probs = act_probs_v.data.numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, _, _ = env.step(a)

            # record sessions like you did before
            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break

            rewards_with_idx = []    
            for (idx, item) in enumerate(batch_rewards):
                rewards_with_idx.append((item, idx))
            
            rewards_with_idx.sort(key=lambda r: r[0], reverse=True)
            top20 = [item[1] for item in rewards_with_idx[:20]]


            batch_states = [x for (idx, x) in enumerate(batch_states) if idx in top20]
            batch_actions = [x for (idx, x) in enumerate(batch_actions) if idx in top20]
            batch_rewards = [x for (idx, x) in enumerate(batch_rewards) if idx in top20]
                
    

    return batch_states, batch_actions, batch_rewards

# Training

In the training step, we first use the neural network to generate a batch of episodes and then use the state-action pairs to improve the neural network.

In [15]:
batch_size = 100
session_size = 100
hidden_size = 200
completion_score = 100
learning_rate = 0.01

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(n_states, hidden_size, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    
    # generate new episodes
    states, actions, rewards = generate_batch(env, batch_size, t_max=500)
    
    
    # train on the states using actions as targets
    for s_i in range(len(states)):
            
        optimizer.zero_grad()
        tensor_states = torch.FloatTensor(states[s_i])
        tensor_actions = torch.LongTensor(actions[s_i])
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()

    #show results
    mean_reward = np.mean(rewards)
    threshold = 0.1
    print("%d: loss=%.3f, reward_mean=%.1f, threshold=%.1f" % (
            i, loss_v.item(), mean_reward, threshold))
    
    #check if 
    if np.mean(rewards)> completion_score:
        print("Environment has been successfullly completed!")
        break

0: loss=1.364, reward_mean=3.2, threshold=0.1
1: loss=1.405, reward_mean=-23.3, threshold=0.1
2: loss=1.407, reward_mean=20.6, threshold=0.1
3: loss=1.348, reward_mean=7.8, threshold=0.1
4: loss=1.307, reward_mean=28.7, threshold=0.1
5: loss=1.327, reward_mean=26.6, threshold=0.1
6: loss=1.374, reward_mean=31.0, threshold=0.1
7: loss=1.231, reward_mean=29.8, threshold=0.1
8: loss=1.339, reward_mean=-0.5, threshold=0.1
9: loss=1.378, reward_mean=41.3, threshold=0.1
10: loss=1.297, reward_mean=25.0, threshold=0.1
11: loss=1.361, reward_mean=39.3, threshold=0.1
12: loss=1.357, reward_mean=33.5, threshold=0.1
13: loss=1.337, reward_mean=37.6, threshold=0.1
14: loss=1.328, reward_mean=49.0, threshold=0.1
15: loss=1.282, reward_mean=56.8, threshold=0.1
16: loss=1.351, reward_mean=53.9, threshold=0.1
17: loss=1.282, reward_mean=42.4, threshold=0.1
18: loss=1.327, reward_mean=52.7, threshold=0.1
19: loss=1.341, reward_mean=59.9, threshold=0.1
20: loss=1.405, reward_mean=54.6, threshold=0.1
21:

In [16]:
!pip install moviepy
!pip install ffmpeg --upgrade



In [None]:
env = gym.make('LunarLander-v2', render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, "video")

state, _ = env.reset()
total_reward = 0.0
done = False
action
tensor_states = torch.FloatTensor([state])
action_scores_v = net(tensor_states)
while not done:
        tensor_states = torch.FloatTensor([state])
        action_scores_v = net(tensor_states)

         act_probs_v = activation(net(s_v))
        new_s, r, done, _, _ = env.step(a)
        
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        action_values = dqn_agent.q_network(state)
        action = np.argmax(action_values.cpu().data.numpy())

        state, reward, done, _, _ = env.step(action)
        total_reward += reward

env.close()
print(f"Total reward: {total_reward}")