## Importing Libraries

In [1]:
import gym
import random
import os
import pandas as pd
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

## display The Lunar Lander problem 

In [2]:
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range(1000):
   action = env.action_space.sample()
   observation, reward, terminated, truncated, info = env.step(action)

   if terminated or truncated:
      observation, info = env.reset()

env.close()

  if not isinstance(terminated, (bool, np.bool8)):


## Creating the agent class

In [3]:
env = gym.make("LunarLander-v2")

SEED = 0

def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seeds(SEED)

In [6]:
class DQLAgent:
    def __init__(self, env, gamma=0.99):
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_decay = 0.998
        self.epsilon_min = 0.01
        self.tot_reward = []
        self.batch_size = 64
        self.memory = deque(maxlen=500000)
        self.osn = env.observation_space.shape[0]
        self.opt = Adam(learning_rate=0.001)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(64, input_dim=self.osn, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=self.opt)
        return model

    def act(self, state):
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state, verbose=0)
        return np.argmax(action[0])

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay_batch(self):
        batch = random.sample(self.memory, self.batch_size)

        state = np.squeeze(np.array([i[0] for i in batch]))
        action = np.array([i[1] for i in batch])
        reward = np.array([i[2] for i in batch])
        next_state = np.squeeze(np.array([i[3] for i in batch]))
        done = np.array([i[4] for i in batch])

        q_val = reward + self.gamma * np.amax(self.model.predict_on_batch(next_state), axis=1) * (1 - done)
        target = self.model.predict_on_batch(state)
        idx = np.arange(self.batch_size)
        target[[idx], [action]] = q_val

        self.model.fit(state, target, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state = env.reset()
            state = np.reshape(np.array(state).flatten(), [1, self.osn])
            t_reward = 0
            max_steps = 1000
            for t in range(max_steps):
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(np.array(next_state).flatten(), [1, self.osn])
                t_reward += reward
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break
            self.replay()
            if e % 100 == 0:
                print(f"Episode: {e}, Total reward: {t_reward}")
        self.save_model('LunarLanderDQL.h5')

    def save_model(self, name):
        self.model.save(name)

In [7]:
agent = DQLAgent(env)
episodes = 400
agent.learn(episodes)
agent.save_model('LunarLanderDQL.h5')

df = pd.DataFrame(np.array(agent.tot_reward), columns=['total_reward'])
df['total_reward_50episodes_moving_average'] = df['total_reward'].rolling(window=50).mean()
df.plot(title='Total reward per episode')
plt.xlabel('Episode', fontsize=12)
plt.show()

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.