In [None]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [18]:
class DeepQNetwork(nn.Module):
  def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
    super(DeepQNetwork,self).__init__()
    self.input_dims = input_dims
    self.lr = lr
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.n_actions = n_actions

    self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
    self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
    self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)

    self.optimizer = optim.Adam(self.parameters(), lr = lr)
    self.loss = nn.MSELoss()
    self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self, state):
    x = F.relu(self.fc1(state))
    x = F.relu(self.fc2(x))
    actions = self.fc3(x)
    return actions

class Agent():
  def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, max_mem_size=100000, eps_end=0.01, eps_dec = 5e-4):
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_min = eps_end
    self.eps_dec = eps_dec
    self.lr = lr
    self.action_space = [i for i in range(n_actions)]
    self.mem_size = max_mem_size
    self.batch_size = batch_size
    self.mem_cntr = 0

    self.Q_eval = DeepQNetwork(self.lr,
                               input_dims=input_dims,
                               fc1_dims = 256,
                               fc2_dims = 256,
                               n_actions=n_actions)

    self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
    self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.reward_memory = np.zeros(self.mem_size, dtype = np.float32)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

  def store_transition(self, state, action, reward, state_, done):
    index = self.mem_cntr % self.mem_size
    self.state_memory[index] = state
    self.new_state_memory[index] = state_
    self.reward_memory[index] = reward
    self.action_memory[index] = action
    self.terminal_memory[index] = done

    self.mem_cntr += 1

  def choose_action(self, observation):
    if np.random.random()>self.epsilon:
      state = T.tensor([observation]).to(self.Q_eval.device)
      actions = self.Q_eval.forward(state)
      action = T.argmax(actions).item()
    else:
      action = np.random.choice(self.action_space)
    return action

  def learn(self):
    if self.mem_cntr<self.batch_size:
      return

    self.Q_eval.optimizer.zero_grad()
    max_mem = min(self.mem_cntr, self.mem_size)
    batch = np.random.choice(max_mem, self.batch_size, replace = False)
    batch_index = np.arange(self.batch_size, dtype=np.int32)

    state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
    new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
    reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
    terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)

    action_batch = self.action_memory[batch]

    q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
    q_next = self.Q_eval.forward(new_state_batch)
    q_next[terminal_batch] = 0.0

    q_target = reward_batch + self.gamma + T.max(q_next, dim=1)[0]
    loss  = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
    loss.backward()
    self.Q_eval.optimizer.step()

    if self.epsilon>self.eps_min:
      self.epsilon -= self.eps_dec
    else:
      self.epsilon = self.eps_dec





In [3]:
import gym

In [None]:
!pip install box2d

In [10]:
env = gym.make('LunarLander-v2')

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  deprecation(
  deprecation(


In [19]:
agent = Agent(gamma = 0.99, epsilon = 1.0, batch_size = 64, n_actions = 4, eps_end = 0.01, input_dims = [8], lr = 0.03 )
scores, eps_history = [],[]
n_games = 500
for i in range(n_games):
  score = 0
  done = False
  observation = env.reset()
  while not done:
    action = agent.choose_action(observation)
    observation_, reward, done, info = env.step(action)
    score += reward
    agent.store_transition(observation, action, reward, observation_, done)
    agent.learn()
    observation = observation_
  scores.append(score)
  eps_history.append(agent.epsilon)

  avg_score = np.mean(scores[-100:])
  print("episode: ", i, "score: %.2f" % score,"average score: %.2f" % avg_score, "epsilom: %.2f" %agent.epsilon )

  if not isinstance(terminated, (bool, np.bool8)):


episode:  0 score: -120.63 average score: -120.63 epsilom: 0.99


  state = T.tensor([observation]).to(self.Q_eval.device)


episode:  1 score: -105.03 average score: -112.83 epsilom: 0.96
episode:  2 score: -73.16 average score: -99.61 epsilom: 0.91
episode:  3 score: -100.51 average score: -99.83 epsilom: 0.88
episode:  4 score: -85.36 average score: -96.94 epsilom: 0.83
episode:  5 score: -128.75 average score: -102.24 epsilom: 0.77
episode:  6 score: -405.33 average score: -145.54 epsilom: 0.73
episode:  7 score: -86.40 average score: -138.15 epsilom: 0.66
episode:  8 score: -272.45 average score: -153.07 epsilom: 0.60
episode:  9 score: -35.53 average score: -141.32 epsilom: 0.57
episode:  10 score: -356.71 average score: -160.90 epsilom: 0.49
episode:  11 score: -224.81 average score: -166.22 epsilom: 0.43
episode:  12 score: -555.18 average score: -196.14 epsilom: 0.32
episode:  13 score: -82.58 average score: -188.03 epsilom: 0.29
episode:  14 score: 4.26 average score: -175.21 epsilom: 0.22
episode:  15 score: -18.32 average score: -165.41 epsilom: 0.16
episode:  16 score: -626.20 average score: -19

In [20]:
T.save(agent.Q_eval.state_dict(), 'dqn_model.pth')

In [None]:
T.save(agent, 'dqn_agent.pth')

In [None]:
agent = T.load('dqn_agent.pth')
agent.Q_eval.eval()

In [26]:
import matplotlib.pyplot as plt

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
env = gym.make('LunarLander-v2')
observation = env.reset()
done = False
agent.epsilon = 0.01
plt.ion()
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(np.zeros((400, 600, 3), dtype=np.uint8))  #

while not done:
    # Predict the action from the trained model
    action = agent.choose_action(observation)
    observation_, reward, done, info = env.step(action)
    frame = env.render(mode='rgb_array')
    im.set_data(frame)
    plt.draw()
    plt.pause(0.1)
    plt.show()
    observation = observation_
env.close()
plt.ioff()
plt.show()
