The setting is the same as in Problem 2. However, you have to implement a neural-network-based
Q-learning solution rather than a policy-gradient solution. You are required to experiment with
only the CartPole-v1 environment. Answer all the questions that were asked in Problem 2 for this
setting. The plot must be named "bonus.png".

In [1]:
#!/usr/bin/env python3
import gym
import ptan
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

Defining the hyperparameters.

In [2]:
# Discount factor: used in the Bellman target and passed to the experience source.
GAMMA = 0.99
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of transitions sampled per training step; training also waits
# until the replay buffer holds at least this many transitions.
BATCH_SIZE = 8

# Epsilon-greedy exploration schedule. The training loop sets
# epsilon = max(EPSILON_STOP, EPSILON_START - step_idx / EPSILON_STEPS),
# i.e. a linear decay from EPSILON_START that is clamped at EPSILON_STOP.
EPSILON_START = 1.0
EPSILON_STOP = 0.02
EPSILON_STEPS = 5000

# Size argument passed to ptan's ExperienceReplayBuffer.
REPLAY_BUFFER = 50000

Defining the neural network for Q-learning.

In [3]:
class DQN(nn.Module):
    """Fully connected Q-network with one 128-unit hidden layer.

    The network maps an observation vector of length ``input_size`` to
    ``n_actions`` outputs, one per action.
    """

    def __init__(self, input_size, n_actions):
        """Build the layer stack: Linear -> ReLU -> Linear.

        Args:
            input_size: number of features in one observation.
            n_actions: number of outputs produced by the final layer.
        """
        super().__init__()

        hidden_units = 128
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        # Keep the attribute name `net` so state-dict keys stay `net.<idx>.*`.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        q_values = self.net(x)
        return q_values

Calculating the target Q-value based on the best Q-value of the next state. The net returns the action
values of all actions in the action space. If the episode is done, we return the local reward; otherwise we apply the Bellman update.

In [4]:
def calc_target(net, local_reward, next_state, gamma=None):
    """Return the one-step Q-learning target for a single transition.

    Args:
        net: callable mapping a ``(1, obs_size)`` float32 tensor to a
            ``(1, n_actions)`` tensor of action values.
        local_reward: reward collected on this transition.
        next_state: observation reached after the transition, or ``None``
            when the episode ended on this transition.
        gamma: discount factor; when ``None`` the module-level ``GAMMA``
            is used.

    Returns:
        ``local_reward`` when ``next_state`` is ``None``; otherwise
        ``local_reward + gamma * max_a Q(next_state, a)`` as a Python float.
    """
    if next_state is None:
        # Terminal transition: there is no future value to bootstrap from.
        return local_reward
    if gamma is None:
        gamma = GAMMA
    # The target is used as a constant, so skip building an autograd graph.
    with torch.no_grad():
        # as_tensor + unsqueeze avoids the slow list-of-ndarrays conversion
        # while still producing a batch of size one.
        state_v = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0)
        best_q = net(state_v).max(dim=1)[0].item()
    return local_reward + gamma * best_q


In [8]:


if __name__ == "__main__":
    # Train a DQN on CartPole-v1 until the mean reward of the last 100
    # episodes exceeds the threshold below, logging progress to stdout and
    # to TensorBoard.
    env = gym.make("CartPole-v1")
    writer = SummaryWriter(comment="-cartpole-dqn")

    net = DQN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    # We again make use of the ptan library: an epsilon-greedy action
    # selector wrapped in a DQN agent, an experience source that plays the
    # environment with that agent, and a replay buffer fed by the source.
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector, preprocessor=ptan.agent.float32_preprocessor)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    replay_buffer = ptan.experience.ExperienceReplayBuffer(exp_source, REPLAY_BUFFER)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    mse_loss = nn.MSELoss()

    total_rewards = []
    step_idx = 0
    done_episodes = 0
    solved = False

    # try/finally guarantees the TensorBoard writer is closed even when
    # training is interrupted or raises.
    try:
        while not solved:
            step_idx += 1
            # Linear epsilon decay, clamped at EPSILON_STOP.
            selector.epsilon = max(EPSILON_STOP, EPSILON_START - step_idx / EPSILON_STEPS)
            replay_buffer.populate(1)

            # Wait until there are enough transitions to draw a full batch.
            if len(replay_buffer) < BATCH_SIZE:
                continue

            # sample batch
            batch = replay_buffer.sample(BATCH_SIZE)
            batch_states = [exp.state for exp in batch]
            batch_actions = [exp.action for exp in batch]
            batch_targets = [calc_target(net, exp.reward, exp.last_state)
                             for exp in batch]

            # train
            optimizer.zero_grad()
            # Convert through one ndarray instead of a list of ndarrays.
            states_v = torch.as_tensor(np.asarray(batch_states, dtype=np.float32))
            net_q_v = net(states_v)
            # Start from the current predictions and overwrite only the
            # entries of the actions actually taken, so the other entries
            # contribute zero error to the MSE loss.
            target_q = net_q_v.detach().numpy().copy()
            target_q[range(BATCH_SIZE), batch_actions] = batch_targets
            target_q_v = torch.tensor(target_q)
            loss_v = mse_loss(net_q_v, target_q_v)
            loss_v.backward()
            optimizer.step()

            # handle new rewards: process every finished episode returned by
            # the experience source, not only the first one, so none are
            # dropped (e.g. episodes that ended during the warm-up above).
            for reward in exp_source.pop_total_rewards():
                done_episodes += 1
                total_rewards.append(reward)
                mean_rewards = float(np.mean(total_rewards[-100:]))
                print("%d: reward: %6.2f, mean_100: %6.2f, epsilon: %.2f, episodes: %d" % (
                    step_idx, reward, mean_rewards, selector.epsilon, done_episodes))
                writer.add_scalar("reward", reward, step_idx)
                writer.add_scalar("reward_100", mean_rewards, step_idx)
                writer.add_scalar("epsilon", selector.epsilon, step_idx)
                writer.add_scalar("episodes", done_episodes, step_idx)
                # NOTE(review): 195 is the stopping threshold used here;
                # confirm it is the intended one for CartPole-v1.
                if mean_rewards > 195:
                    print("Solved in %d steps and %d episodes!" % (step_idx, done_episodes))
                    solved = True
                    break
    finally:
        writer.close()

DQN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)
10: reward:   9.00, mean_100:   9.00, epsilon: 1.00, episodes: 1
41: reward:  31.00, mean_100:  20.00, epsilon: 0.99, episodes: 2
58: reward:  17.00, mean_100:  19.00, epsilon: 0.99, episodes: 3
68: reward:  10.00, mean_100:  16.75, epsilon: 0.99, episodes: 4
101: reward:  33.00, mean_100:  20.00, epsilon: 0.98, episodes: 5
128: reward:  27.00, mean_100:  21.17, epsilon: 0.97, episodes: 6
145: reward:  17.00, mean_100:  20.57, epsilon: 0.97, episodes: 7
166: reward:  21.00, mean_100:  20.62, epsilon: 0.97, episodes: 8
181: reward:  15.00, mean_100:  20.00, epsilon: 0.96, episodes: 9
203: reward:  22.00, mean_100:  20.20, epsilon: 0.96, episodes: 10
250: reward:  47.00, mean_100:  22.64, epsilon: 0.95, episodes: 11
263: reward:  13.00, mean_100:  21.83, epsilon: 0.95, episodes: 12
282: reward:  19.00, mean_100:  21.62, ep

2257: reward:  20.00, mean_100:  17.96, epsilon: 0.55, episodes: 120
2269: reward:  12.00, mean_100:  17.88, epsilon: 0.55, episodes: 121
2278: reward:   9.00, mean_100:  17.72, epsilon: 0.54, episodes: 122
2305: reward:  27.00, mean_100:  17.78, epsilon: 0.54, episodes: 123
2327: reward:  22.00, mean_100:  17.73, epsilon: 0.53, episodes: 124
2348: reward:  21.00, mean_100:  17.70, epsilon: 0.53, episodes: 125
2374: reward:  26.00, mean_100:  17.84, epsilon: 0.53, episodes: 126
2384: reward:  10.00, mean_100:  17.73, epsilon: 0.52, episodes: 127
2396: reward:  12.00, mean_100:  17.74, epsilon: 0.52, episodes: 128
2407: reward:  11.00, mean_100:  17.72, epsilon: 0.52, episodes: 129
2426: reward:  19.00, mean_100:  17.68, epsilon: 0.51, episodes: 130
2437: reward:  11.00, mean_100:  17.61, epsilon: 0.51, episodes: 131
2457: reward:  20.00, mean_100:  17.48, epsilon: 0.51, episodes: 132
2467: reward:  10.00, mean_100:  17.31, epsilon: 0.51, episodes: 133
2477: reward:  10.00, mean_100:  1

4974: reward:  57.00, mean_100:  24.06, epsilon: 0.02, episodes: 240
5012: reward:  38.00, mean_100:  24.25, epsilon: 0.02, episodes: 241
5074: reward:  62.00, mean_100:  24.76, epsilon: 0.02, episodes: 242
5148: reward:  74.00, mean_100:  25.38, epsilon: 0.02, episodes: 243
5184: reward:  36.00, mean_100:  25.66, epsilon: 0.02, episodes: 244
5221: reward:  37.00, mean_100:  25.90, epsilon: 0.02, episodes: 245
5267: reward:  46.00, mean_100:  26.25, epsilon: 0.02, episodes: 246
5317: reward:  50.00, mean_100:  26.63, epsilon: 0.02, episodes: 247
5359: reward:  42.00, mean_100:  26.92, epsilon: 0.02, episodes: 248
5411: reward:  52.00, mean_100:  27.30, epsilon: 0.02, episodes: 249
5524: reward: 113.00, mean_100:  28.32, epsilon: 0.02, episodes: 250
5593: reward:  69.00, mean_100:  28.90, epsilon: 0.02, episodes: 251
5664: reward:  71.00, mean_100:  29.28, epsilon: 0.02, episodes: 252
5720: reward:  56.00, mean_100:  29.73, epsilon: 0.02, episodes: 253
5754: reward:  34.00, mean_100:  2

10693: reward:  22.00, mean_100:  45.87, epsilon: 0.02, episodes: 359
10729: reward:  36.00, mean_100:  45.70, epsilon: 0.02, episodes: 360
10751: reward:  22.00, mean_100:  45.35, epsilon: 0.02, episodes: 361
10777: reward:  26.00, mean_100:  43.30, epsilon: 0.02, episodes: 362
10799: reward:  22.00, mean_100:  43.09, epsilon: 0.02, episodes: 363
10830: reward:  31.00, mean_100:  42.87, epsilon: 0.02, episodes: 364
10847: reward:  17.00, mean_100:  42.55, epsilon: 0.02, episodes: 365
10874: reward:  27.00, mean_100:  42.43, epsilon: 0.02, episodes: 366
10904: reward:  30.00, mean_100:  42.09, epsilon: 0.02, episodes: 367
10919: reward:  15.00, mean_100:  41.89, epsilon: 0.02, episodes: 368
10935: reward:  16.00, mean_100:  41.34, epsilon: 0.02, episodes: 369
10965: reward:  30.00, mean_100:  41.06, epsilon: 0.02, episodes: 370
11002: reward:  37.00, mean_100:  41.00, epsilon: 0.02, episodes: 371
11043: reward:  41.00, mean_100:  41.13, epsilon: 0.02, episodes: 372
11081: reward:  38.0

14673: reward:  24.00, mean_100:  34.45, epsilon: 0.02, episodes: 478
14701: reward:  28.00, mean_100:  34.49, epsilon: 0.02, episodes: 479
14735: reward:  34.00, mean_100:  34.60, epsilon: 0.02, episodes: 480
14758: reward:  23.00, mean_100:  34.29, epsilon: 0.02, episodes: 481
14811: reward:  53.00, mean_100:  34.46, epsilon: 0.02, episodes: 482
14865: reward:  54.00, mean_100:  34.71, epsilon: 0.02, episodes: 483
14889: reward:  24.00, mean_100:  34.69, epsilon: 0.02, episodes: 484
14950: reward:  61.00, mean_100:  35.06, epsilon: 0.02, episodes: 485
14990: reward:  40.00, mean_100:  35.15, epsilon: 0.02, episodes: 486
15036: reward:  46.00, mean_100:  35.18, epsilon: 0.02, episodes: 487
15056: reward:  20.00, mean_100:  35.07, epsilon: 0.02, episodes: 488
15072: reward:  16.00, mean_100:  34.67, epsilon: 0.02, episodes: 489
15153: reward:  81.00, mean_100:  35.16, epsilon: 0.02, episodes: 490
15186: reward:  33.00, mean_100:  34.93, epsilon: 0.02, episodes: 491
15219: reward:  33.0

19774: reward:  78.00, mean_100:  44.35, epsilon: 0.02, episodes: 596
19807: reward:  33.00, mean_100:  44.35, epsilon: 0.02, episodes: 597
19863: reward:  56.00, mean_100:  44.65, epsilon: 0.02, episodes: 598
19890: reward:  27.00, mean_100:  44.69, epsilon: 0.02, episodes: 599
19993: reward: 103.00, mean_100:  45.54, epsilon: 0.02, episodes: 600
20063: reward:  70.00, mean_100:  45.87, epsilon: 0.02, episodes: 601
20167: reward: 104.00, mean_100:  46.63, epsilon: 0.02, episodes: 602
20234: reward:  67.00, mean_100:  46.82, epsilon: 0.02, episodes: 603
20307: reward:  73.00, mean_100:  47.24, epsilon: 0.02, episodes: 604
20396: reward:  89.00, mean_100:  47.47, epsilon: 0.02, episodes: 605
20461: reward:  65.00, mean_100:  47.63, epsilon: 0.02, episodes: 606
20497: reward:  36.00, mean_100:  47.80, epsilon: 0.02, episodes: 607
20553: reward:  56.00, mean_100:  47.96, epsilon: 0.02, episodes: 608
20599: reward:  46.00, mean_100:  48.28, epsilon: 0.02, episodes: 609
20647: reward:  48.0

39242: reward: 254.00, mean_100: 183.11, epsilon: 0.02, episodes: 714
39550: reward: 308.00, mean_100: 185.57, epsilon: 0.02, episodes: 715
39896: reward: 346.00, mean_100: 187.68, epsilon: 0.02, episodes: 716
40221: reward: 325.00, mean_100: 190.41, epsilon: 0.02, episodes: 717
40529: reward: 308.00, mean_100: 192.50, epsilon: 0.02, episodes: 718
40874: reward: 345.00, mean_100: 195.51, epsilon: 0.02, episodes: 719
Solved in 40874 steps and 719 episodes!
