## Train a Deep Q-Network (DQN) agent on MsPacman-v0

## Mounting google drive for saving the video

In [None]:
# Mount Google Drive at /content/gdrive (per the section heading, it is used for saving the video).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Get the Ms. Pac-Man ROM ready for the Atari environment

In [None]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars

Collecting unrar
  Downloading unrar-0.4-py3-none-any.whl (25 kB)
Installing collected packages: unrar
Successfully installed unrar-0.4

UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from Roms.rar

Extracting  HC ROMS.zip                                                   36%  OK 
Extracting  ROMS.zip                                                      74% 99%  OK 
All OK
copying adventure.bin from ROMS/Adventure (1980) (Atari, Warren Robinett) (CX2613, CX2613P) (PAL).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/adventure.bin
copying air_raid.bin from ROMS/Air Raid (Men-A-Vision) (PAL) ~.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/air_raid.bin
copying alien.bin from ROMS/Alien (1982) (20th Century Fox Video Games, Douglas 'Dallas North' Neubauer) (11006) ~.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/alien.bin
copying amidar.bin from ROMS/Amidar (1982) (Parker Brothers

## Packages
First, let's import the packages we need:
- the game environment (`gym`)
- neural networks (`torch.nn`)
- optimization (`torch.optim`)

In [None]:
import gym
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import deque
import numpy as np
import random
import datetime

In [None]:
# Experiment setting; its value is embedded in the names of the saved video and checkpoint files.
reward_number = 50

DQN Algorithm
-------------
We train a policy that tries to maximize the discounted,
cumulative reward
$R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t$, where
$R_{t_0}$ is the *return*. The discount factor $\gamma$ is a constant between $0$ and $1$ that makes rewards far in the future count less than immediate ones.


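Q-learning tries to find a function
$Q^*: State \times Action \rightarrow \mathbb{R}$ that tells us the return we would obtain by taking a given action in a given state and acting optimally afterwards. A policy that maximizes rewards then simply picks the action with the highest value: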

\begin{align}\pi^*(s) = \arg\!\max_a \ Q^*(s, a)\end{align}

However, we don't know $Q^*$. Since neural networks are universal function approximators, we can simply create one and train it to resemble $Q^*$.

For our training update rule, we'll use a fact that every $Q$
function for some policy obeys the Bellman equation:

\begin{align}Q^{\pi}(s, a) = r + \gamma Q^{\pi}(s', \pi(s'))\end{align}

The difference between the two sides of the equality is known as the temporal difference error, $\delta$:

\begin{align}\delta = Q(s, a) - (r + \gamma \max_a Q(s', a))\end{align}

In [None]:
class CNN(torch.nn.Module):
    """Convolutional Q-network: maps a batch of frames to one value per action.

    Three conv -> max-pool -> ReLU stages are followed by five fully connected
    layers. ``fc1`` takes 1792 flattened features, which is what the conv stack
    produces for a ``(N, in_channels, 210, 160)`` input (64 channels x 7 x 4).

    Args:
        in_channels: number of channels of the input frames (default 3).
        num_actions: size of the output layer, one value per action (default 9).
    """

    def __init__(self, in_channels=3, num_actions=9):
        super(CNN, self).__init__()
        self.convolution1 = nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=3)
        self.convolution2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5)
        self.convolution3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=7)
        self.fc1 = nn.Linear(in_features=1792, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=256)
        self.fc4 = nn.Linear(in_features=256, out_features=32)
        self.fc5 = nn.Linear(in_features=32, out_features=num_actions)

    def forward(self, x):
        """Return a ``(N, num_actions)`` tensor of action values for the batch ``x``."""
        # Follow the weights rather than forcing CUDA: this works on CPU-only
        # machines and never leaves input and parameters on different devices.
        x = x.to(self.convolution1.weight.device)
        x = F.relu(F.max_pool2d(self.convolution1(x), 3))
        x = F.relu(F.max_pool2d(self.convolution2(x), 3))
        # Last pooling stage uses kernel 3 with stride 2.
        x = F.relu(F.max_pool2d(self.convolution3(x), 3, 2))
        # Flatten everything except the batch dimension for the dense layers.
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # No activation on the output: action values are unbounded.
        x = self.fc5(x)
        return x

        ## Quick exercise: Please draw the CNN arch.

In [None]:
# Build the Q-network and place its weights on the GPU when one is available;
# a freshly constructed module lives on the CPU until it is moved explicitly.
model = CNN()
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

## Device helpers: use the GPU if it is available, otherwise fall back to the CPU

In [None]:
#function to ensure that our code uses the GPU if available, and defaults to using the CPU if it isn't.
def get_default_device():
    """Return the CUDA device when CUDA is usable, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
# a function that can move data and model to a chosen device.    
def to_device(data, device):
    """Move a tensor, or every tensor in a (possibly nested) list/tuple, to ``device``.

    Lists and tuples are both returned as lists; anything else is moved with
    its own ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

## Loss Function and Optimizer

In [None]:
# The exercise placeholders are filled in so the notebook runs top to bottom.
# Mean squared error between predicted and target action values.
criterion = nn.MSELoss()  ## your loss function from nn
# Adam over the Q-network's parameters; 0.001 matches DQNAgent.learning_rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  ## your optimizer

## DQN Agent
### Replay Memory
We’ll be using experience replay memory for training our DQN. It stores the transitions that the agent observes, allowing us to reuse this data later. By sampling from it randomly, the transitions that build up a batch are decorrelated. It has been shown that this greatly stabilizes and improves the DQN training procedure.
### Hyperparameters

In [None]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a two-bucket replay memory.

    Transitions whose stored reward is exactly 0 go to ``memory_p``; all other
    transitions go to ``memory_n``. ``replay`` draws a minibatch from both
    buckets and regresses the network towards the one-step Bellman target
    ``r + gamma * max_a Q(s', a)`` (just ``r`` for terminal transitions).

    Args:
        action_size: number of discrete actions.
        network: Q-network to use. Defaults to the notebook-level ``model``.
        loss_fn: loss used by ``train``. Defaults to the notebook-level
            ``criterion``, looked up when ``train`` runs.
        optim: optimizer used by ``train``. Defaults to the notebook-level
            ``optimizer``, looked up when ``train`` runs.
    """

    # Share of each mixed minibatch drawn from ``memory_n``
    # (5 of 64 samples, the split the original code hard-coded).
    NEGATIVE_FRACTION = 5 / 64

    def __init__(self, action_size = 9, network=None, loss_fn=None, optim=None):
        self.state_size = 3
        self.action_size = action_size
        self.memory_n = deque(maxlen=2000)   # transitions with non-zero reward
        self.memory_p = deque(maxlen=2000)   # transitions with zero reward
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.2
        self.epsilon_decay = 0.995
        # Not read by the agent itself; the step size is whatever the optimizer was built with.
        self.learning_rate = 0.001
        self.model = model if network is None else network
        self.criterion = loss_fn
        self.optimizer = optim

    def _device(self):
        """Device holding the network's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    def _q_values(self, states):
        """Evaluate the network on a numpy batch without tracking gradients; returns a numpy array."""
        batch = torch.from_numpy(np.asarray(states)).float().to(self._device())
        with torch.no_grad():
            return self.model(batch).cpu().numpy()

    def _sample_minibatch(self, batch_size):
        """Draw up to ``batch_size`` transitions, never asking a bucket for more than it holds."""
        if len(self.memory_n) > batch_size / 2:
            n_neg = max(1, round(batch_size * self.NEGATIVE_FRACTION))
            n_neg = min(n_neg, len(self.memory_n))
            n_pos = min(batch_size - n_neg, len(self.memory_p))
            return random.sample(self.memory_n, n_neg) + random.sample(self.memory_p, n_pos)
        return random.sample(self.memory_p, min(batch_size, len(self.memory_p)))

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the bucket selected by ``reward == 0``."""
        if reward == 0:
            self.memory_p.append((state, action, reward, next_state, done))
        else:
            self.memory_n.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick a random action with probability ``epsilon``, otherwise the greedy one."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self._q_values(state)
        return int(np.argmax(act_values[0]))  # returns action

    def replay(self, batch_size):
        """Run one training step on a sampled minibatch, then decay ``epsilon``.

        The stored reward is used as-is, so any reward shaping is the caller's
        responsibility. Does nothing when no transition can be sampled.
        """
        minibatch = self._sample_minibatch(batch_size)
        if not minibatch:
            return
        # Each stored state already carries a leading batch axis of size 1.
        states = np.concatenate([t[0] for t in minibatch], axis=0)
        next_states = np.concatenate([t[3] for t in minibatch], axis=0)
        actions = np.array([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
        terminal = np.array([t[4] for t in minibatch], dtype=bool)

        # Bellman target; terminal transitions do not bootstrap from the next state.
        q_next = self._q_values(next_states)
        bellman = rewards + self.gamma * q_next.max(axis=1) * (~terminal)
        # Only the action actually taken gets a new target; the other outputs
        # keep their current prediction and so contribute no error.
        targets = self._q_values(states)
        targets[np.arange(len(minibatch)), actions] = bellman
        self.train(states, targets, epochs=1)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train(self, input, target, epochs = 1):
        """Fit the network to ``target`` on ``input`` for ``epochs`` gradient steps.

        Both arguments are numpy arrays. Returns the last loss value as a float,
        or ``None`` when ``epochs`` is 0.
        """
        device = self._device()
        inputs = torch.from_numpy(np.asarray(input)).float().to(device)
        targets = torch.from_numpy(np.asarray(target)).float().to(device)
        loss_fn = criterion if self.criterion is None else self.criterion
        optim = optimizer if self.optimizer is None else self.optimizer
        last_loss = None
        for _ in range(epochs):
            y_pred = self.model(inputs)
            # Minimise the loss: the optimizer descends, so the value must not be negated.
            loss = loss_fn(y_pred, targets)
            optim.zero_grad()
            loss.backward()
            optim.step()
            last_loss = loss.item()
        return last_loss

    def load_all(self, name):
        """Restore network weights and both replay buckets from ``name``."""
        # SECURITY: torch.load unpickles arbitrary objects; only load files you created.
        loaded = torch.load(name, map_location=self._device())
        self.memory_n = loaded['memory_n']
        self.memory_p = loaded['memory_p']
        self.model.load_state_dict(loaded['state'])

    def save_all(self, name):
        """Save network weights and both replay buckets to ``name``."""
        torch.save({'state': self.model.state_dict(),
                    'memory_n': self.memory_n,
                    'memory_p': self.memory_p
                   }, name)

    def load(self, name):
        """Restore only the network weights from ``name``."""
        # SECURITY: torch.load unpickles arbitrary objects; only load files you created.
        self.model.load_state_dict(torch.load(name, map_location=self._device()))

    def save(self, name):
        """Save only the network weights to ``name``."""
        torch.save(self.model.state_dict(), name)




In [None]:
# Create the environment and read its observation shape and number of discrete actions.
env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n


In [None]:
# Size the agent's action set from the environment instead of relying on the default of 9.
agent = DQNAgent(action_size=action_size)

In [None]:
done = False  # episode-finished flag; overwritten by env.step() in the training loop
batch_size = 64  # minibatch size passed to agent.replay() and used for the memory-size thresholds


## Main training loop

In [None]:
EPISODES = 100
# Directory for videos and checkpoints (trailing slash required).
# NOTE(review): Drive is mounted at /content/gdrive; a bare 'gdrive/' only resolves
# there from /content, and the mount root itself may not be writable
# (files usually live under a 'MyDrive' subfolder) - confirm the target path.
OUTPUT_DIR = 'gdrive/'
FOURCC = cv2.VideoWriter_fourcc(*'XVID')


def preprocess_frame(frame):
    """Turn one env frame into the (1, 3, 210, 160) float32 array, scaled by 1/255, fed to the agent."""
    scaled = np.reshape(frame, (1, 210, 160, 3)).transpose(0, 3, 1, 2) / 255
    # float32 halves the replay-memory footprint; the agent converts to float32 anyway.
    return scaled.astype(np.float32)


for e in range(EPISODES):
    run_tag = "Reward_number_" + str(reward_number) + "_Episode_" + str(e)
    started = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    vw = cv2.VideoWriter(OUTPUT_DIR + run_tag + "_Date_" + started + '.avi', FOURCC, 4, (160,210))
    try:
        state = preprocess_frame(env.reset())
        step = 0
        while True:
            action = agent.act(state)
            frame, reward, done, _ = env.step(action)
            # The env renders RGB while OpenCV writers expect BGR.
            vw.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            # NOTE(review): every non-zero env reward and the terminal step are collapsed
            # to the same value 10 and logged as "negative" - confirm this shaping is intended.
            reward = 10 if (done or reward != 0) else 0
            if reward != 0:
                print("ATTENTION NEGATIVE REWARD",reward)
            next_state = preprocess_frame(frame)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                finished = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
                agent.save(OUTPUT_DIR + run_tag + "_Frames_" + str(step) + "_Date_" + finished + '.pt')
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, step, agent.epsilon))
                break
            # Start learning only once both replay buckets hold enough transitions.
            if len(agent.memory_p) > batch_size and len(agent.memory_n) > batch_size/2:
                agent.replay(batch_size)
            step += 1
    finally:
        # Always finalise the video, including when the cell is interrupted.
        vw.release()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
ATTENTION NEGATIVE REWARD 10
87
88
89
90
91
92
93
94
95
96
ATTENTION NEGATIVE REWARD 10
97
98
99
100
101
102
103
104
ATTENTION NEGATIVE REWARD 10
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
ATTENTION NEGATIVE REWARD 10
153
154
155
156
157
ATTENTION NEGATIVE REWARD 10
158
159
160
161
162
163
164
165
ATTENTION NEGATIVE REWARD 10
166
167
168
169
170
171
172
ATTENTION NEGATIVE REWARD 10
173
174
175
176
177
ATTENTION NEGATIVE REWARD 10
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
ATTENTION NEGATIVE REWARD 10
194
195
196
197
ATTENTION NEGATIVE REWARD 10
198
199
200
201
202
ATTENTIO

KeyboardInterrupt: ignored