## Notebook to Train DQN Model


### Imports

In [11]:
import cv2
import numpy as np
import math
import os

# Import ML Libraries
from collections import namedtuple, deque
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import torch.distributions as distributions

In [12]:
model_path = "./models/"

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory(object):
    def __init__(self, capacity, batch_size):
        self.memory = deque([],maxlen=capacity)
        self.capacity = capacity
        self.batch_size = batch_size

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self):
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)

### Set Up Reward System

In [13]:
class Balloon(object):
    def __init__(self, center, radius):
        self.center = center
        self.radius = radius

    def determine_hit(self, coords):
        dist = np.linalg.norm(self.center - coords)
        if dist <= self.radius:
            return True
        return False
    
    def determine_prize(self, coords, prev_coords, shape):
        dist = np.linalg.norm(self.center - coords)
        prize = 176 - (2 * self.radius)
        scaling = 1 - (dist / self.radius)
        h_flick = abs(prev_coords[0] - coords[0]) / (0.5 * shape[0])
        w_flick = abs(prev_coords[1] - coords[1]) / (0.5 * shape[1])
        prize += min(h_flick * 50, 50) + min(w_flick * 50, 50)

        return prize * scaling

In [14]:
class Game(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w
        self.screen = torch.zeros((1, 1, self.h, self.w))
        self.current_balloons = {}
        self.last_shot = np.array([self.h//2, self.w//2])

        for i in range(0, 3):
            rad = np.random.randint(40, 85)

            y_coord = np.random.randint(20, self.h-20)
            x_coord = np.random.randint(20, self.w-20)
            pos = np.array([x_coord, y_coord])

            bloon = Balloon(pos, rad)

            for y in range(y_coord - rad, y_coord + rad):
                if y < 0 or y >= self.h:
                    continue
                for x in range(x_coord - rad, x_coord + rad):
                    if x < 0 or x >= self.w:
                        continue
                    curr = np.array([y, x])
                    if np.linalg.norm(pos - curr) <= rad:
                        self.screen[0, 0, y, x] = 1

            self.current_balloons[i] = bloon
        
        self.done = False

    def update_shot(self, selected_action):
        action = np.array([selected_action[0][0] // self.h, (selected_action[0][0] % self.h)])

        reward = 0

        to_remove = []

        for i in self.current_balloons:
            balloon = self.current_balloons[i]
            if balloon.determine_hit(action):
                # Determine Reward
                reward += balloon.determine_prize(action, self.last_shot, np.array([self.h, self.w]))

                # Remove From Grid/Dictionary
                for y in range(balloon.center[0] - balloon.radius, balloon.center[0] + balloon.radius):
                    if y < 0 or y >= self.h:
                        continue
                    for x in range(balloon.center[1] - balloon.radius, balloon.center[1] + balloon.radius):
                        if x < 0 or x >= self.w:
                            continue
                        self.screen[0, 0, y, x] = 0

                to_remove.append(i)
        
        for idx in to_remove:
            self.current_balloons.pop(idx)
            print('popped balloon')

        self.last_shot = action

        next_state = self.screen

        if len(self.current_balloons) == 0:
            self.done = True

        return torch.tensor([[reward]]), next_state, self.done

In [15]:
test_game = Game(480, 640)
random_action = torch.tensor([[random.randrange(480 * 640)]])
print(random_action)
test_game.update_shot(random_action)

tensor([[100516]])


(tensor([[0]]), tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]]]]), False)

### DQN Model

In [16]:
class AimlabNetwork(nn.Module):
    def __init__(self, h, w):
        super(AimlabNetwork, self).__init__()
        self.input_channels = 1
        self.output_channels = 1
        self.hidden_size_1 = 32
        self.hidden_size_2 = 64
        self.hidden_size_3 = 128
        self.linear = h * w

        self.encoder = nn.ModuleList([
            #--- Stage 1
            nn.Conv2d(self.input_channels, self.hidden_size_1, kernel_size=5, padding=2, stride=2),
            nn.BatchNorm2d(self.hidden_size_1), 
            nn.ReLU(),
            nn.Conv2d(self.hidden_size_1, self.hidden_size_1, kernel_size=3, padding=1), 
            nn.BatchNorm2d(self.hidden_size_1), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2), 
            #--- Stage 2
            nn.Conv2d(self.hidden_size_1, self.hidden_size_2, kernel_size=5, padding=2, stride=2), 
            nn.BatchNorm2d(self.hidden_size_2), 
            nn.ReLU(),
            nn.Conv2d(self.hidden_size_2, self.hidden_size_2, kernel_size=5, padding=2, stride=2), 
            nn.BatchNorm2d(self.hidden_size_2), 
            nn.ReLU(),
            nn.Conv2d(self.hidden_size_2, self.hidden_size_2, kernel_size=3, padding=1), 
            nn.BatchNorm2d(self.hidden_size_2), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2), 
            #--- Stage 3
            nn.Conv2d(self.hidden_size_2, self.hidden_size_3, kernel_size=5, padding=2, stride=2), 
            nn.BatchNorm2d(self.hidden_size_3), 
            nn.ReLU(),
            nn.Conv2d(self.hidden_size_3, self.hidden_size_3, kernel_size=5, padding=2, stride=2), 
            nn.BatchNorm2d(self.hidden_size_3), 
            nn.ReLU(),
            nn.Conv2d(self.hidden_size_3, self.hidden_size_3, kernel_size=3, padding=1), 
            nn.BatchNorm2d(self.hidden_size_3), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ])

        self.fcn = nn.Linear(512, h * w)
        self.act = nn.Softmax(dim=1)



    def forward(self, x):
        out = x
        for i, l in enumerate(self.encoder):
            out = l(out)
        out = out.flatten(1)
        out = self.act(self.fcn(out))
        return out

In [17]:
test_model = AimlabNetwork(480, 640)
rand_tensor = torch.rand((1, 1, 480, 640))
output = test_model(rand_tensor)

In [18]:
class AimlabTrainer(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w

        """
        Model Parameters
        """
        self.batch_size = 1
        self.gamma = 0.999
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 200
        self.target_update = 10

        self.steps_done = 0

        self.model = AimlabNetwork(h, w)
        self.target_model = AimlabNetwork(h, w)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.memory = ReplayMemory(10, 1)
        self.optimizer = optim.RMSprop(self.model.parameters())


    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            action = self.model(state).max(1)[1].view(1, 1)
        else:
            action = torch.tensor([[random.randrange(self.h * self.w)]])
        return action
    

    def optimize(self):
        if len(self.memory) < self.memory.capacity:
            return
        transitions = self.memory.sample()
        batch = Transition(*zip(*transitions))

        if batch.next_state is not None:
            non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
            non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)                     

        # Compute Q(s_t, a)
        state_action_values = self.model(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = torch.zeros(1)
        next_state_values[non_final_mask] = self.target_model(non_final_next_states).max(1)[0].detach()
        # Compute expected Q
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        
        # Compute Huber loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values)

        # Optimize Model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()


    def train(self):
        num_episodes = 10
        for i_episode in range(num_episodes):
            if i_episode % 2 == 0:
                print(f"Starting Episode {i_episode}")

            game = Game(self.h, self.w)
            last_screen = game.screen
            current_screen = game.screen
            state = current_screen - last_screen
            
            done = False
            while not done:
                action = self.select_action(state)
                reward, next_state, done = game.update_shot(action)

                # Move to Next State
                last_screen = current_screen
                current_screen = next_state
                if done:
                    next_state = None
                else:
                    next_state = current_screen - last_screen

                # Store Transition in Memory
                self.memory.push(state, action, reward, next_state)

                # Perform Optimization
                self.optimize()
    

    def evaluate(self):
        print("Starting Evaluation")
        game = Game(self.h, self.w)
        state = game.screen
        done = False
        shots = 0
        while not done:
            action = self.model(state).detach()
            reward, next_state, done = game.update_shot(action)
            if reward > 0:
                print(f"balloon popped in {shots} shots")

            # Move to Next State
            if done:
                state = None
            else:
                state = next_state
            shots += 1
    

IndentationError: expected an indented block (<ipython-input-18-cb54069bb809>, line 47)

## Train Model

In [184]:
model = AimlabTrainer(480, 640)
model.train()
model.evaluate()

Starting Episode 0


  action = np.array([selected_action[0][0] // self.h, (selected_action[0][0] % self.h)])


popped balloon
popped balloon
popped balloon


NotImplementedError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat.  This usually means that this function requires a non-empty list of Tensors, or that you (the operator writer) forgot to register a fallback function.  Available functions are [CPU, CUDA, QuantizedCPU, BackendSelect, Python, Named, Conjugate, Negative, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, UNKNOWN_TENSOR_TYPE_ID, Autocast, Batched, VmapMode].

CPU: registered at aten\src\ATen\RegisterCPU.cpp:18433 [kernel]
CUDA: registered at aten\src\ATen\RegisterCUDA.cpp:26496 [kernel]
QuantizedCPU: registered at aten\src\ATen\RegisterQuantizedCPU.cpp:1068 [kernel]
BackendSelect: fallthrough registered at ..\aten\src\ATen\core\BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ..\aten\src\ATen\core\PythonFallbackKernel.cpp:47 [backend fallback]
Named: registered at ..\aten\src\ATen\core\NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ..\aten\src\ATen\ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at ..\aten\src\ATen\native\NegateFallback.cpp:18 [backend fallback]
ADInplaceOrView: fallthrough registered at ..\aten\src\ATen\core\VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradCPU: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradCUDA: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradXLA: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradLazy: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradXPU: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradMLC: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradHPU: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradNestedTensor: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradPrivateUse1: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradPrivateUse2: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
AutogradPrivateUse3: registered at ..\torch\csrc\autograd\generated\VariableType_3.cpp:10141 [autograd kernel]
Tracer: registered at ..\torch\csrc\autograd\generated\TraceType_3.cpp:11560 [kernel]
UNKNOWN_TENSOR_TYPE_ID: fallthrough registered at ..\aten\src\ATen\autocast_mode.cpp:466 [backend fallback]
Autocast: fallthrough registered at ..\aten\src\ATen\autocast_mode.cpp:305 [backend fallback]
Batched: registered at ..\aten\src\ATen\BatchingRegistrations.cpp:1016 [backend fallback]
VmapMode: fallthrough registered at ..\aten\src\ATen\VmapModeRegistrations.cpp:33 [backend fallback]


In [183]:
model.evaluate()

Starting Evaluation


  action = np.array([selected_action[0][0] // self.h, (selected_action[0][0] % self.h)])


KeyboardInterrupt: 