## Notebook to Train DQN Model


### Imports

In [13]:
import cv2
import numpy as np
import math
import os

# Import ML Libraries
from collections import namedtuple, deque
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import torch.distributions as distributions

In [14]:
model_path = "./models/"

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory(object):
    def __init__(self, capacity, batch_size):
        self.memory = deque([],maxlen=capacity)
        self.capacity = capacity
        self.batch_size = batch_size

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self):
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)

### Set Up Reward System

In [15]:
class Balloon(object):
    def __init__(self, center, depth):
        self.center = center
        self.depth = depth
        self.alive = True
    
    def determine_prize(self, prev_coords, shape, max_depth):
        prize = 100
        scaling = self.depth / max_depth
        h_flick = abs(prev_coords[0] - self.center[0]) / (0.5 * shape[0])
        w_flick = abs(prev_coords[1] - self.center[1]) / (0.5 * shape[1])
        prize += min(h_flick * 50, 50) + min(w_flick * 50, 50)

        return prize * scaling

In [16]:
class Game(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w
        self.max_depth = 0
        self.balloons_popped = 0
        self.current_balloons = {}
        self.last_shot = np.array([self.h//2, self.w//2])
        self.state = [[self.h//2, self.w//2, 0]]

        for i in range(0, 3):
            depth = np.random.uniform(1, 4)
            if depth > self.max_depth:
                self.max_depth = depth

            y_coord = np.random.randint(20, self.h-20)
            x_coord = np.random.randint(20, self.w-20)
            pos = np.array([y_coord, x_coord])

            bloon = Balloon(pos, depth)

            self.current_balloons[i] = bloon

            self.state.append([y_coord, x_coord, depth])

        self.done = False
        self.state = np.array(self.state, dtype=np.double)
    
    def restart_game(self):
        self.balloons_popped = 0
        self.last_shot = np.array([self.h//2, self.w//2])
        self.state[0] = [self.h//2, self.w//2, 0]
        for b in self.current_balloons:
            balloon = self.current_balloons[b]
            balloon.alive = True
            self.state[b+1][2] = balloon.depth

    def update_shot(self, selected_balloon):
        ball_idx = selected_balloon[0].item()
        balloon = self.current_balloons[ball_idx]
        self.state[0] = [balloon.center[0], balloon.center[1], 0]
        if not balloon.alive:
            reward = -50
        else:
            reward = balloon.determine_prize(self.last_shot, [self.h, self.w], self.max_depth)
            self.state[ball_idx+1][2] = 0
            balloon.alive = False
            self.balloons_popped += 1

        next_state = self.state
        self.last_action = [balloon.center[0], balloon.center[1]]

        if self.balloons_popped == 3:
            self.done = True

        return torch.tensor([[reward]]), next_state, self.done

In [17]:
test_game = Game(480, 640)
print("First State")
print(test_game.state)
random_action = torch.tensor([random.randrange(3)])
print(f"Action: {random_action.item()}")
test_game.update_shot(random_action)
test_game.restart_game()
print(test_game.state)

First State
[[240.         320.           0.        ]
 [ 37.         439.           2.71539459]
 [203.         179.           3.32974312]
 [236.         595.           2.91904951]]
Action: 2
[[240.         320.           0.        ]
 [ 37.         439.           2.71539459]
 [203.         179.           3.32974312]
 [236.         595.           2.91904951]]


### DQN Model

In [18]:
class AimlabNetwork(nn.Module):
    def __init__(self):
        super(AimlabNetwork, self).__init__()
        self.input_channels = 1

        self.lstm = nn.LSTM(3, 1, num_layers = 1, batch_first=True)
        self.encoder = nn.ModuleList([
            nn.Linear(1, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),            
        ])
        self.lstm2 = nn.LSTM(1024, 1, num_layers = 1, batch_first=True)
        self.fcn = nn.Linear(4, 3)
        self.act = nn.Softmax(dim=1)

    def forward(self, x):
        out, hidden = self.lstm(x)
        for i, l in enumerate(self.encoder):
            out = l(out)
        out, hidden2 = self.lstm2(out)
        out = out.flatten(1)
        out = self.fcn(out)
        out = self.act(out)

        return out

In [19]:
test_network = AimlabNetwork()
test_state = torch.rand((1, 4, 3))
test_state = torch.tensor([[[230, 320, 1],
                           [100, 300, 2],
                           [412, 420, 4],
                           [0, 1, 2.7]]])
print(test_state.shape)
output = test_network(test_state)
print(output.shape)
print(output.max(1)[1].view(1, 1))

torch.Size([1, 4, 3])
torch.Size([1, 3])
tensor([[1]])


In [44]:
class AimlabTrainer(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w

        """
        Model Parameters
        """
        self.batch_size = 1
        self.gamma = 0.999
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 200
        self.target_update = 10

        self.steps_done = 0

        self.model = AimlabNetwork().float()
        self.target_model = AimlabNetwork().float()
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.memory = ReplayMemory(1000, 1)
        self.optimizer = optim.RMSprop(self.model.parameters())


    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            action = self.model(state.float()).max(1)[1].view(1, 1)
        else:
            action = torch.tensor([[random.randint(0, 2)]])
        return action
    

    def optimize(self):
        transitions = self.memory.sample()
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
           
        # Compute Q(s_t, a)
        state_action_values = self.model(state_batch.float()).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        if batch.next_state[0] is None:
            next_state_values = torch.zeros(1)
        else:
            next_state_batch = torch.cat(batch.next_state)  
            next_state_values = self.target_model(next_state_batch.float()).max(1)[0].detach()
        # Compute expected Q
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        
        # Compute Huber loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values.float(), expected_state_action_values.float())

        # Optimize Model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        num_episodes = 1000
        for i_episode in range(num_episodes):
            if i_episode % 100 == 0:
                print(f"Starting Episode {i_episode}")

            game = Game(self.h, self.w)

            for replay in range(0, 100):
                game.restart_game()
                state = torch.tensor(game.state, dtype=torch.double)
                state = state.reshape(1, 4, 3)
            
                done = False
                while not done:
                    action = self.select_action(state)
                    reward, next_state, done = game.update_shot(action)

                    next_state = torch.tensor(next_state, dtype=torch.double)
                    next_state = next_state.reshape(1, 4, 3)

                    if done:
                        next_state = None

                    # Store Transition in Memory
                    self.memory.push(state, action, reward, next_state)

                    # State Transition
                    state = next_state

                    # Perform Optimization
                    self.optimize()
    

    def evaluate(self):
        print("Starting Evaluation")
        game = Game(self.h, self.w)
        state = game.state
        done = False
        total_reward = 0
        while not done:
            state = torch.tensor(state, dtype=torch.double)
            state = state.reshape(1, 4, 3)
            print(state)
            action = self.model(state.float()).max(1)[1].view(1, 1)
            print(self.model(state.float()))
            reward, next_state, done = game.update_shot(action)
            total_reward += reward

            # Move to Next State
            state = next_state
        print(f"Reward {total_reward}")
    

## Train Model

In [46]:
model = AimlabTrainer(480, 640)
model.train()

Starting Episode 0
Starting Episode 100
Starting Episode 200
Starting Episode 300
Starting Episode 400
Starting Episode 500
Starting Episode 600
Starting Episode 700


KeyboardInterrupt: 

In [48]:
model.evaluate()

Starting Evaluation
tensor([[[240.0000, 320.0000,   0.0000],
         [220.0000,  82.0000,   2.4946],
         [221.0000, 598.0000,   3.5726],
         [336.0000, 527.0000,   2.0290]]], dtype=torch.float64)
tensor([[2.8734e-10, 2.8896e-10, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)
tensor([[[336.0000, 527.0000,   0.0000],
         [220.0000,  82.0000,   2.4946],
         [221.0000, 598.0000,   3.5726],
         [336.0000, 527.0000,   0.0000]]], dtype=torch.float64)
tensor([[2.8734e-10, 2.8896e-10, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)
tensor([[[336.0000, 527.0000,   0.0000],
         [220.0000,  82.0000,   2.4946],
         [221.0000, 598.0000,   3.5726],
         [336.0000, 527.0000,   0.0000]]], dtype=torch.float64)
tensor([[2.8734e-10, 2.8896e-10, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)
tensor([[[336.0000, 527.0000,   0.0000],
         [220.0000,  82.0000,   2.4946],
         [221.0000, 598.0000,   3.5726],
         [336.0000, 527.0000,   0.0000]]], dtype=torch.float64)
tensor(

KeyboardInterrupt: 