## Notebook to Train DQN Model


### Imports

In [744]:
import cv2
import numpy as np
import math
import os

# Import ML Libraries
from collections import namedtuple, deque
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import torch.distributions as distributions

In [745]:
model_path = "./models/"

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory(object):
    def __init__(self, capacity, batch_size):
        self.memory = deque([],maxlen=capacity)
        self.capacity = capacity
        self.batch_size = batch_size

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self):
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)

### Set Up Reward System

In [746]:
class Balloon(object):
    def __init__(self, center, depth):
        self.center = center
        self.depth = depth
        self.alive = True
    
    def determine_prize(self, prev_coords, shape, max_depth):
        prize = 100
        scaling = self.depth / max_depth
        h_flick = abs(prev_coords[0] - self.center[0]) / (0.5 * shape[0])
        w_flick = abs(prev_coords[1] - self.center[1]) / (0.5 * shape[1])
        prize += min(h_flick * 50, 50) + min(w_flick * 50, 50)

        return prize * scaling

In [815]:
class Game(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w
        self.max_depth = 0
        self.balloons_popped = 0
        self.current_balloons = {}
        self.last_shot = np.array([self.h//2, self.w//2])
        self.state = []

        for i in range(0, 3):
            depth = np.random.uniform(1, 4)
            if depth > self.max_depth:
                self.max_depth = depth

            y_coord = np.random.randint(20, self.h-20)
            x_coord = np.random.randint(20, self.w-20)
            pos = np.array([y_coord, x_coord])

            bloon = Balloon(pos, depth)
            
            self.current_balloons[i] = bloon

            self.state.append([y_coord/self.h, x_coord/self.w, depth])
        
        for i in range(0, 3):
            self.state[i][2] /= self.max_depth

        self.done = False

    def determine_top_score(self):
        max_reward = 0
        perms = [[0, 1, 2], [0, 2, 1], [1, 2, 0], [1, 0, 2], [2, 0, 1], [2, 1, 0]]
        size = np.array([self.h, self.w])
        rewards = {}

        for p in perms:
            reward = 0

            shot = np.array([self.h//2, self.w//2])

            for b in p:
                ball = self.current_balloons[b]
                reward += ball.determine_prize(shot, size, self.max_depth)
                shot = ball.center

            if reward > max_reward:
                max_reward = reward
            rewards[str(p)] = reward

        return max_reward, rewards
    
    def restart_game(self):
        self.last_shot = np.array([self.h//2, self.w//2])
        self.state = []
        for b in self.current_balloons:
            balloon = self.current_balloons[b]
            balloon.alive = True
            self.state.append([balloon.center[0]/self.h, balloon.center[1]/self.w, balloon.depth/self.max_depth])
        self.balloons_popped = 0
        self.done = False

    def update_shot(self, selected_balloon):
        balloon_params = self.state[selected_balloon]
        balloon = None
        for b_idx in self.current_balloons:
            ball_check = self.current_balloons[b_idx]
            if (balloon_params[0] == ball_check.center[0]/self.h) and (balloon_params[1] == ball_check.center[1]/self.w) \
                    and (balloon_params[2] == ball_check.depth/self.max_depth):
                balloon = ball_check

        if not balloon.alive:
            reward = -50
        else:
            reward = balloon.determine_prize(self.last_shot, [self.h, self.w], self.max_depth)
            self.state.pop(selected_balloon)
            balloon.alive = False
            self.balloons_popped += 1

        next_state = self.state
        self.last_shot = [balloon.center[0], balloon.center[1]]

        if self.balloons_popped == 3:
            self.done = True

        return torch.tensor([[reward]]), next_state, self.done

In [816]:
test_game = Game(480, 640)
# print("First State")
# print(test_game.state)
random_action = torch.tensor([0])
print(f"Action: {random_action.item()}")
print(test_game.state)
test_game.restart_game()
best_score, rewards = test_game.determine_top_score()
print(f"Best Score: {best_score}")
print(f"All Rewards: {rewards}")
print(best_score)
print(test_game.state)

Action: 0
[[0.15, 0.8328125, 0.9541180789390029], [0.7145833333333333, 0.41875, 0.9549312033070895], [0.8458333333333333, 0.86875, 1.0]]
Best Score: 505.0809934031854
All Rewards: {'[0, 1, 2]': 501.4649836044517, '[0, 2, 1]': 465.1524294943876, '[1, 2, 0]': 428.41474211558113, '[1, 0, 2]': 459.9610819754622, '[2, 0, 1]': 500.7847076533684, '[2, 1, 0]': 505.0809934031854}
505.0809934031854
[[0.15, 0.8328125, 0.9541180789390029], [0.7145833333333333, 0.41875, 0.9549312033070895], [0.8458333333333333, 0.86875, 1.0]]


### DQN Model

In [851]:
class AimlabNetwork(nn.Module):
    def __init__(self):
        super(AimlabNetwork, self).__init__()
        self.input_channels = 1

        self.lstm = nn.LSTM(3, 3, num_layers = 2, batch_first=True)
        self.encoder = nn.ModuleList([
            nn.Linear(3, 1),
            nn.ReLU(),
        ])
        # self.lstm2 = nn.LSTM(3, 1, num_layers = 2, batch_first=True)

    def forward(self, x):
        out, hidden = self.lstm(x)
        for i, l in enumerate(self.encoder):
            out = l(out)
        # out, hidden2 = self.lstm2(out)
        out = out.flatten(1)

        return out

In [852]:
test_network = AimlabNetwork()
test_state = torch.rand((1, 4, 3))
#test_state = torch.tensor([[[230, 320, 1],
#                           [100, 300, 2],
#                           [412, 420, 4],
#                           [0, 1, 2.7]]])
print(test_state.shape)
output = test_network(test_state)
print(output)
print(output.shape)
print(output.max(1)[1].view(1, 1))

torch.Size([1, 4, 3])
tensor([[0.3958, 0.3981, 0.4018, 0.4013]], grad_fn=<ReshapeAliasBackward0>)
torch.Size([1, 4])
tensor([[2]])


In [853]:
class AimlabTrainer(object):
    def __init__(self, h, w):
        self.h = h
        self.w = w

        """
        Model Parameters
        """
        self.batch_size = 1
        self.gamma = 0.999
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 200
        self.target_update = 10

        self.steps_done = 0

        self.model = AimlabNetwork().float()
        self.target_model = AimlabNetwork().float()
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.memory = ReplayMemory(50, 1)
        self.optimizer = optim.RMSprop(self.model.parameters())


    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            action = self.model(state.float()).max(1)[1].view(1, 1)
        else:
            action = torch.tensor([[random.randint(0, len(state)-1)]])
        return action
    

    def optimize(self):
        transitions = self.memory.sample()
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
           
        # Compute Q(s_t, a)
        state_action_values = self.model(state_batch.float()).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        if batch.next_state[0] is None:
            next_state_values = torch.zeros(1)
        else:
            next_state_batch = torch.cat(batch.next_state)  
            next_state_values = self.target_model(next_state_batch.float()).max(1)[0].detach()
        # Compute expected Q
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        
        # Compute Huber loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values.float(), expected_state_action_values.float())

        # Optimize Model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        num_episodes = 10000
        for i_episode in range(num_episodes):
            if i_episode % 2500 == 0:
                print(f"Starting Episode {i_episode}")

            game = Game(self.h, self.w)

            for replay in range(0, 20):
                game.restart_game()
                state = torch.tensor(game.state, dtype=torch.double)
                state = state.reshape(1, -1, 3)
            
                done = False
                while not done:
                    action = self.select_action(state)
                    reward, next_state, done = game.update_shot(action)

                    next_state = torch.tensor(next_state, dtype=torch.double)
                    next_state = next_state.reshape(1, -1, 3)

                    if done:
                        next_state = None

                    # Store Transition in Memory
                    self.memory.push(state, action, reward, next_state)

                    # State Transition
                    state = next_state

                    # Perform Optimization
                    self.optimize()


    def evaluate_once(self):
        print("Starting Evaluation")
        game = Game(self.h, self.w)
        state = game.state
        print(f"Game State: {state}")
        done = False
        total_reward = 0
        order = []
        while not done:
            state = torch.tensor(state, dtype=torch.double)
            state = state.reshape(1, -1, 3)
            action = self.model(state.float()).max(1)[1].view(1, 1)
            print(f"action: {self.model(state.float())}")
            reward, next_state, done = game.update_shot(action)
            order.append(action)
            total_reward += reward

            # Move to Next State
            state = next_state

        # Diagnostics
        top_score, rewards = game.determine_top_score()
        print(f"Reward {total_reward.item()} / {top_score}")
        print(f"Order Taken: {order}")
        print(f"All Rewards: {rewards}")

    def evaluate_accuracy(self):
        print("Starting Evaluation")
        total_right = 0
        total_top3 = 0
        for i in range(0, 1000):
            game = Game(self.h, self.w)
            state = game.state
            done = False
            total_reward = 0
            order = []
            while not done:
                state = torch.tensor(state, dtype=torch.double)
                state = state.reshape(1, -1, 3)
                action = self.model(state.float()).max(1)[1].view(1, 1)
                reward, next_state, done = game.update_shot(action)
                order.append(action)
                total_reward += reward

                # Move to Next State
                state = next_state

            # Diagnostics
            top_score, rewards = game.determine_top_score()
            average_reward = sum(rewards.values()) / 6
            if total_reward == top_score:
                total_right += 1
            if total_reward >= average_reward:
                total_top3 += 1
                
        print(f"Accuracy: {total_right/10}%")
        print(f"Top 3 Accuracy: {total_top3/10}%")
    

## Train Model

In [854]:
model = AimlabTrainer(480, 640)
model.train()
torch.save(model.model.state_dict(), model_path + "balloon_popping_model_2.pt")

Starting Episode 0
Starting Episode 2500
Starting Episode 5000
Starting Episode 7500


In [855]:
model.evaluate_accuracy()

Starting Evaluation
Accuracy: 14.0%
Top 3 Accuracy: 49.8%


In [866]:
model.evaluate_once()

Starting Evaluation
Game State: [[0.9479166666666666, 0.1, 1.0], [0.40208333333333335, 0.4578125, 0.8081645445599602], [0.85625, 0.6546875, 0.9128481388447398]]
action: tensor([[143.6201, 101.3737, 151.2311]], grad_fn=<ReshapeAliasBackward0>)
action: tensor([[143.6201, 101.3737]], grad_fn=<ReshapeAliasBackward0>)
action: tensor([[142.3966]], grad_fn=<ReshapeAliasBackward0>)
Reward 447.23413443926916 / 485.64851721787517
Order Taken: [tensor([[2]]), tensor([[0]]), tensor([[0]])]
All Rewards: {'[0, 1, 2]': 485.64851721787517, '[0, 2, 1]': 463.51799575857285, '[1, 2, 0]': 402.020874384353, '[1, 0, 2]': 423.21542189329574, '[2, 0, 1]': 447.23413443926916, '[2, 1, 0]': 457.1382321376908}
