In [1]:
import pickle
import torch
import torch.nn as nn
import numpy as np
from collections import namedtuple
import matplotlib
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# initialize numpy random seed
np.random.seed(57)
random_gen = np.random.default_rng()

In [3]:
# Set up transition and ReplayMemory classes
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Gets a minibatch of tuples
        return random_gen.choice(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
# Load dataset and shuffle
raw_data = pickle.load(open('dataset.p', 'rb'))

dataset = []
for episode in raw_data:
    for exp in raw_data[episode]['experiences']:
        dataset.append(exp)

random_gen.shuffle(dataset)
print('Dataset has {} experiences'.format(len(dataset)))

Dataset has 60381 experiences


In [5]:
# Push entire dataset into ReplayMemory
memory = ReplayMemory(len(dataset))

for exp in dataset:
    state, action, nextstate, reward = exp
    memory.push(state, action, nextstate, reward)

# Test sample method
print(memory.sample(2))

[[array([[ 0.,  0., 16.,  4.],
       [ 0.,  0.,  8.,  4.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  2.,  0.,  2.]])
  'Down'
  array([[ 0.,  0.,  0.,  0.],
       [ 0.,  2.,  0.,  0.],
       [ 0.,  0., 16.,  8.],
       [ 0.,  2.,  8.,  2.]])
  8]
 [array([[  2.,   4.,  16.,   2.],
       [ 64., 128.,   8.,   4.],
       [  4.,  32.,  64.,   4.],
       [  2.,  16.,   8.,  16.]])
  'Down'
  array([[  2.,   4.,  16.,   2.],
       [ 64., 128.,   8.,   2.],
       [  4.,  32.,  64.,   8.],
       [  2.,  16.,   8.,  16.]])
  8]]


In [6]:
# Define network

class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=4, padding=1, stride=1)
        self.bn1 = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=2, padding=1, stride=1)
        self.bn2 = nn.BatchNorm2d(16)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size, padding=1, stride = 1):
            return (size  +2*padding - (kernel_size - 1) - 1) // stride  + 1
        
        convw = conv2d_size_out(conv2d_size_out(w, 4), 2)
        convh = conv2d_size_out(conv2d_size_out(h, 4), 2)
        linear_input_size = convw * convh * 16
        print(convw)
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        return self.head(x.view(x.size(0), -1))

In [8]:
'''
    Takes a state or batch of states and converts
    them into a pytorch tensor format. Shapes should
    be (batch_size x 1 x 4 x 4) or (1 x 1 x 4 x 4)
'''
def process_state(state):
    state = torch.Tensor(state)
    if (state.shape == torch.Size([4,4])):
        desired_shape = (1, 1, *state.shape)
        state = state.view(*desired_shape)
    else:
        desired_shape = (state.shape[0], 1, state.shape[1], state.shape[2])
        state = state.view(*desired_shape)
    return state

In [7]:
network = DQN(4, 4, 4)
batch = memory.sample(1)
results = network.forward(process_state(batch[0][0]))
print(results)

4
tensor([[-0.2626,  0.4970, -0.7538, -0.3850]], grad_fn=<AddmmBackward>)


In [None]:
'''
Initialize target network and policy network parameters
Training Loop:
for num_train_iterations
    for k epochs:
        sample random minibatch of experiences
        forward pass using policy network
        calculate loss and backward pass
        update policy network parameters
    update target network
'''

In [None]:
state = dataset[1]['experiences'][0][0]
res = process_state(state)
print(res.shape)
print(res)

In [None]:
batch_states = np.array([exp[0] for exp in dataset[1]['experiences'][0:5]])
print(batch_states.shape)

In [None]:
res_states = process_state(batch_states)
print(res_states.shape)

In [None]:
print(res_states)