In [322]:
import gym
import math
import random
import cv2
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [323]:
env = gym.make('ALE/Breakout-v5')
#env = gym.make('ALE/SpaceInvaders-v5')



numAcciones = env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training with:", device)

Training with: cuda


In [324]:
def get_image():
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)

    resize = T.Compose([T.ToPILImage(),
                    #T.Resize(95, interpolation=Image.CUBIC),
                    T.Grayscale(num_output_channels=1),
                    T.ToTensor()])

    screen = resize(screen).unsqueeze(0)
    return screen

In [325]:
class Memory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def save(self, state, action, next_state, reward):
        self.memory.append((state, action, next_state, reward))

    def sample(self, batch_size):
        indices     = np.random.choice(len(self.memory), batch_size, replace=False)
        
        states      = [self.memory[idx][0] for idx in indices]
        actions     = [self.memory[idx][1] for idx in indices]
        next_states = [self.memory[idx][2] for idx in indices]
        rewards     = [self.memory[idx][3] for idx in indices]
        
        return states, actions, rewards, next_states
        #return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [326]:
memory = Memory(10000)
BATCH_SIZE = 128
EPS_START = 1.0
EPS_DECAY = .999985
EPS_MIN = 0.02
epsilon = EPS_START
gamma = 0.99

In [327]:
def optimize_model():

    if len(memory) < BATCH_SIZE:
        return

    batch = memory.sample(BATCH_SIZE)
    estado, accion, recompensa, estado_sig = batch


    print(estado_sig[0])
    try:
        listaNones = estado_sig.index(None)
    except:
        listaNones = []

    tensor_estados      = torch.tensor(estado).to(device)
    tensor_accion       = torch.tensor(accion).to(device)
    tensor_recompensa   = torch.tensor(recompensa).to(device)
    tensor_estado_sig   = torch.tensor(estado_sig).to(device)

    tensor_estado_sig[listaNones] = tensor_estados[listaNones]

    Qvalues = red_politica(tensor_estados).max(1)[0].view(1, 1)
    print(Qvalues)
    Qvalues = red_politica(tensor_estados).gather(1, tensor_accion.unsqueeze(-1)).squeeze(-1)
    print(Qvalues)

    
    QpValues = red_objetivo(tensor_estado_sig).max(1)[0].view(1, 1)
    QpValues[listaNones] = 0.0
    

    valorEsperado = QpValues * gamma + tensor_recompensa

        
    loss = nn.MSELoss() (Qvalues, valorEsperado)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [328]:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=2)
        self.bn3 = nn.BatchNorm2d(32)
        
        """Para calcular correctamente la salida, tenemos que linealizarla, esto depende de las dimensiones
        de las imagenes de entrada y de los parámetros introducidos"""
        def conv2d_size_out(size, kernel_size = 3, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    """Devuelve un vector con el valor de las acciones posibles"""
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

In [329]:
screen = get_image()
_, _, screen_height, screen_width = screen.shape

red_politica = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo.load_state_dict(red_politica.state_dict())
red_objetivo.eval()

optimizer = optim.RMSprop(red_politica.parameters())

In [330]:
def action_selection(state):
    global epsilon
    e = epsilon
    
    if epsilon-EPS_DECAY > EPS_MIN:
        epsilon = epsilon-EPS_DECAY
    else:
        epsilon = epsilon
        
    if random.randint(0, 100)/100 < e:
        return torch.tensor([[random.randrange(numAcciones)]], device=device, dtype=torch.long)
    else:
        with torch.no_grad():
            return  red_politica(state).max(1)[1].view(1, 1)

In [331]:
episodios = 1
for i in range(episodios):
    env.reset()
    screen_1 = get_image()
    screen_2 = get_image()

    estado = screen_2-screen_1

    for j in count():
        accion = action_selection(estado)
        _, recompensa, done, _ = env.step(accion.item())
        recompensa = torch.tensor([recompensa], device=device)

        screen1 = screen_2
        screen_2 = get_image()
        
        if not done:
            sig_estado = screen_2-screen_1
        else:
            sig_estado = None
            break
            
        memory.save(estado, accion, sig_estado, recompensa)

        estado = sig_estado

        optimize_model()

        if done:
            break
        if j == BATCH_SIZE:
            break

    print("Partida {} acabada".format(i))
if i % 10 == 0:
    red_objetivo.load_state_dict(red_politica.state_dict())

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])


ValueError: only one element tensors can be converted to Python scalars

In [None]:
torch.save(red_objetivo.state_dict(), "RedObjetivo.pt")