In [417]:
import gym
import math
import random
import cv2
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [418]:
env = gym.make('ALE/Breakout-v5')
#env = gym.make('ALE/SpaceInvaders-v5')



numAcciones = env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training with:", device)

Training with: cuda


In [419]:
def get_image():
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)

    resize = T.Compose([T.ToPILImage(),
                    #T.Resize(95, interpolation=Image.CUBIC),
                    T.Grayscale(num_output_channels=1),
                    T.ToTensor()])

    screen = resize(screen).unsqueeze(0)
    return screen

In [420]:
class Memory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def save(self, state, action, next_state, reward):
        self.memory.append((state, action, next_state, reward))

    def sample(self, batch_size):
        indices     = np.random.choice(len(self.memory), batch_size, replace=False)
        
        states      = []
        actions     = []
        next_states = []
        rewards      = []


        for idx in indices: 
            states.append(self.memory[idx][0])
            actions.append(self.memory[idx][1])
            next_states.append(self.memory[idx][2])
            rewards.append(self.memory[idx][3])
        
        return states, actions, rewards, next_states
        #return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [421]:
memory = Memory(10000)
BATCH_SIZE = 128
EPS_START = 1.0
EPS_DECAY = .999985
EPS_MIN = 0.02
epsilon = EPS_START
gamma = 0.99

In [422]:
def optimize_model():

    if len(memory) < BATCH_SIZE:
        return

    batch = memory.sample(BATCH_SIZE)
    estado, accion, recompensa, estado_sig = batch

    estado = np.array(estado)
    estado_sig = np.array(estado_sig)

    try:
        listaNones = np.where(estado_sig == None)
    except:
        listaNones = []

    #tensor_estados     = torch.Tensor(estado).to('cuda')
    tensor_accion       = torch.Tensor(accion).to(device)
    tensor_recompensa   = torch.Tensor(recompensa).to(device)
    #tensor_estado_sig  = torch.Tensor(estado_sig).to(device)

    estado_sig[listaNones] = estado[listaNones]

    Qvalues = [red_politica(e).max(1)[0].item() for e in estado]
    
    #Qvalues = [red_politica(e).gather(1, tensor_accion.unsqueeze(-1)).squeeze(-1) for e in estado]
    #print(Qvalues)

    
    QpValues = [red_objetivo(e).max(1)[0].item() for e in estado_sig]
    QpValues = np.array(QpValues)
    QpValues[listaNones] = 0.0


    # Qvalues = torch.Tensor(Qvalues).to(device)
    # QpValues = torch.Tensor(QpValues).to(device)
    torch.Tensor(loquesea, requires_grad = True)

    valorEsperado = QpValues * gamma + tensor_recompensa

    Qvalues.retain_grad()
    valorEsperado.retain_grad()
    criterion = nn.SmoothL1Loss()
    loss = criterion(Qvalues, valorEsperado.unsqueeze(1))
    loss.backward()


    for param in red_politica.parameters():
        param.grad.data.clamp_(-1, 1)

    # loss = nn.MSELoss() 
    # output = loss(Qvalues, valorEsperado)
    # optimizer.zero_grad()
    # output.backward()
    optimizer.step()

In [423]:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=2)
        self.bn3 = nn.BatchNorm2d(32)
        
        """Para calcular correctamente la salida, tenemos que linealizarla, esto depende de las dimensiones
        de las imagenes de entrada y de los parÃ¡metros introducidos"""
        def conv2d_size_out(size, kernel_size = 3, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    """Devuelve un vector con el valor de las acciones posibles"""
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

In [424]:
screen = get_image()
_, _, screen_height, screen_width = screen.shape

red_politica = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo.load_state_dict(red_politica.state_dict())
red_objetivo.eval()

optimizer = optim.RMSprop(red_politica.parameters())

In [425]:
def action_selection(state):
    global epsilon
    e = epsilon
    
    if epsilon-EPS_DECAY > EPS_MIN:
        epsilon = epsilon-EPS_DECAY
    else:
        epsilon = epsilon
        
    if random.randint(0, 100)/100 < e:
        return random.randrange(numAcciones)
    else:
        with torch.no_grad():
            return  red_politica(state).max(1)[1]

In [426]:
episodios = 1
for i in range(episodios):
    env.reset()
    screen_1 = get_image()
    screen_2 = get_image()

    estado = screen_2-screen_1

    for j in count():
        accion = action_selection(estado)
        estadoPrueba, recompensa, done, _ = env.step(accion)
        #recompensa = torch.tensor([recompensa], device=device)
        
        screen1 = screen_2
        screen_2 = get_image()
        
        if not done:
            sig_estado = screen_2-screen_1
        else:
            sig_estado = None
            break
            
        memory.save(estado, accion, sig_estado, recompensa)

        estado = sig_estado

        optimize_model()

        if done:
            break
        if j == BATCH_SIZE:
            break

    print("Partida {} acabada".format(i))
if i % 10 == 0:
    red_objetivo.load_state_dict(red_politica.state_dict())

  estado = np.array(estado)
  estado = np.array(estado)
  estado_sig = np.array(estado_sig)
  estado_sig = np.array(estado_sig)


RuntimeError: can't retain_grad on Tensor that has requires_grad=False

In [None]:
torch.save(red_objetivo.state_dict(), "RedObjetivo.pt")

In [None]:
a = np.array([None,2,3,54,None, None])

listaNones = np.where( a == None)
a = [None,2,3,54,None, None]
a[listaNones] = 2

print(a)

TypeError: list indices must be integers or slices, not tuple