In [1]:
import gym
import random
import numpy as np
from collections import namedtuple, deque
from itertools import count
from PIL import Image
import logging
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import pickle as p

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
env = gym.make('ALE/Breakout-v5', render_mode="human")

numAcciones = env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training with:", device)

Training with: cuda


In [3]:
######################################################################
#------------------- Get image from enviromet ------------------------
######################################################################

def get_image():
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)

    resize = T.Compose([T.ToPILImage(),
                    #T.Resize(95, interpolation=Image.CUBIC),
                    T.Grayscale(num_output_channels=1),
                    T.ToTensor()])

    screen = resize(screen).unsqueeze(0)
    return screen
######################################################################

In [4]:
#######################################################################################
#--------------------------------------- Memory ---------------------------------------
#######################################################################################


class Memory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
#Renovar memoria
    def save(self, state, action, next_state, reward):
        self.memory.append((state, action, next_state, reward))

    def sample(self, batch_size):
        indices     = np.random.choice(len(self.memory), batch_size, replace=False)
        
        states      = []
        actions     = []
        next_states = []
        rewards     = []


        for idx in indices: 
            states.append(self.memory[idx][0])
            actions.append(self.memory[idx][1])
            next_states.append(self.memory[idx][2])
            rewards.append(self.memory[idx][3])
        
        return states, actions, rewards, next_states
        #return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

#######################################################################################

In [5]:
memory = Memory(10000)
BATCH_SIZE = 32
EPS_START = 0.9
EPS_DECAY = 0.045
EPS_MIN = 0.02
epsilon = EPS_START
gamma = 0.99
it = 1

In [6]:
#######################################################################################
#------------------------------- Memory Optimizer -------------------------------------
#######################################################################################

def optimize_model():

    if len(memory) < BATCH_SIZE:
        return

    batch = memory.sample(BATCH_SIZE)
    estado, accion, recompensa, estado_sig = batch

    estado = np.array(estado)
    estado_sig = np.array(estado_sig)

    try:
        listaNones = np.where(estado_sig == None)
    except:
        listaNones = []

    tensor_accion       = torch.Tensor(accion).to(device)
    tensor_recompensa   = torch.Tensor(recompensa).to(device)

    estado_sig[listaNones] = estado[listaNones]

    Qvalues = [red_politica(e).max(1)[0].item() for e in estado]
    
    QpValues = [red_objetivo(e).max(1)[0].item() for e in estado_sig]
    QpValues = np.array(QpValues)
    QpValues[listaNones] = 0.0


    Qvalues = torch.Tensor(Qvalues).to(device)
    Qvalues.requires_grad_()
    QpValues = torch.Tensor(QpValues).to(device)
    QpValues.requires_grad_()

    valorEsperado = QpValues * gamma + tensor_recompensa

    Qvalues.retain_grad()
    valorEsperado.retain_grad()
    
    

    loss = nn.MSELoss() 
    output = loss(valorEsperado, Qvalues)
    optimizer.zero_grad()
    output.backward()
    optimizer.step()

########################################################################################

In [7]:
################################################################################################################
#----------------------------------------- Estructura de la red ------------------------------------------------
################################################################################################################


class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        # Eliminar las batch
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=2)
        
        """Para calcular correctamente la salida, tenemos que linealizarla, esto depende de las dimensiones
        de las imagenes de entrada y de los parámetros introducidos"""
        def conv2d_size_out(size, kernel_size = 3, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32

        self.head = nn.Linear(linear_input_size, 256)
        self.head = nn.Linear(256, outputs)

    """Devuelve un vector con el valor de las acciones posibles"""
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

#####################################################################################################################



In [8]:
##########################################################################
#-------------------- Inicialización de redes ----------------------------
##########################################################################
screen = get_image()
_, _, screen_height, screen_width = screen.shape

red_politica = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo = DQN(screen_height, screen_width, numAcciones).to(device)
red_objetivo.load_state_dict(red_politica.state_dict())
red_objetivo.eval()
scoreList = []
optimizer = optim.RMSprop(red_politica.parameters())
print("Redes inicializadas:")

# #----------------------- Carga red de fichero ------------------#
# pickle_in = open('listaScore','rb')                       #
# scoreList = p.load(pickle_in)                                   #
# pickle_in.close()                                               #
#                                                                 #

# # pickle_in = open('Memory','rb')                           #
# # memory.memory = p.load(pickle_in)                               #
# # pickle_in.close()                                               #
#                                                                 #
#                                                                 #
# red_politica.load_state_dict(torch.load('RedPolitica.pt'))#
# red_objetivo.load_state_dict(torch.load('RedObjetivo.pt'))#
# #---------------------------------------------------------------#
print("Redes cargadas:")

Redes inicializadas:
Redes cargadas:


In [9]:
############################################################################################
#-------------------------------------- Selector de acciones -------------------------------
############################################################################################
def action_selection(state):
    global epsilon, it
    
    epsilon = epsilon-EPS_DECAY/it
    if epsilon < EPS_MIN:
        epsilon = EPS_MIN
    
    it += 1

    if random.randint(0, 100)/100 < epsilon:
        return random.randrange(numAcciones)
    else:
        with torch.no_grad():
            return  red_politica(state).max(1)[1]

##############################################################################################

In [10]:
############################################################################################
#-------------------------------------- Bucle de entrenamiento -------------------------------
############################################################################################
print("Comienzo del entrenamiento:")

episodios = 10000000
for i in range(episodios):
    env.reset()
    screen_1 = get_image()
    screen_2 = get_image()

    estado = screen_2-screen_1

    # Cambiar la diferencia de las pantallas por un cubo con 4 pantallas juntas

    score = 0
    for j in count():
        accion = action_selection(estado)
        estadoPrueba, recompensa, done, _ = env.step(accion)
        
        score += recompensa
        screen1 = screen_2
        screen_2 = get_image()
        
        if not done:
            sig_estado = screen_2-screen_1
        else:
            sig_estado = None
            recompensa = -1000
            
        memory.save(estado, accion, sig_estado, recompensa)

        estado = sig_estado

        optimize_model()

        if done:
            scoreList.append(score)
            break
    print("Partida {} acabada, recompensa acumulada {}".format(i, score))

    if i % 10 == 0:
        red_objetivo.load_state_dict(red_politica.state_dict())
        torch.save(red_objetivo.state_dict(), "RedObjetivo.pt")
        torch.save(red_politica.state_dict(), "RedPolitica.pt")

        outputFile = open('Memory', 'wb')
        p.dump(memory.memory, outputFile)
        outputFile.close()

        outputFile = open('listaScore', 'wb')
        p.dump(scoreList, outputFile)
        outputFile.close()

Comienzo del entrenamiento:


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x15200 and 256x4)