In [1]:
import env
import numpy as np
import torch
import numpy as np
import matplotlib.pyplot as plt

from dqn_agent import Agent
from collections import deque

In [2]:
# Size handed to both the environment wrapper and the agent as `state_size`.
STATE_SIZE = (84, 84, 1)

# Discrete action index -> command string sent to the environment via
# `environment.step(ACTION_MAP[action])`.
ACTION_MAP = {
    0: "IDLE",
    1: "TURN_LEFT",
    2: "TURN_RIGHT",
    3: "LEFT",
    4: "RIGHT",
    5: "FORWARD",
    6: "BACKWARD",
    7: "SHOOT",
}
# Derived from the map so the two can never disagree when actions are added
# or removed (evaluates to 8 for the table above).
ACTION_SIZE = len(ACTION_MAP)

In [3]:
# Build the environment object from the project-local `env` module, passing
# the executable name and STATE_SIZE. The training loop later calls
# `reset()`, `visual_observations()` and `step()` on this object.
# NOTE(review): presumably this launches or connects to 'bevystein.exe';
# confirm in env.Environment.
environment = env.Environment('bevystein.exe', STATE_SIZE)

In [4]:
# Q-network class handed to the agent below (project-local `model` module).
from model import VisualQNetwork

# Module-level agent used by the training loop (`agent.act`, `agent.step`,
# `agent.qnetwork_local`).
# NOTE(review): `double` / `priority_replay` presumably toggle Double DQN and
# prioritized experience replay; confirm against dqn_agent.Agent.
agent = Agent(
    state_size=STATE_SIZE, 
    action_size=ACTION_SIZE, 
    seed=0, 
    double=True, 
    priority_replay=False,
    q_network=VisualQNetwork,
)

In [5]:
def augment_state(frames, actions):
    """Stack the three buffered frames into a single state array.

    Params
    ======
        frames (sequence): at least three equally-shaped frames, oldest first;
            only the first three are used
        actions (sequence): recent action history. Accepted so existing callers
            keep working, but not encoded into the returned state.

    Returns
    =======
        state (np.ndarray): the three frames stacked along a new leading axis,
            i.e. shape ``(3, *frame_shape)``

    Raises
    ======
        IndexError: if ``frames`` holds fewer than three entries
    """
    # No per-action constant planes are built here: nothing consumed them, so
    # they only cost two array allocations per environment step.
    pix_t_minus_1, pix_t, pix_t_plus_1 = frames[0], frames[1], frames[2]

    return np.stack([
        pix_t_minus_1,
        pix_t,
        pix_t_plus_1,
    ])

def dqn(n_episodes=100, max_t=10000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Trains the module-level ``agent`` against the module-level ``environment``
    using epsilon-greedy action selection, printing a running average score.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        scores (list): summed reward of every episode, in the order played
    """
    scores = []                        # list containing scores from each episode
    # Rolling window behind the printed average. Its length is a fixed number
    # of episodes and is deliberately independent of max_t, which limits
    # timesteps within one episode.
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    for i_episode in range(1, n_episodes+1):

        environment.reset()

        # get initial 3-frame observation: the first frame repeated three times,
        # with an action history of two 0s (0 maps to "IDLE" in ACTION_MAP)
        state = np.squeeze(environment.visual_observations())
        framebuffer = deque([state] * 3, maxlen=3)
        action_buffer = deque([0, 0], maxlen=2)

        state = augment_state(list(framebuffer), list(action_buffer))

        score = 0
        for t in range(max_t):
            action = agent.act(state, eps).astype(int)
            action_buffer.append(action)
            # the environment is driven by command strings, not action indices
            (result, screen) = environment.step(ACTION_MAP[action])
            framebuffer.append(np.squeeze(screen))
            next_state = augment_state(list(framebuffer), list(action_buffer))
            reward = result['reward']
            done = result['is_terminated']
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        # '\r' with end="" rewrites the same console line every episode
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            # checkpoint the local Q-network weights and keep this progress line
            torch.save(agent.qnetwork_local.state_dict(), "visual_q_network.pth")
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))

    return scores

In [6]:
def train_dqn(n_episodes):
    """Train for `n_episodes` episodes via `dqn` and plot the score of each one.

    Params
    ======
        n_episodes (int): number of training episodes forwarded to `dqn`
    """
    episode_scores = dqn(n_episodes=n_episodes)

    # one figure with a single subplot; the pyplot calls below draw into it
    figure = plt.figure()
    axes = figure.add_subplot(111)

    episode_index = np.arange(len(episode_scores))
    plt.plot(episode_index, episode_scores)
    plt.xlabel('Episode #')
    plt.ylabel('Score')
    plt.show()
    

In [7]:
# Run training for 20 episodes and plot the per-episode scores.
train_dqn(n_episodes=20)

Episode 37	Average Score: 2.97

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))