In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym

In [2]:
class DeepQNetwork(nn.Module):
    """Convolutional Q-network mapping 185x95 single-channel frames to 6 action values.

    The network owns its optimizer (RMSprop) and loss (MSE) and moves itself
    to the first CUDA device when one is available, otherwise to the CPU.

    Args:
        alpha: learning rate passed to the RMSprop optimizer.
    """

    def __init__(self, alpha):
        super(DeepQNetwork, self).__init__()
        self.conv1 = nn.Conv2d(1,  32, 8, stride=4, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 128, 3)
        # Spatial size through the conv stack: 185x95 -> 45x23 -> 21x10 -> 19x8.
        # The attribute name `fcl` is kept as-is so existing state_dict keys stay valid.
        self.fcl = nn.Linear(128*19*8, 512)
        self.fc2 = nn.Linear(512, 6)

        self.optimizer = optim.RMSprop(self.parameters(), lr=alpha)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, observation):
        """Compute Q-values for one frame or a batch/sequence of frames.

        Args:
            observation: a tensor, array, or list of arrays whose elements can be
                reshaped to (-1, 1, 185, 95).

        Returns:
            Tensor of shape (N, 6) with one row of action values per frame.
        """
        if not isinstance(observation, T.Tensor):
            # Stack lists of frames through numpy first; this is faster than
            # building a tensor from a Python list of arrays and fixes the dtype.
            observation = T.as_tensor(np.asarray(observation, dtype=np.float32))
        observation = observation.to(self.device, dtype=T.float32)
        observation = observation.view(-1, 1, 185, 95)
        observation = F.relu(self.conv1(observation))
        observation = F.relu(self.conv2(observation))
        observation = F.relu(self.conv3(observation))
        observation = observation.view(-1, 128*19*8)
        # Fixed: the layer must be accessed through `self` (was a bare `fcl`, a NameError).
        observation = F.relu(self.fcl(observation))
        actions = self.fc2(observation)
        return actions

In [3]:
class Agent(object):
    """Epsilon-greedy DQN agent with a list-backed replay memory.

    Args:
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial exploration probability.
        alpha: learning rate forwarded to both Q-networks.
        max_memory_size: capacity of the replay memory (oldest entries are overwritten).
        eps_end: lower bound for epsilon.
        replace: number of learn steps between target-network syncs, or None to
            never copy the evaluation weights into the target network.
        action_space: list of selectable actions; defaults to [0, 1, 2, 3, 4, 5].
    """

    def __init__(self, gamma, epsilon, alpha, max_memory_size, eps_end=0.05, replace=10000, action_space=None):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_end = eps_end
        # Avoid a shared mutable default argument; build a fresh list per agent.
        self.action_space = [0, 1, 2, 3, 4, 5] if action_space is None else action_space
        self.mem_size = max_memory_size
        self.steps = 0
        self.learn_step_counter = 0
        self.memory = []
        self.mem_cntr = 0
        self.replace_target_cnt = replace
        self.q_eval = DeepQNetwork(alpha)
        self.q_next = DeepQNetwork(alpha)

    def store_transition(self, state, action, reward, state_):
        """Append a transition, overwriting the oldest slot once the memory is full."""
        if self.mem_cntr < self.mem_size:
            self.memory.append([state, action, reward, state_])
        else:
            self.memory[self.mem_cntr % self.mem_size] = [state, action, reward, state_]
        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily and count the step.

        Args:
            observation: one frame or a list of frames accepted by the network.

        Returns:
            The greedy action index (int) or a random element of `action_space`.
        """
        rand = np.random.random()
        if rand < 1 - self.epsilon:
            # No gradients are needed for action selection.
            with T.no_grad():
                actions = self.q_eval.forward(observation)
            # Use the Q-values of the most recent frame; this also works when
            # only a single frame is supplied (index 1 would be out of range).
            action = T.argmax(actions[-1]).item()
        else:
            action = np.random.choice(self.action_space)
        self.steps += 1
        return action

    def learn(self, batch_size):
        """Run one gradient step on a contiguous slice of the replay memory.

        Does nothing when the memory is empty; uses a smaller batch when fewer
        than `batch_size` transitions are stored.
        """
        stored = len(self.memory)
        if stored == 0:
            return
        batch_size = min(batch_size, stored)

        if self.replace_target_cnt is not None and self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

        # Bound the start index by what is actually stored, not by mem_cntr,
        # which keeps growing after the buffer starts wrapping around.
        mem_start = int(np.random.randint(0, stored - batch_size + 1))
        mini_batch = self.memory[mem_start:mem_start + batch_size]
        states, actions, rewards, states_ = zip(*mini_batch)

        device = self.q_eval.device
        self.q_eval.optimizer.zero_grad()

        q_pred = self.q_eval.forward(list(states))
        with T.no_grad():
            # Target values must not contribute gradients.
            q_next = self.q_next.forward(list(states_))

        rewards = T.as_tensor(np.asarray(rewards, dtype=np.float32)).to(device)
        actions = T.as_tensor(np.asarray(actions, dtype=np.int64)).to(device)
        batch_index = T.arange(batch_size, device=device)

        # Detached copy: only the entry of the action actually taken is replaced
        # by the TD target, so the other actions contribute zero error.
        q_target = q_pred.detach().clone()
        q_target[batch_index, actions] = rewards + self.gamma * q_next.max(dim=1)[0]

        if self.steps > 500:
            if self.epsilon - 1e-4 > self.eps_end:
                self.epsilon -= 1e-4
            else:
                self.epsilon = self.eps_end

        loss = self.q_eval.loss(q_pred, q_target)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        

In [None]:
def preprocess(frame):
    """Crop rows 15:200 and columns 30:125 of a frame and average its colour channels."""
    return np.mean(frame[15:200, 30:125], axis=2)


env = gym.make('SpaceInvaders-v0')
brain = Agent(gamma=0.95, epsilon=1.0, alpha=0.03, max_memory_size=5000, replace=None)

# Fill the replay memory with random-policy transitions before training starts.
while brain.mem_cntr < brain.mem_size:
    observation = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        observation_, reward, done, info = env.step(action)
        if done and info['ale.lives'] == 0:
            reward = -100
        brain.store_transition(preprocess(observation), action, reward,
                               preprocess(observation_))
        observation = observation_
print('done initializing memory')

scores = []
eps_history = []
num_games = 50
batch_size = 32

for i in range(num_games):
    print('starting game ', i + 1, 'epsilon: %.4f' % brain.epsilon)
    eps_history.append(brain.epsilon)
    done = False
    observation = env.reset()
    # Use the same mean-based preprocessing as every other frame (was np.sum).
    frames = [preprocess(observation)]
    score = 0
    last_action = 0
    while not done:
        # A new action is chosen once three frames are collected; otherwise
        # the previous action is repeated.
        if len(frames) == 3:
            action = brain.choose_action(frames)
            frames = []
        else:
            action = last_action

        # Everything below must run once per environment step, inside the
        # episode loop; otherwise `done` never changes and the loop never ends.
        observation_, reward, done, info = env.step(action)
        score += reward
        frames.append(preprocess(observation_))
        if done and info['ale.lives'] == 0:
            reward = -100
        brain.store_transition(preprocess(observation), action, reward,
                               preprocess(observation_))
        observation = observation_
        brain.learn(batch_size)
        last_action = action
        env.render()
    # Record the score once per game.
    scores.append(score)
    print('score: ', score)
x = [i + 1 for i in range(num_games)]
file_name = 'test' + str(num_games) + '.png'

    

    Found GPU0 GeForce GTX 650 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    


done initializing memory
starting game  1 epsilon: 1.0000
