<a href="https://colab.research.google.com/github/razvancraciun/space-invaders-ai/blob/master/space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Run mode: TRAIN=True runs the training loop; otherwise a single rendered
# test episode is played with epsilon forced to 0.
TRAIN = True
# When LOAD is True the checkpoint numbered LOAD_INDEX is loaded before
# running, epsilon starts at EPSILON_MIN and episode numbering resumes at
# LOAD_INDEX + 1.
LOAD = False
LOAD_INDEX = 0


# learning params
EPISODES = 3000  # number of training episodes to run
BATCH_SIZE = 64  # transitions sampled per training step; original note: "<! 256" (presumably "keep below 256" -- TODO confirm)
LEARNING_RATE = 0.001  # passed to the Adam optimizer of each Model
SAVE_PATH = '/content/drive/My Drive/save10/'  # prefix for checkpoint and log files
SAVE_INTERVAL = 50  # checkpoint + score log are written every SAVE_INTERVAL episodes

# agent params
GAMMA = 0.99  # discount factor applied to the next-state value in the target
EPSILON = 1  # initial probability of picking a random action
EPSILON_MIN = 0.03  # floor for epsilon
EPSILON_DEC = 0.9995  # epsilon is multiplied by this after every training step
REPLACE_INTERVAL = 1000  # training steps between copies of eval weights into the target model

FRAME_STACK_SIZE = 3  # frames kept in the stack; also the model's input channel count
SKIP_FRAMES = 3  # an action is chosen, and a transition stored, every SKIP_FRAMES env steps
BUFFER_SIZE = 50000  # replay buffer capacity in transitions

## Imports

In [0]:
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
import gym
import matplotlib.pyplot as plt
from IPython import display

## Buffer

In [0]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions held in preallocated tensors.

    Frame stacks are kept as float16, actions as long, rewards as int16 and
    the terminal mask as int8. Once ``size`` transitions have been written,
    new ones overwrite the oldest slots.
    """

    def __init__(self, size, state_shape, n_actions):
        """Allocate storage for ``size`` transitions of shape ``state_shape``.

        ``n_actions`` is accepted for interface compatibility; the buffer does
        not use it.
        """
        self.size = size
        self.count = 0

        frames_shape = (size,) + tuple(state_shape)
        self.from_stack = torch.zeros(frames_shape, dtype=torch.float16)
        self.to_stack = torch.zeros(frames_shape, dtype=torch.float16)
        self.actions = torch.zeros(size, dtype=torch.long)
        self.rewards = torch.zeros(size, dtype=torch.int16)
        self.terminals = torch.zeros(size, dtype=torch.int8)

    def store(self, from_stack, action, reward, to_stack, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.count % self.size

        self.from_stack[slot] = torch.Tensor(from_stack)
        self.to_stack[slot] = torch.Tensor(to_stack)
        self.actions[slot] = torch.Tensor([action])
        self.rewards[slot] = reward
        # Stored as a "not done" mask: 0 for a terminal transition, 1 otherwise.
        self.terminals[slot] = int(not done)
        self.count += 1

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Only slots that have been written are eligible. The result is the
        tuple ``(from_states, actions, rewards, to_states, terminals)``.
        """
        filled = self.count if self.count < self.size else self.size
        picks = np.random.choice(filled, batch_size)

        return (
            self.from_stack[picks],
            self.actions[picks],
            self.rewards[picks],
            self.to_stack[picks],
            self.terminals[picks],
        )

## Model

In [0]:
class Model(nn.Module):
    """Convolutional Q-network: two conv stages followed by four dense stages.

    The flatten size ``64*7*7`` matches 84x84 inputs (84 -> 20 -> 7 through
    the two convolutions). The network owns its Adam optimizer and MSE loss
    as ``self.optimizer`` / ``self.loss``.

    Args:
        input_channels: number of channels of the input image stack.
        output_shape: number of outputs (one Q-value per action).
        learning_rate: learning rate handed to Adam.
    """

    def __init__(self, input_channels, output_shape, learning_rate):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=8, stride=2, padding=0),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )

        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
        )

        self.fc2 = nn.Sequential(
            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
        )

        self.fc3 = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
        )

        # The head is a plain linear layer. The outputs are regressed with MSE
        # onto `reward + gamma * max Q` targets, which are unbounded; a Softmax
        # here would squash them into [0, 1] and make those targets
        # unreachable. Kept inside a Sequential so the parameter names
        # (`fc4.0.weight`, `fc4.0.bias`) stay the same.
        self.fc4 = nn.Sequential(
            nn.Linear(128, output_shape),
        )

        self.optimizer = torch.optim.Adam(self.parameters(), learning_rate)
        self.loss = nn.MSELoss()

    def forward(self, x):
        """Return one value per action for each stacked-frame input in ``x``.

        ``x`` is cast to float32 first, so half-precision inputs are accepted.
        """
        y = self.conv1(x.float())
        y = self.conv2(y)
        y = self.fc1(y)
        y = self.fc2(y)
        y = self.fc3(y)
        y = self.fc4(y)
        # Debug aid: dump the output if training has diverged to NaN.
        if torch.isnan(y).any():
            print(y)
        return y


## Frame handling

In [0]:
def add_frame(frame):
    """Preprocess ``frame`` and slide it into the global ``stack``.

    The oldest entry is dropped from the front and the new one is appended,
    so the stack length is unchanged.
    """
    processed = preprocess(frame)
    del stack[0]
    stack.append(processed)


def init_stack(frame):
    """Append FRAME_STACK_SIZE references to the preprocessed ``frame`` to the global ``stack``."""
    processed = preprocess(frame)
    stack.extend([processed] * FRAME_STACK_SIZE)


def preprocess(state):
    """Turn a raw colour frame into a cropped, resized, rescaled grayscale image.

    Steps: weighted sum of the first three channels of the last axis, crop
    20 rows from the top / 14 from the bottom and 15 columns from each side,
    resize to 84x84 through PIL, then divide by 255.

    Assumes ``state`` is a height x width x channels array -- TODO confirm
    against the environment's observation format.
    """
    red, green, blue = (state[:, :, channel] for channel in range(3))
    gray = 0.3 * red + 0.59 * green + 0.11 * blue
    cropped = gray[20:-14, 15:-15]
    resized = Image.fromarray(cropped).resize((84, 84))
    scaled = np.array(resized)
    scaled /= 255
    return scaled

## Agent

In [0]:
class Agent:
    """Epsilon-greedy DQN agent with an online network, a target network and a replay buffer.

    Hyper-parameters are read from the module-level constants (EPSILON,
    EPSILON_MIN, EPSILON_DEC, BATCH_SIZE, GAMMA, FRAME_STACK_SIZE,
    LEARNING_RATE, BUFFER_SIZE, REPLACE_INTERVAL, SAVE_PATH).
    """

    def __init__(self, n_actions):
        """Build both networks and the replay buffer for ``n_actions`` actions."""
        self.epsilon = EPSILON
        self.epsilon_min = EPSILON_MIN
        self.epsilon_dec = EPSILON_DEC
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.action_space = range(n_actions)
        print('Allocating model...')
        self.eval_model = Model(FRAME_STACK_SIZE, n_actions, LEARNING_RATE)
        self.target_model = Model(FRAME_STACK_SIZE, n_actions, LEARNING_RATE)
        print('Done')
        print('Allocating buffer...')
        self.buffer = ReplayBuffer(BUFFER_SIZE, (FRAME_STACK_SIZE, 84, 84), n_actions)
        print('Done')
        self.device = default_device()
        self.eval_model = self.eval_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        # Number of training steps performed; drives the target-network sync.
        self.train_index = 0

    def choose_action(self, stack):
        """Pick an action for the current frame ``stack`` epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise the action with the largest output of
        ``eval_model``. The caller is responsible for putting ``eval_model``
        in eval mode (its BatchNorm1d layers reject a batch of one in
        training mode).
        """
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space)

        # Build the tensor only on the greedy path, and go through one
        # contiguous ndarray instead of converting a list of arrays.
        frames = torch.as_tensor(np.asarray(stack), dtype=torch.float32)
        with torch.no_grad():  # inference only, no graph needed
            q_values = self.eval_model(frames.unsqueeze(0).to(self.device))
        return int(q_values.argmax(dim=1).item())

    def train(self):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Afterwards epsilon is decayed towards ``epsilon_min``.
        """
        if self.buffer.count < self.batch_size:
            return
        self.train_index += 1
        self.replace_models(self.train_index)

        from_states, actions, rewards, to_states, terminals = self.buffer.sample(self.batch_size)

        from_states = from_states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        to_states = to_states.to(self.device)
        terminals = terminals.to(self.device)

        self.eval_model.optimizer.zero_grad()
        q_pred = self.eval_model(from_states)

        # The regression target is a constant for this step: build it without
        # autograd so no gradients are accumulated in the target network.
        # NOTE(review): target_model is never switched to eval() in this file,
        # so its BatchNorm layers use batch statistics here -- confirm intended.
        with torch.no_grad():
            max_next, _ = self.target_model(to_states).max(dim=1)
            q_target = q_pred.detach().clone()
            batch_index = torch.arange(self.batch_size, device=self.device)
            # `terminals` is 0 for terminal transitions, which removes the
            # bootstrap term; only the taken action's entry is replaced, so
            # the other entries contribute zero loss.
            q_target[batch_index, actions] = (
                rewards.float() + self.gamma * max_next * terminals.float()
            )

        cost = self.eval_model.loss(q_pred, q_target)
        cost.backward()
        self.eval_model.optimizer.step()

        # Multiplicative decay, clamped so epsilon never drops below the floor.
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

    def replace_models(self, index):
        """Copy the online weights into the target network every REPLACE_INTERVAL steps."""
        if index % REPLACE_INTERVAL == 0 and index != 0:
            self.target_model.load_state_dict(self.eval_model.state_dict())

    def save_model(self, index):
        """Save both networks (whole module objects) under SAVE_PATH, tagged with ``index``."""
        torch.save(self.eval_model, SAVE_PATH + f'eval_model{index}.pt')
        torch.save(self.target_model, SAVE_PATH + f'target_model{index}.pt')

    def load_model(self, index):
        """Load both networks saved by ``save_model`` with the same ``index``.

        The online network is left in eval mode afterwards.
        """
        # NOTE: these files hold pickled module objects; torch.load unpickles
        # them, so only load checkpoints from a trusted source.
        self.eval_model = torch.load(SAVE_PATH + f'eval_model{index}.pt', map_location=default_device())
        self.target_model = torch.load(SAVE_PATH + f'target_model{index}.pt', map_location=default_device())
        self.eval_model.eval()


def default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


   

## Utils

In [0]:
def log(scores, index):
    """Write ``str(scores)`` to ``log{index}.txt`` under SAVE_PATH, replacing any existing file."""
    # The context manager closes the file even if the write raises.
    with open(SAVE_PATH + f'log{index}.txt', 'w') as f:
        f.write(str(scores))

## Main

In [0]:
# Entry point: build the environment and agent, optionally resume from a
# checkpoint, then either train or play one rendered test episode.
env = gym.make('SpaceInvaders-v0')
agent = Agent(env.action_space.n)
if LOAD:
    print(f'Loading checkpoint with index {LOAD_INDEX}')
    agent.load_model(LOAD_INDEX)
    agent.epsilon = EPSILON_MIN
start = LOAD_INDEX + 1 if LOAD else 0

if TRAIN:
    print('Training...')
    scores = []
    for episode in range(start, EPISODES+start):
        # `stack` is the module-level frame stack used by init_stack/add_frame.
        stack = []
        init_stack(env.reset())
        done = False
        score = 0
        step = 0
        added_reward = 0  # reward accumulated since the last stored transition
        while not done:
            # A new action is chosen every SKIP_FRAMES steps and repeated in between.
            if step % SKIP_FRAMES == 0:
                # Inference runs on a batch of one, so BatchNorm must be in eval mode.
                agent.eval_model.eval()
                action = agent.choose_action(stack)

            new_state, reward, done, info = env.step(action)
            step += 1
            score += reward  # the reported score uses the raw environment reward
            if done and info['ale.lives'] == 0:
                # Penalty used for learning when the episode ends with no lives left.
                reward = -1000
            added_reward += reward

            # Also store on `done`: otherwise an episode ending between two
            # SKIP_FRAMES boundaries would drop its terminal transition (and
            # the penalty above) from the replay buffer.
            if step % SKIP_FRAMES == 0 or done:
                old_stack = np.copy(stack)
                add_frame(new_state)
                agent.buffer.store(old_stack, action, added_reward, np.asarray(stack), done)
                added_reward = 0
                agent.eval_model.train()
                agent.train()

        print(f'episode:{episode} score:{score}')
        scores.append(score)
        if episode % SAVE_INTERVAL == 0 and episode != 0:
            agent.save_model(episode)
            log(scores, episode)
            scores = []
else:
    print('Testing...')
    agent.epsilon = 0
    # BatchNorm1d rejects a batch of one in training mode, so force eval mode
    # even when no checkpoint was loaded.
    agent.eval_model.eval()
    stack = []
    init_stack(env.reset())
    done = False
    score = 0
    step = 0
    img = plt.imshow(env.render(mode='rgb_array'))
    while not done:
        img.set_data(env.render(mode='rgb_array'))
        display.display(plt.gcf())
        display.clear_output(wait=True)
        # Same cadence as training: decide every SKIP_FRAMES steps.
        if step % SKIP_FRAMES == 0:
            action = agent.choose_action(stack)
        new_state, reward, done, info = env.step(action)
        step += 1
        score += reward
        # Advance the frame stack so the agent sees the current screen
        # instead of the initial frame for the whole episode.
        if step % SKIP_FRAMES == 0:
            add_frame(new_state)
    print(f'Done! Score:{score}')



Allocating model...
Done
Allocating buffer...
Done
Training...
episode:0 score:20.0
episode:1 score:20.0
episode:2 score:175.0
episode:3 score:105.0
episode:4 score:95.0
episode:5 score:170.0
episode:6 score:55.0


In [0]:
# env = gym.make('SpaceInvaders-v0')
# done = False
# state = env.reset()
# while not done:
#     action = 4
#     new_state, reward, done, info = env.step(action)
#     preprocess(state)
#     img = plt.imshow(env.render(mode='rgb_array'))
#     img.set_data(env.render(mode='rgb_array'))
#     display.display(plt.gcf())
#     # display.clear_output(wait=True)
#     state = new_state

In [0]:
# Mount Google Drive at /content/drive. SAVE_PATH points inside this mount
# point, so run this cell before the training cell writes checkpoints or logs.
from google.colab import drive
drive.mount('/content/drive')