In [None]:
!pip3 install gym-retro

Collecting gym-retro
[?25l  Downloading https://files.pythonhosted.org/packages/73/6d/2c9f009663b74bcf66a2306c2b8a819a1ac6b0d3090e342720291b527446/gym_retro-0.7.1-cp36-cp36m-manylinux1_x86_64.whl (162.0MB)
[K     |████████████████████████████████| 162.0MB 99kB/s 
Installing collected packages: gym-retro
Successfully installed gym-retro-0.7.1


In [None]:
# Import the game ROM(s) found under /content/sample_data into gym-retro so that
# retro.make() can load them later in the notebook.
!python3 -m retro.import /content/sample_data

Importing FerrariGrandPrixChallenge-Genesis
Imported 1 games


In [None]:
# Mount Google Drive at /content/gdrive; the training cell saves reward history and
# model checkpoints under "/content/gdrive/My Drive/...".
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
import retro
import numpy as np
import matplotlib.pyplot as plt
import sys
import torch
from torch import nn
from torch import optim
import torchvision.models
import torchvision
import gym
import torch.nn.functional as F
from math import floor
import time
from torch.distributions import Categorical


In [None]:
# Seed every random number generator used in this notebook with the same value
# (PyTorch first, then NumPy) so that runs are repeatable.
seed = 10
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(seed)

In [None]:
# Hyperparameters
LEARNING_RATE       = 1e-3 # learning rate passed to the Adam optimizer
GAMMA               = 0.9 # discount factor for future rewards
LAMBDA              = 0.95 # lambda used in advantage estimation
PPO_EPSILON         = 0.2 # to prevent large changes to the policy
CRITIC_DISCOUNT     = 0.5 # reducing the weight of the critic error when computing total network loss
ENTROPY_BETA        = 0.01 # factor to reward exploration over exploitation
MINI_BATCH_SIZE     = 256 # number of transitions sampled for each gradient step
PPO_EPOCHS          = 3 # number of times to update weights using the same data
HIDDEN_LAYERS       = [256, 32] # widths of the two hidden layers in each ActorCritic head

In [None]:
class Discretizer(gym.ActionWrapper):
    """
    Expose a MultiBinary (one flag per button) environment as a Discrete one.

    Each discrete action index maps to one fixed combination of pressed buttons.

    Args:
        env: environment whose action space is ``gym.spaces.MultiBinary``
        combos: ordered list of lists of valid button combinations; the position
            of a combination in this list is its discrete action index
    """

    def __init__(self, env, combos):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiBinary)
        button_names = env.unwrapped.buttons
        n_buttons = env.action_space.n

        # One boolean mask per combination, with True at every pressed button.
        self._decode_discrete_action = []
        for combo in combos:
            pressed = np.zeros(n_buttons, dtype=bool)
            for name in combo:
                pressed[button_names.index(name)] = True
            self._decode_discrete_action.append(pressed)

        self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

    def action(self, act):
        """Translate discrete index ``act`` into a fresh copy of its button mask."""
        mask = self._decode_discrete_action[act]
        return mask.copy()

In [None]:
class ActorCritic(nn.Module):
    """
    Two independent MLP heads on top of pre-extracted image features.

    ``pick_action`` (the actor) outputs one unnormalised score per action and
    ``evaluate`` (the critic) outputs a single state-value estimate.

    Args:
        num_actions: number of discrete actions the actor scores
        hidden: sizes of the two hidden layers shared by both head layouts
    """

    def __init__(self, num_actions, hidden = HIDDEN_LAYERS):
        super().__init__()
        self.input_size = 2048 # for wide resnet
        self.n_outputs = num_actions
        # Actor is built before the critic so parameter initialisation order is stable.
        self.pick_action = self._build_head(self.input_size, hidden, self.n_outputs)
        self.evaluate = self._build_head(self.input_size, hidden, 1)

    @staticmethod
    def _build_head(in_features, hidden, out_features):
        """Return a Linear-ReLU-Linear-ReLU-Linear stack with the given widths."""
        first, second = hidden[0], hidden[1]
        return nn.Sequential(
            nn.Linear(in_features, first),
            nn.ReLU(),
            nn.Linear(first, second),
            nn.ReLU(),
            nn.Linear(second, out_features),
        )

    def forward(self, x):
        """Flatten ``x`` to rows of ``input_size`` features and return (action scores, value)."""
        features = x.view(-1, self.input_size)
        action_scores = self.pick_action(features)
        state_value = self.evaluate(features)
        return action_scores, state_value


In [None]:
# Using the pretrained Wide Resnet 50 model to extract features
resnet = torchvision.models.wide_resnet50_2(pretrained=True)

# Keeping all but the last (fully-connected) layer from the pretrained resnet model
modules = list(resnet.children())[:-1]
resnet = nn.Sequential(*modules)

# freezing the parameters so they don't get changed during training
for p in resnet.parameters():
    p.requires_grad = False

# Freezing parameters is not enough: in training mode the BatchNorm layers normalise
# with the statistics of the current (single-frame) batch and keep updating their
# running averages. eval() makes the extractor use the pretrained statistics, so the
# same frame always maps to the same features.
resnet.eval()

Downloading: "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth" to /root/.cache/torch/checkpoints/wide_resnet50_2-95faca4d.pth


HBox(children=(IntProgress(value=0, max=138223492), HTML(value='')))




In [None]:
def transform_image(image):
    """
    Turn one frame into the feature tensor produced by the frozen resnet.

    Args:
        image: numpy array indexed as (axis0, axis1, channel); the channel axis is
            moved to the front before the frame is passed to the network
            (presumably an H x W x 3 RGB frame from the environment — confirm).

    Returns:
        The output of ``resnet`` for a batch containing this single frame.
    """
    # transforming the image so that it's in the form that the pretrained resnet model expects
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    new_img = image.transpose((2,0,1))

    # Min-max scale to [0, 1]. A constant frame (e.g. a fully black screen) has
    # max == min; dividing by that zero range would fill the features with NaN,
    # so such a frame is mapped to all zeros instead.
    lowest, highest = new_img.min(), new_img.max()
    if highest > lowest:
        new_img = (new_img - lowest) / (highest - lowest)
    else:
        new_img = np.zeros(new_img.shape, dtype=np.float32)

    new_img = torch.Tensor(new_img).to(device)
    new_img = torchvision.transforms.functional.normalize(new_img, mean, std, inplace=True)

    # extracting and returning the features; the extractor is never trained, so
    # no autograd graph is needed for this forward pass
    with torch.no_grad():
        out = resnet(new_img.unsqueeze_(0))
    return out

In [None]:
def normalize(x):
    """
    Return ``x`` shifted to zero mean and scaled by its standard deviation.

    A small constant (1e-8) is added to the standard deviation so that an input
    with zero spread does not divide by zero. The input is left untouched; a new
    object is returned (the previous version modified the caller's data in place).

    Args:
        x: object supporting ``mean()``, ``std()`` and arithmetic, e.g. a torch tensor.

    Returns:
        The standardised copy of ``x``.
    """
    centered = x - x.mean()
    return centered / (x.std() + 1e-8)

In [None]:
# Generalized Advantage Estimation is the state-of-the-art method to determine how "good" the choice of action is
def GAE(next_value, rewards, masks, values, gamma=None, lam=None):
    """
    Compute GAE-based returns (advantage + value) for one rollout.

    Args:
        next_value: value estimate for the state following the last step.
        rewards: per-step rewards.
        masks: per-step multipliers; 0 at a step stops bootstrapping and the
            accumulated advantage from later steps, 1 lets them through.
        values: list of per-step value estimates (not modified).
        gamma: discount factor; defaults to the module-level ``GAMMA``.
        lam: GAE lambda; defaults to the module-level ``LAMBDA``.

    Returns:
        A list with one return per step, in the same order as ``rewards``.
    """
    if gamma is None:
        gamma = GAMMA
    if lam is None:
        lam = LAMBDA

    # New list, so the caller's `values` is left as it was.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        #  formula from the GAE paper
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.append(gae + values[step])

    # Steps were visited last-to-first; appending and reversing once is linear,
    # whereas inserting at index 0 on every step is quadratic in rollout length.
    returns.reverse()
    return returns

In [None]:
def iterate(states, actions, log_probs, returns, advantage):
    """
    Yield random mini-batches drawn from one rollout.

    Indices are drawn uniformly *with replacement*, so a pass is not guaranteed to
    touch every transition. The number of mini-batches is
    ``batch_size // MINI_BATCH_SIZE``, but at least one whenever there is any data:
    previously a rollout shorter than ``MINI_BATCH_SIZE`` produced no batches at
    all and the update was silently skipped.

    Args:
        states, actions, log_probs, returns, advantage: tensors indexed by
            transition along their first dimension.

    Yields:
        Tuples ``(states, actions, log_probs, returns, advantage)`` restricted to
        the same ``MINI_BATCH_SIZE`` sampled indices.
    """
    batch_size = states.size(0)
    if batch_size == 0:
        return

    n_batches = max(1, batch_size // MINI_BATCH_SIZE)
    for _ in range(n_batches):
        rand_ids = np.random.randint(0, batch_size, MINI_BATCH_SIZE)
        yield states[rand_ids, :], actions[rand_ids], log_probs[rand_ids], returns[rand_ids], advantage[rand_ids]

In [None]:
def train(states, actions, log_probs, returns, advantages):
    """
    Run the PPO update on one rollout using the global ``model`` and ``optimizer``.

    Args:
        states: features for every transition (fed to ``model``).
        actions: actions taken during the rollout.
        log_probs: log-probabilities of those actions under the policy that
            generated the rollout.
        returns: target returns for the critic.
        advantages: advantage estimates for the actor.

    All per-transition tensors are flattened to one value per sample before the
    losses are formed. The rollout code stores them with differing trailing
    singleton dimensions (e.g. returns/advantages as (B, 1, 1), values as (B, 1),
    log-probs as (B,) or (B, 1)); combining those directly broadcasts to
    (B, B)-shaped intermediates, so every sample's ratio/value was being paired
    with every other sample's advantage/return.
    """
    # PPO EPOCHS is the number of times we go through all the collected data to make updates
    for _ in range(PPO_EPOCHS):
        # grabs random mini-batches (see iterate) and takes one optimizer step per mini-batch
        for state, action, old_log_probs, return_, advantage in iterate(states, actions, log_probs, returns, advantages):
            act, value = model(state)
            dist = Categorical(F.softmax(act, dim=-1))
            entropy = dist.entropy().mean()

            # One scalar per sample for everything that enters the losses.
            action = action.reshape(-1)
            old_log_probs = old_log_probs.reshape(-1)
            advantage = advantage.reshape(-1)
            return_ = return_.reshape(-1)
            value = value.reshape(-1)

            new_log_probs = dist.log_prob(action)

            prob_ratio = (new_log_probs - old_log_probs).exp()

            # PPO loss for the actor
            clipped_ratio = torch.clamp(prob_ratio, 1.0 - PPO_EPSILON, 1.0 + PPO_EPSILON)
            actor_loss = - torch.min(prob_ratio * advantage, clipped_ratio * advantage).mean()

            #  Regular MSE loss for the critic
            critic_loss = (return_ - value).pow(2).mean()

            # total loss for the network is the critic loss weighted by a decreasing factor + actor loss
            # + entropy weighted by a factor (the entropy is there to encourage exploration - ie. not converging to one prediction too soon)
            loss = CRITIC_DISCOUNT * critic_loss + actor_loss - ENTROPY_BETA * entropy

            # optimize parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


In [None]:
# TODO: try using a CNN instead of the resnet
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print('Device:', device)

env = retro.make(game='FerrariGrandPrixChallenge-Genesis')
# Four discrete actions; the index of each combination is the action id.
env = Discretizer(env, combos=[['A'], ['B'], ['LEFT', 'A'], ['RIGHT', 'A']])
num_outputs = env.action_space.n

model = ActorCritic(num_outputs)
# To resume from a checkpoint, uncomment:
# model.load_state_dict(torch.load("/content/gdrive/My Drive/APS360 Project/PPO Models/PPO_A2C_39_1100225.0_-0.005999565124511719"))
model.to(device)
resnet.to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Uniform random behaviour policy used on every other episode. Built once, on the
# selected device (it used to be hard-coded to "cuda:0", which fails on a CPU-only
# runtime) and sized from the action space instead of a literal 4.
uniform_dist = Categorical(torch.full((num_outputs,), 1.0 / num_outputs, device=device))

best_reward = None
best_advantage = None

# To resume the reward history, uncomment:
# all_rewards = np.load("/content/gdrive/My Drive/APS360 Project/PPO Models/all_rewards.npy")
all_rewards = np.array([])
total_iters = len(all_rewards)

# Runs until the cell is interrupted: one episode of data collection followed by
# one PPO update per pass.
while True:
    state = env.reset()
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    done = False

    i = 0

    while not done:
        state = transform_image(state)

        # Rollout outputs are only used as detached data (train() recomputes the
        # forward pass), so no autograd graph is kept for them.
        with torch.no_grad():
            act, value = model(state)

        # Episodes alternate: odd iteration counts sample from the learned policy,
        # even ones act uniformly at random.
        if total_iters % 2 == 1:
            dist = Categorical(F.softmax(act, dim=-1))
        else:
            dist = uniform_dist

        action = dist.sample()
        log_prob = dist.log_prob(action)
        # The environment's own reward is discarded and replaced by the
        # speed-based reward below.
        next_state, _, done, info = env.step(action.item())

        speed = info['speed']

        # Reward shaping: faster is better, crawling (<= 40) is penalised.
        if speed > 200:
            shaped_reward = 4
        elif speed > 150:
            shaped_reward = 3
        elif speed > 100:
            shaped_reward = 2
        elif speed > 75:
            shaped_reward = 1
        elif speed > 40:
            shaped_reward = 0
        else:
            shaped_reward = -2
        reward = torch.tensor([shaped_reward], dtype=torch.long, device=device)

        states.append(state)
        actions.append(action)
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward.unsqueeze(-1))
        # 1 while the episode continues, 0 on the terminal step
        masks.append(torch.as_tensor(1 - done).unsqueeze(-1).to(device))

        state = next_state

        if i % 2000 == 0:
            print("i: {}, Action: {}, Reward: {}, Speed: {}".format(i, action.item(), reward, speed))

        i += 1

    # Bootstrap value for the state after the final step.
    next_state = transform_image(next_state)
    with torch.no_grad():
        _, next_value = model(next_state)
    returns = GAE(next_value, rewards, masks, values)
    returns = torch.stack(returns).detach()
    log_probs = torch.stack(log_probs).detach()
    values    = torch.stack(values).detach()
    states    = torch.stack(states)
    actions   = torch.stack(actions)
    advantage = returns - values
    advantage = normalize(advantage)

    train(states, actions, log_probs, returns, advantage)

    # Single reductions instead of Python-level sums over thousands of tensors.
    total_rewards = torch.stack(rewards).sum()
    total_adv = advantage.sum()
    total_iters += 1

    print("Iteration: {}, Reward = {}, Advantage = {}".format(total_iters, total_rewards.item(), total_adv.item()))
    print(info)

    all_rewards = np.append(all_rewards, total_rewards.item())
    np.save("/content/gdrive/My Drive/APS360 Project/PPO Models/all_rewards.npy", all_rewards)

    # Checkpoint when either score improves. Each "best" is only ever raised:
    # previously both were overwritten together, so an advantage-only improvement
    # could replace the best reward with a lower one.
    reward_improved = best_reward is None or best_reward < total_rewards
    advantage_improved = best_advantage is None or best_advantage < total_adv
    if reward_improved or advantage_improved:
        if reward_improved:
            best_reward = total_rewards
        if advantage_improved:
            best_advantage = total_adv
        torch.save(model.state_dict(), "/content/gdrive/My Drive/APS360 Project/PPO Models/PPO_A2C_{}_{}_{}".format(total_iters, total_rewards.item(), total_adv.item()))

    

Device: cuda
i: 0, Action: 2, Reward: tensor([-2], device='cuda:0'), Speed: 0
i: 2000, Action: 2, Reward: tensor([1], device='cuda:0'), Speed: 81
i: 4000, Action: 1, Reward: tensor([0], device='cuda:0'), Speed: 67
i: 6000, Action: 2, Reward: tensor([0], device='cuda:0'), Speed: 75
i: 8000, Action: 3, Reward: tensor([1], device='cuda:0'), Speed: 94
i: 10000, Action: 3, Reward: tensor([-2], device='cuda:0'), Speed: 9
i: 12000, Action: 3, Reward: tensor([0], device='cuda:0'), Speed: 41
i: 14000, Action: 0, Reward: tensor([1], device='cuda:0'), Speed: 92
i: 16000, Action: 0, Reward: tensor([1], device='cuda:0'), Speed: 90
i: 18000, Action: 3, Reward: tensor([-2], device='cuda:0'), Speed: 23
i: 20000, Action: 3, Reward: tensor([-2], device='cuda:0'), Speed: 8
i: 22000, Action: 0, Reward: tensor([-2], device='cuda:0'), Speed: 9
i: 24000, Action: 1, Reward: tensor([-2], device='cuda:0'), Speed: 22
Iteration: 1, Reward = -24508, Advantage = 0.006847381591796875
{'best_lap': 15958, 'lap': 4, 's

In [None]:
# Close the environment. The training cell above loops on `while True`, so this
# cell is only reached after that cell has been stopped.
env.close()