In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored there can be read from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*
!pip install gym[box2d]==0.17.* 
import pyvirtualdisplay


# Start a non-visible 1400x900 virtual display.
# NOTE(review): presumably needed so that env.render() works on this headless machine - confirm.
_display = pyvirtualdisplay.Display(visible=False,  # use False with Xvfb
                                    size=(1400, 900))
# Binding the return value to `_` keeps it from being echoed as the cell's output.
_ = _display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 40 not upgraded.
Need to get 994 kB of archives.
After this operation, 2,981 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.9 [784 kB]
Fetched 994 kB in 1s (1,386 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 160837 files and directories currently installed.)
Preparing to unpack .../libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxxf86dga1:amd64 (2:

In [3]:
!echo $DISPLAY

:1001


In [7]:
import gym 
import numpy as np 
import pickle
import os 
from datetime import datetime
import gzip
import json

# Gym environment id.
ENV_NAME = 'CarRacing-v0'

# Discretized action space: eight 3-component actions, one per row.
# NOTE(review): components are presumably [steering, gas, brake] - confirm against the environment.
DISC_ACTION_SPACE = np.array([
    [-1., 0., 0.],
    [-1., 0., 0.8],
    [-1., 1., 0.],
    [0., 0., 0.],
    [0., 0., 0.8],
    [0., 1., 0.],
    [1., 0., 0.],
    [1., 1., 0.],
])

In [4]:
import cv2

def _to_gray_scale(rgb, channel_weights=[0.15, 0.8, 0.05]):
    return np.float32(np.array([np.dot(rgb[..., :3], channel_weights)]))

def preprocessing(batch):
    """Crop, gray-scale and rescale every frame in `batch`.

    Each frame is cut to rows 0..83 and columns 6..89, converted with
    `_to_gray_scale` (which adds a leading axis of length 1) and divided by
    255. The processed frames are stacked into one array in their original
    order.
    """
    return np.array([
        _to_gray_scale(batch[idx][0:84, 6:90]) / 255
        for idx in range(batch.shape[0])
    ])

In [5]:
def load_data(directory = "./data", val_split = 0.1):
    """
    Loads the data saved after expert runs.
    Input : directory where data.pkl.gzip is located, val_split (fraction of
            samples, taken from the end of the data, used for validation)
    Output : X_train, y_train, X_val, y_val (training and validation sets with split determined by `val_split`)

    The validation size is int(val_split * num_samples), clamped to
    [0, num_samples]. A validation size of 0 yields empty validation arrays and
    leaves every sample in the training set.
    """
    data_file = os.path.join(directory, 'data.pkl.gzip')

    # SECURITY: pickle can execute arbitrary code while loading; only open
    # data files that come from a trusted source.
    with gzip.open(data_file, 'rb') as file:
        data = pickle.load(file)

    X = np.array(data["state"]).astype('float32')
    y = np.array(data["action"]).astype('float32')

    # split data into training and validation sets
    num_samples = len(data["state"])
    val_len = min(max(int(val_split*num_samples), 0), num_samples)
    # Slice with an explicit boundary: `X[:-val_len]` is empty when val_len == 0.
    split_at = num_samples - val_len
    X_train, y_train = X[:split_at], y[:split_at]
    X_val, y_val = X[split_at:], y[split_at:]
    return X_train, y_train, X_val, y_val

In [8]:
# Load the saved expert data from the mounted Drive folder; the last 5% of the samples become the validation set.
X_train, y_train, X_val, y_val = load_data(directory="/content/drive/MyDrive/RL/Intro-to-RL-2021/ImitationLearning/data", val_split=0.05)

In [9]:
import torch
import torch.nn as nn
import tensorflow as tf
import cv2

def conv2d_size_out(size, kernel_size, stride):
    """Return the output length along one axis of a convolution without padding or dilation."""
    return (size - (kernel_size - 1) - 1) // stride  + 1

class ILAgent(nn.Module):
    """Small convolutional network that scores each discrete action for a frame.

    Architecture: two 3x3 stride-2 convolutions (16 and 24 filters), each
    followed by ReLU, then a 128-unit hidden linear layer with ReLU and a
    final linear layer with one output per action.

    Args:
        state_shape: (channels, height, width) of one input frame. When only
            (channels, height) is given the frame is assumed to be square.
        n_actions: number of outputs of the final layer.
    """
    def __init__(self, state_shape, n_actions):

        super().__init__()
        self.n_actions = n_actions
        self.state_shape = state_shape
        in_channels = state_shape[0]
        height = state_shape[1]
        # Fall back to a square frame when no width is supplied.
        width = state_shape[2] if len(state_shape) > 2 else height

        self.ReLU = nn.ReLU()
        self.flatten = nn.Flatten()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size = 3, stride = 2)
        height = conv2d_size_out(height, 3, 2)
        width = conv2d_size_out(width, 3, 2)
        self.conv2 = nn.Conv2d(16, 24, kernel_size = 3, stride = 2)
        height = conv2d_size_out(height, 3, 2)
        width = conv2d_size_out(width, 3, 2)
        # Size of the flattened output of conv2: filters * height * width.
        self.lin1 = nn.Linear(24*height*width, 128)
        self.lin2 = nn.Linear(128, n_actions)

    def forward(self, state_t):
        """Map a (batch, channels, height, width) tensor to (batch, n_actions) scores."""
        hidden = self.ReLU(self.conv1(state_t))
        hidden = self.ReLU(self.conv2(hidden))
        hidden = self.flatten(hidden)
        hidden = self.ReLU(self.lin1(hidden))
        return self.lin2(hidden)

    def get_action(self, state_t):
        """Return the action scores for a batch of preprocessed frames.

        Args:
            state_t: numpy array or torch tensor of shape
                (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, n_actions) holding the raw outputs of the
            last linear layer (no softmax is applied).
        """
        # as_tensor accepts arrays and tensors alike and avoids a copy when the
        # dtype and device already match the network's parameters.
        weight = self.conv1.weight
        state_t = torch.as_tensor(state_t, dtype=weight.dtype, device=weight.device)
        return self(state_t)

In [10]:
def output_preprocessing(output, action_space=None):
  """One-hot encode continuous actions against a table of discrete actions.

  Args:
    output: array of shape (num_samples, action_dim) holding the actions to
      encode.
    action_space: array of shape (num_actions, action_dim) listing the
      discrete actions; defaults to the module-level DISC_ACTION_SPACE.

  Returns:
    Integer tensor of shape (num_samples, num_actions). Entry [j, i] is 1 when
    output[j] equals action_space[i] and 0 otherwise, so a sample that matches
    no discrete action yields an all-zero row.
  """
  if action_space is None:
    action_space = DISC_ACTION_SPACE
  candidates = np.asarray(action_space)
  actions = np.asarray(output).reshape(-1, candidates.shape[1])

  # Compare with a small tolerance: the labels may be stored as float32 while
  # the table is float64, and values such as 0.8 differ between the two, so an
  # exact comparison would never match them.
  matches = np.isclose(actions[:, None, :], candidates[None, :, :], atol=1e-6).all(axis=2)
  return torch.from_numpy(matches.astype(np.int64))

In [11]:
# One 84x84 single-channel input (the size produced by the crop in `preprocessing`) and one output per discrete action.
agent = ILAgent((1, 84, 84), len(DISC_ACTION_SPACE))

In [12]:
def compute_loss(agent, inputs, outputs):
  """Mean squared error between the agent's outputs for `inputs` and the targets `outputs`.

  The agent is queried through `agent.get_action(inputs)`; the squared
  differences are averaged over every element into a scalar tensor.
  """
  residual = agent.get_action(inputs) - outputs
  return (residual ** 2).mean()

In [13]:
# Adam optimizer over the agent's parameters; `train_naive` reads this module-level `opt`.
opt = torch.optim.Adam(agent.parameters(), lr=1e-3)

In [14]:
def train_naive(agent, num_iterations, X_train, y_train, optimizer=None):
  """Fit `agent` with full-batch gradient steps on the whole training set.

  The frames and labels are preprocessed once; every iteration then computes
  the loss over the entire set and applies one optimizer step.

  Args:
    agent: model exposing `get_action`, as used by `compute_loss`.
    num_iterations: number of optimizer steps to perform.
    X_train: raw frames, passed through `preprocessing`.
    y_train: raw actions, passed through `output_preprocessing`.
    optimizer: optimizer to step; defaults to the module-level `opt`.
  """
  if optimizer is None:
    optimizer = opt
  batch = preprocessing(X_train)
  outputs = output_preprocessing(y_train)
  for _ in range(num_iterations):
    # Clear gradients first so that anything left behind by an interrupted
    # earlier run cannot leak into this update.
    optimizer.zero_grad()
    loss = compute_loss(agent, batch, outputs)
    loss.backward()
    optimizer.step()

In [29]:
# Run 200 full-batch training iterations on the training split.
train_naive(agent, 200, X_train, y_train)

In [16]:
def run_episode(env, agent, rendering=True, max_timesteps=2000):
    """Play one episode with `agent` and return its total reward and length.

    Args:
        env: environment providing `reset`, `step` and `render`; observations
            are reshaped to (1, 96, 96, 3) before preprocessing.
        agent: model whose `get_action` scores the discrete actions.
        rendering: render in 'human' mode when True, 'rgb_array' otherwise.
        max_timesteps: maximum number of environment steps to take.

    Returns:
        (episode_reward, step): the accumulated reward and the number of
        steps that were executed.
    """
    episode_reward = 0
    step = 0
    render_mode = 'human' if rendering else 'rgb_array'

    state = env.reset()
    while True:

        # preprocess
        state = preprocessing(state.reshape(1, 96, 96, 3))

        # get action; no gradients are needed when only acting
        with torch.no_grad():
            a_set = agent.get_action(state)
        a = DISC_ACTION_SPACE[np.argmax(np.array(a_set.detach()).squeeze())]

        next_state, r, done, info = env.step(a)
        episode_reward += r
        state = next_state
        step += 1
        env.render(mode = render_mode)

        # `>=` stops after exactly max_timesteps steps instead of one extra
        if done or step >= max_timesteps:
            break

    return episode_reward, step

In [None]:
env = gym.make('CarRacing-v0').unwrapped

episode_rewards = []
steps = []
try:
    # evaluate the trained agent over 5 rendered episodes
    for i in range(5):
        episode_reward, step = run_episode(env, agent, rendering=True)
        episode_rewards.append(episode_reward)
        steps.append(step)
finally:
    # release the environment even when an episode raises
    env.close()

# summarize the results in a dictionary (nothing is written to disk here)
results = dict()
results["episode_rewards"] = episode_rewards
results["mean"] = np.array(episode_rewards).mean()
results["std"] = np.array(episode_rewards).std()



Track generation: 1241..1563 -> 322-tiles track
Track generation: 1154..1447 -> 293-tiles track
Track generation: 1039..1303 -> 264-tiles track
Track generation: 1065..1345 -> 280-tiles track
Track generation: 984..1243 -> 259-tiles track


In [37]:
import gym.wrappers

# Wrap a fresh CarRacing-v0 environment in a Monitor pointed at the "videos" directory and
# run a single episode with rendering=False (frames are requested in 'rgb_array' mode).
# NOTE(review): presumably this records a video of the episode - confirm against the gym.wrappers.Monitor docs.
with gym.wrappers.Monitor(gym.make('CarRacing-v0'), directory="videos", force=True) as env_monitor:
    sessions = [run_episode(env_monitor, agent, rendering=False, max_timesteps=2000)]



Track generation: 960..1205 -> 245-tiles track
