In [None]:
import sys

# Expose the site-packages directories of the `rl_env` environment to this
# kernel. Both the python3.8 and the python3.9 layouts are appended, in that
# order, at the end of the module search path.
sys.path.extend(
    [
        "/home/ubuntu/mlab/days/w3d3/rl_env/lib/python3.8/site-packages",
        "/home/ubuntu/mlab/days/w3d3/rl_env/lib/python3.9/site-packages",
    ]
)

In [None]:
import gym
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
from IPython.display import Video
from video_recorder import VideoRecorder
import torch as t
from time import ctime
from datetime import datetime
import rl_tests
from collections import deque

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without a usable CUDA device.
device = "cuda" if t.cuda.is_available() else "cpu"

In [None]:
def make_choice(
    env: gym.Env, eps: float, net: t.nn.Module, obs: t.Tensor, device: str
):
    """Select an action epsilon-greedily.

    A uniform sample in [0, 1) is drawn first. If it is below `eps`, a
    uniformly random action index in `[0, env.action_space.n)` is returned.
    Otherwise `net(obs)` is evaluated without gradient tracking and the index
    of its largest entry is returned.

    Args:
        env: environment; only `env.action_space.n` is read.
        eps: exploration probability.
        net: callable mapping `obs` to a tensor of action values.
        obs: observation, handed to `net` unchanged.
        device: accepted for interface compatibility; not used here.

    Returns:
        The chosen action as a plain Python int.
    """
    explore_draw = t.rand(1).item()
    n_actions = env.action_space.n

    # Exploit: greedy action with respect to the network's current estimates.
    if not (explore_draw < eps):
        with t.no_grad():
            return t.argmax(net(obs)).item()

    # Explore: uniformly random action.
    return t.randint(high=n_actions, size=(1,)).item()

In [None]:
def evaluate(model, env_name, eps: float, device: str, save_video: bool, show_video: bool, verbose: bool, make_atari_wrapper=True):
    """Play one episode with an epsilon-greedy policy and return its total reward.

    A fresh environment is created for the episode and is always closed again
    before returning, even if the episode raises. The whole roll-out runs with
    gradient tracking disabled.

    Args:
        model: action-value network, queried through `make_choice`.
        env_name: environment id handed to `gym.make`.
        eps: exploration probability used for every action.
        device: forwarded to `make_choice`.
        save_video: when True, a frame is captured before every step and the
            recording is written to `videos/env_<timestamp>.mp4`.
        show_video: when True, `show_state(env)` is called before every step.
            NOTE(review): `show_state` is not defined in the visible part of
            this notebook; it must already exist in the kernel.
        verbose: when True, print the step count, the total reward and (if
            recording) the video path.
        make_atari_wrapper: when True, wrap the environment in `AtariWrapper`.

    Returns:
        Sum of the rewards collected during the episode.
    """
    import os  # local import: only needed to prepare the video output directory

    env = gym.make(env_name)
    if make_atari_wrapper:
        env = AtariWrapper(env)

    recorder = None
    video_string = None
    try:
        if save_video:
            stamp = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
            video_string = f"videos/env_{stamp}.mp4"
            # Make sure the target directory exists before the recorder writes to it.
            os.makedirs("videos", exist_ok=True)
            recorder = VideoRecorder(env, video_string, enabled=save_video)

        total_reward = 0
        states = 0  # number of environment steps taken in this episode

        with t.no_grad():
            state = env.reset()
            done = False

            while not done:
                states += 1

                if recorder is not None:
                    recorder.capture_frame()
                if show_video:
                    show_state(env)

                # take an epsilon-greedy action
                action = make_choice(env, eps, model, state, device)
                state, reward, done, _ = env.step(action)

                total_reward += reward

        if verbose:
            print(states)
            print(f"total reward: {total_reward}")
            if recorder is not None:
                print(f"Saving video as {video_string}")
        return total_reward
    finally:
        # Release the recorder and the environment on every exit path; this
        # function is called repeatedly during training, so leaked
        # environments would otherwise pile up.
        if recorder is not None:
            recorder.close()
        env.close()

In [None]:
def train_model(model, env_name, epsilon, eps_end, eps_end_idx, eval_eps=0.05, add_atari_wrapper=True):
    """Train `model` with Q-learning on transitions replayed from a buffer.

    Each iteration takes one epsilon-greedy step in the environment, stores
    the transition, and - every `train_freq` steps - does one Adam update on
    the summed squared TD error of a minibatch sampled uniformly (with
    replacement) from the buffer. The bootstrap target uses `model` itself;
    there is no separate target network. Every `eval_interval` steps one
    evaluation episode is played and its reward printed.

    Reads these notebook-level globals at call time: `num_steps`,
    `eval_interval`, `adam_lr`, `batch_size`, `train_freq`, `gamma`,
    `exp_buffer_size` and `device`.

    Args:
        model: action-value network; trained in place.
        env_name: environment id handed to `gym.make`.
        epsilon: initial exploration probability.
        eps_end: exploration probability reached after `eps_end_idx` steps.
        eps_end_idx: number of steps over which epsilon is changed linearly
            from `epsilon` to `eps_end`. If it is not positive, epsilon is set
            to `eps_end` immediately.
        eval_eps: exploration probability used for evaluation episodes.
        add_atari_wrapper: when True, wrap the training and evaluation
            environments in `AtariWrapper`.

    Returns:
        The same `model` object, after training.
    """
    env = gym.make(env_name)
    if add_atari_wrapper:
        env = AtariWrapper(env)

    # A bounded deque drops the oldest transition automatically once full.
    replay_buffer = deque(maxlen=exp_buffer_size)
    optimizer = t.optim.Adam(model.parameters(), lr=adam_lr)
    model.train()

    if eps_end_idx > 0:
        d_eps = (eps_end - epsilon) / eps_end_idx
    else:
        # No annealing window: start at the final value instead of dividing by zero.
        epsilon = eps_end
        d_eps = 0.0

    try:
        obs_new = env.reset()

        for step in range(num_steps):
            if step % eval_interval == 0:
                print(f"Logging hundredth step: {step}")

                total_reward = evaluate(
                    model,
                    env_name,
                    eval_eps,
                    device,
                    save_video=False,
                    show_video=False,
                    verbose=False,
                    make_atari_wrapper=add_atari_wrapper,
                )
                print(f"Evaluating: total reward {total_reward}")

            obs = obs_new

            # take an epsilon-greedy action
            action = make_choice(env, epsilon, model, obs, device)
            obs_new, reward, done, _ = env.step(action)

            # store transition in replay buffer
            replay_buffer.append((obs, action, reward, done, obs_new))

            if step % train_freq == 0:
                # Sample the minibatch only on steps that actually optimise.
                sample_indices = t.randint(
                    high=len(replay_buffer), size=(batch_size,)
                ).tolist()

                optimizer.zero_grad()
                loss = 0.0
                for i in sample_indices:
                    sample_obs, sample_action, sample_reward, sample_done, sample_obs_new = replay_buffer[i]

                    # TD target: reward, plus the discounted best next value
                    # unless the transition ended the episode.
                    y = sample_reward
                    if not sample_done:
                        with t.no_grad():
                            y += gamma * t.max(model(sample_obs_new))

                    # model(...) returns one row per observation; take the only row.
                    q_vals = model(sample_obs)[0]
                    loss += (q_vals[sample_action] - y) ** 2

                loss.backward()
                optimizer.step()

            if done:
                obs_new = env.reset()
            if step < eps_end_idx:
                epsilon += d_eps
    finally:
        # Always release the training environment, including on interruption.
        env.close()

    return model

# Environment id used in this notebook: BreakoutNoFrameskip-v0

In [None]:
from einops import rearrange

class CNN(t.nn.Module):
    """Convolutional action-value network for channels-last image observations.

    Three convolution + ReLU stages are followed by a flatten and a single
    linear layer that emits one value per action.

    Args:
        obs_n_channels: number of channels in an observation (its last axis).
        n_action_space: number of output values (one per action).
        device: stored on the instance as `self.device` for backwards
            compatibility. Inputs are placed on whatever device the weights
            currently live on, so the module still has to be moved with
            `.to(...)` by the caller.
        obs_size: `(height, width)` of an observation, used to size the linear
            layer. The default of `(84, 84)` yields the 3136 input features
            that were previously hard-coded.
    """

    # (kernel, stride) of the three convolutions, in order.
    _CONV_SPECS = ((8, 4), (4, 2), (3, 1))

    def __init__(self, obs_n_channels, n_action_space, device, obs_size=(84, 84)):
        super().__init__()
        self.device = device
        height, width = obs_size
        n_flat = 64 * self._conv_out(height) * self._conv_out(width)
        self.model = t.nn.Sequential(
            t.nn.Conv2d(obs_n_channels, 32, 8, stride=4),
            t.nn.ReLU(),
            t.nn.Conv2d(32, 64, 4, stride=2),
            t.nn.ReLU(),
            t.nn.Conv2d(64, 64, 3, stride=1),
            t.nn.ReLU(),
            t.nn.Flatten(),
            t.nn.Linear(n_flat, n_action_space),
        )

    @classmethod
    def _conv_out(cls, size):
        """Return the spatial extent left after the three unpadded convolutions."""
        for kernel, stride in cls._CONV_SPECS:
            size = (size - kernel) // stride + 1
        return size

    def forward(self, x):
        """Compute action values.

        Args:
            x: one observation of shape `(H, W, C)` or a batch of shape
                `(N, H, W, C)`; anything `torch.as_tensor` accepts.

        Returns:
            Tensor of shape `(N, n_action_space)`, with `N == 1` for a single
            observation.

        Raises:
            ValueError: if `x` is neither 3- nor 4-dimensional.
        """
        weights_device = next(self.parameters()).device
        # as_tensor avoids the copy (and warning) t.tensor incurs on tensor input.
        obs = t.as_tensor(x, dtype=t.float32, device=weights_device)
        if obs.dim() == 3:
            obs = obs.unsqueeze(0)
        elif obs.dim() != 4:
            raise ValueError(
                f"expected an (H, W, C) or (N, H, W, C) observation, got shape {tuple(obs.shape)}"
            )
        # Conv2d wants channels first: (N, H, W, C) -> (N, C, H, W).
        return self.model(obs.permute(0, 3, 1, 2))

In [None]:
from days.atari_wrappers import AtariWrapper

# --- Environment ------------------------------------------------------------
env_name = "BreakoutNoFrameskip-v0"

# --- Training schedule ------------------------------------------------------
num_steps = 10**6        # iterations of the training loop in train_model
train_freq = 4           # optimise whenever step % train_freq == 0
eval_interval = 100      # play an evaluation episode whenever step % eval_interval == 0

# --- Optimisation -----------------------------------------------------------
adam_lr = 3e-5           # learning rate handed to Adam
gamma = 0.99             # discount applied to the bootstrapped next-state value
batch_size = 32          # transitions sampled per optimisation step
exp_buffer_size = 10**5  # replay-buffer capacity

# --- Exploration ------------------------------------------------------------
eps_start = 1            # initial epsilon
eps_end = 0.01           # final epsilon
eps_end_idx = num_steps  # step at which the epsilon schedule ends

In [115]:
# Create an environment only to read the shapes the network needs;
# train_model builds its own environment. The id comes from `env_name` so the
# network is always sized for the environment that is actually trained on.
env = gym.make(env_name)
env = AtariWrapper(env)
state = env.reset()
# Channel count = last axis of an observation; one output per discrete action.
cnn = CNN(state.shape[-1], env.action_space.n, device=device).to(device)

In [None]:
# Run the training loop. The length of the epsilon schedule comes from its own
# config constant (`eps_end_idx`) so that changing it there takes effect here.
cnn = train_model(cnn, env_name, epsilon=eps_start, eps_end=eps_end, eps_end_idx=eps_end_idx)

Logging hundredth step: 0
Evaluating: total reward 2.0
Logging hundredth step: 100
Evaluating: total reward 2.0
Logging hundredth step: 200
Evaluating: total reward 0.0
Logging hundredth step: 300
Evaluating: total reward 0.0
Logging hundredth step: 400
Evaluating: total reward 0.0
Logging hundredth step: 500
Evaluating: total reward 0.0
Logging hundredth step: 600
Evaluating: total reward 1.0
Logging hundredth step: 700
Evaluating: total reward 1.0
Logging hundredth step: 800
Evaluating: total reward 2.0
Logging hundredth step: 900
Evaluating: total reward 0.0
Logging hundredth step: 1000
Evaluating: total reward 2.0
Logging hundredth step: 1100
Evaluating: total reward 0.0
Logging hundredth step: 1200
Evaluating: total reward 0.0
Logging hundredth step: 1300
Evaluating: total reward 0.0
Logging hundredth step: 1400
Evaluating: total reward 0.0
Logging hundredth step: 1500
Evaluating: total reward 0.0
Logging hundredth step: 1600
Evaluating: total reward 2.0
Logging hundredth step: 17

In [None]:
# Roll out one episode with uniformly random actions, rendering every step.
# NOTE(review): `show_state` is not defined in the visible part of this
# notebook; it must already exist in the kernel for this cell to run.
env = gym.make(env_name)
state = env.reset()
done = False
states = 0        # number of steps taken; must start at 0 on a fresh kernel
total_reward = 0  # sum of rewards collected in this episode

while not done:
    states += 1
    show_state(env)
    state, reward, done, _ = env.step(env.action_space.sample())  # take a random action
    total_reward += reward