In [1]:
from baselines.common import set_global_seeds
from baselines.common.misc_util import boolean_flag
from baselines.common.schedules import LinearSchedule

import argparse
import scipy.misc
import os, datetime, time, re
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import gym
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline
from copy import deepcopy
from agents.models_pytorch import dqn_model, qmap_model
from agents.q_map_dqn_agent_pytorch import Q_Map_DQN_Agent

from agents.replay_buffers import DoublePrioritizedReplayBuffer
from envs.custom_mario import CustomSuperMarioAllStarsEnv
from envs.wrappers import PerfLogger
from time import gmtime, strftime

Logging to /tmp/openai-2019-01-07-04-22-34-314811


In [2]:
# Run configuration. The parser is fed an explicit empty argument list, so the
# kernel's own sys.argv is ignored and every option takes the default set here;
# edit the defaults to change a run.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--seed', help='random number generator seed', type=int, default=0)
# The default output directory is timestamped (UTC) when this cell runs.
parser.add_argument('--path', help='directory where results are written',
                    default='pytorch_results/' + strftime("%d_%b_%Y_%H_%M_%S", gmtime()))
parser.add_argument('--level', help='game level', default='1.1')
parser.add_argument('--load', help='steps of the models saved in Models folder', default=None)
# boolean_flag is the baselines helper; presumably it registers paired on/off
# switches for each name -- TODO confirm against baselines.common.misc_util.
boolean_flag(parser, 'dqn', default=True)
boolean_flag(parser, 'qmap', default=True)
boolean_flag(parser, 'render', help='play the videos', default=False)
# parse_args expects a list of strings; [] is the explicit "no arguments" form
# (the previous '' only worked because iterating an empty string yields nothing).
args = parser.parse_args([])

In [3]:
# Fixed emulator/observation settings for the Mario environment; only the level
# comes from the parsed arguments.
mario_env_settings = dict(
    screen_ratio=4,
    coords_ratio=8,
    use_color=False,
    use_rc_frame=False,
    stack=3,
    frame_skip=2,
    action_repeat=4,
)
env = CustomSuperMarioAllStarsEnv(level=args.level, **mario_env_settings)

coords_shape = env.coords_shape

# Seed the global RNGs (via the baselines helper) and the environment with the same value.
set_global_seeds(args.seed)
env.seed(args.seed)

mario_level_1_1.state
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.uint8'>. Please provide explicit dtype.[0m


In [4]:
# Layer specifications for the DQN network. Going by the printed module, each
# conv tuple reads as (out_channels, kernel_size, stride, padding) and
# hidden_params lists the widths of the fully connected hidden layers.
dqn_conv_layers = np.array([(32, 8, 2, 3), (32, 6, 2, 2), (64, 4, 2, 1)])
dqn_hidden_layers = np.array([1024])

mario_dqn = dqn_model(
    observation_space=env.observation_space.shape,
    conv_params=dqn_conv_layers,
    hidden_params=dqn_hidden_layers,
    layer_norm=True,
    activation_fn=F.relu,
    n_actions=env.action_space.n,
)

# Show the resulting architecture.
print(mario_dqn)

dqn_model(
  (conv): ListModule(
    (0): Conv2d(3, 32, kernel_size=(8, 8), stride=(2, 2), padding=(3, 3))
    (1): Conv2d(32, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
  (fc_action): ListModule(
    (0): Linear(in_features=3584, out_features=1024, bias=True)
    (1): Linear(in_features=1024, out_features=6, bias=True)
  )
  (fc_state): ListModule(
    (0): Linear(in_features=3584, out_features=1024, bias=True)
    (1): Linear(in_features=1024, out_features=1, bias=True)
  )
  (normalize_action): ListModule(
    (0): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
  )
  (normalize_state): ListModule(
    (0): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
  )
)


In [None]:
# Layer specifications for the Q-map network. Going by the printed module, conv
# and deconv tuples read as (out_channels, kernel_size, stride, padding); the
# last deconv layer emits one channel per action.
qmap_conv_layers = np.array([(32, 8, 2, 3), (32, 6, 2, 2), (64, 4, 2, 1)])
qmap_hidden_layers = np.array([1024])
qmap_deconv_layers = np.array([(64, 4, 2, 1), (32, 6, 2, 2), (env.action_space.n, 4, 1, 2)])

mario_qmap = qmap_model(
    observation_space=env.observation_space.shape,
    conv_params=qmap_conv_layers,
    hidden_params=qmap_hidden_layers,
    deconv_params=qmap_deconv_layers,
    layer_norm=True,
    activation_fn=F.elu,
)
# Show the resulting architecture.
print(mario_qmap)

qmap_model(
  (conv): ListModule(
    (0): Conv2d(3, 32, kernel_size=(8, 8), stride=(2, 2), padding=(3, 3))
    (1): Conv2d(32, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
  (fc): ListModule(
    (0): Linear(in_features=3584, out_features=1024, bias=True)
    (1): Linear(in_features=1024, out_features=3584, bias=True)
  )
  (normalize): ListModule(
    (0): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
    (1): LayerNorm(torch.Size([3584]), eps=1e-05, elementwise_affine=True)
  )
  (deconv_action): ListModule(
    (0): ConvTranspose2d(64, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (1): ConvTranspose2d(64, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
    (2): ConvTranspose2d(32, 6, kernel_size=(4, 4), stride=(1, 1), padding=(2, 2))
  )
  (deconv_state): ListModule(
    (0): ConvTranspose2d(64, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (

In [None]:
# Pick the compute device once and move both networks onto it.
cuda_available = torch.cuda.is_available()
print("CUDA Available: ",cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")
mario_dqn = mario_dqn.to(device)
mario_qmap = mario_qmap.to(device)

# Total number of environment steps; every schedule below spans this horizon.
n_steps = 5 * 10 ** 6

# Task-level exploration schedule, moving linearly from 1.0 to 0.05.
exploration_schedule = LinearSchedule(schedule_timesteps=n_steps, initial_p=1.0, final_p=0.05)
# Schedule handed to the agent as q_map_random_schedule, moving from 0.1 to 0.05.
q_map_random_schedule = LinearSchedule(schedule_timesteps=n_steps, initial_p=0.1, final_p=0.05)

# Replay buffer with a capacity argument of 500k; the initial_p/final_p pair is
# presumably the importance-sampling exponent schedule -- TODO confirm in
# agents.replay_buffers.
double_replay_buffer = DoublePrioritizedReplayBuffer(
    5 * 10 ** 5,
    alpha=0.6,
    epsilon=1e-6,
    timesteps=n_steps,
    initial_p=0.4,
    final_p=1.0,
)

# Discount factor passed to the agent as task_gamma.
task_gamma = 0.99

CUDA Available:  True


In [None]:
# Keyword arguments are grouped by concern and then handed to the agent in one call.

# Settings shared by the whole agent.
# NOTE(review): renderer_viewer is hard-coded to True while the parsed --render
# flag is never consulted -- confirm whether it should follow args.render.
shared_agent_kwargs = dict(
    n_actions=env.action_space.n,
    coords_shape=env.unwrapped.coords_shape,
    double_replay_buffer=double_replay_buffer,
    task_gamma=task_gamma,
    exploration_schedule=exploration_schedule,
    seed=args.seed,
    path=args.path,
    learning_starts=1000,
    train_freq=4,
    print_freq=1,
    renderer_viewer=True,
)

# DQN
dqn_agent_kwargs = dict(
    dqn_model=mario_dqn,
    dqn_lr=1e-4,
    dqn_optim_iters=1,
    dqn_batch_size=32,
    dqn_target_net_update_freq=1000,
    dqn_grad_norm_clip=1000,
)

# QMAP
q_map_agent_kwargs = dict(
    q_map_model=mario_qmap,
    q_map_random_schedule=q_map_random_schedule,
    q_map_greedy_bias=0.5,
    q_map_timer_bonus=0.5,  # 50% more time than predicted
    q_map_lr=3e-4,
    q_map_gamma=0.9,
    q_map_n_steps=1,
    q_map_batch_size=32,
    q_map_optim_iters=1,
    q_map_target_net_update_freq=1000,
    q_map_min_goal_steps=15,
    q_map_max_goal_steps=30,
    q_map_grad_norm_clip=1000,
)

agent = Q_Map_DQN_Agent(**shared_agent_kwargs, **dqn_agent_kwargs, **q_map_agent_kwargs)

# Optionally restore previously saved models; args.load is passed to agent.load
# together with the results path.
if args.load is not None:
    agent.load(args.path, args.load)

[CSVLogger] logging ['steps', 'planned exploration', 'current exploration', 'random actions', 'goal actions', 'greedy actions'] in pytorch_results/07_Jan_2019_04_22_34/exploration.csv


In [None]:
# Main training loop: step the environment for n_steps + 1 iterations, letting
# the agent pick every action, and hand finished episodes to the agent's renderer.
#
# NOTE: this cell rebinds `env` to a PerfLogger wrapper and closes it at the end,
# so re-run the environment-creation cell before running this one again.
env = PerfLogger(env, agent.task_gamma, agent.path)
done = True  # forces a reset on the first iteration
episode = 0
score = None
best_score = -1e6
best_distance = -1e6
previous_time = time.time()
last_ips_t = 0  # step index at the last IPS report

# try/finally guarantees the environment is closed even when this long-running
# cell is interrupted or raises part-way through.
try:
    for t in range(n_steps+1):

        if done:
            # Bookkeeping for the episode that just ended (skipped before the first one).
            new_best = False
            if episode > 0:
                if score >= best_score:
                    best_score = score
                    new_best = True
                distance = env.unwrapped.full_c
                if distance >= best_distance:
                    best_distance = distance
                    new_best = True

            # Render the first 50 episodes, every 10th one afterwards, and any new best;
            # otherwise discard what the renderer has collected.
            if episode > 0 and (episode < 50 or episode % 10 == 0 or new_best):
                current_time = time.time()
                # Iterations per second since the previous report.
                ips = (t - last_ips_t) / (current_time - previous_time)
                print('step: {} IPS: {:.2f}'.format(t+1, ips))
                name = 'score_%08.3f'%score + '_distance_' + str(distance) + '_steps_' + str(t+1) + '_episode_' + str(episode)
                agent.renderer.render(name)
                previous_time = current_time
                last_ips_t = t
            else:
                agent.renderer.reset()

            # Start the next episode.
            episode += 1
            score = 0

            ob = env.reset()
            ac = agent.reset(ob)

        ob, rew, done, _ = env.step(ac)
        score += rew

        ac = agent.step(ob, rew, done)
finally:
    env.close()

[CSVLogger] logging ['steps', 'undiscounted return', 'discounted return', 'episode length'] in pytorch_results/07_Jan_2019_04_22_34/score.csv
step: 439 IPS: 46.65
[renderer] preparing next video... 0 in the queue
---------------------------------------
| episodes                | 1         |
| exploration (current)   | 100.000 % |
| exploration (target)    | 99.992 %  |
| mean 100 episode reward | 0.000     |
| steps                   | 438       |
---------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0000.000_distance_104_steps_439_episode_1.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0000.000_distance_104_steps_439_episode_1.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0000.000_distance_104_steps_439_episode_1.npy
step: 679 IPS: 30.73
[renderer] preparing next video... 0 in the queue
---------------------------------------
| episodes                | 2        

created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0011.000_distance_857_steps_13197_episode_12.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0011.000_distance_857_steps_13197_episode_12.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0011.000_distance_857_steps_13197_episode_12.npy
step: 15614 IPS: 18.72
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 13       |
| exploration (current)   | 97.949 % |
| exploration (target)    | 99.703 % |
| mean 100 episode reward | 6.269    |
| steps                   | 15613    |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0016.000_distance_638_steps_15614_episode_13.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0016.000_distance_638_steps_15614_episode_13.npy
created coordinates numpy file pytorch_results/0

step: 23092 IPS: 15.50
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 24       |
| exploration (current)   | 95.984 % |
| exploration (target)    | 99.561 % |
| mean 100 episode reward | 4.146    |
| steps                   | 23091    |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0000.000_distance_272_steps_23092_episode_24.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0000.000_distance_272_steps_23092_episode_24.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0000.000_distance_272_steps_23092_episode_24.npy
step: 25494 IPS: 19.72
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 25       |
| exploration (current)   | 99.088 % |
| exploration (target)    | 99.516 % |
| mean 100 episode reward | 4.060    |
| steps           

created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0000.000_distance_266_steps_32028_episode_35.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0000.000_distance_266_steps_32028_episode_35.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0000.000_distance_266_steps_32028_episode_35.npy
step: 32371 IPS: 18.75
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 36       |
| exploration (current)   | 98.546 % |
| exploration (target)    | 99.385 % |
| mean 100 episode reward | 3.708    |
| steps                   | 32370    |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0000.000_distance_144_steps_32371_episode_36.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0000.000_distance_144_steps_32371_episode_36.npy
created coordinates numpy file pytorch_results/0

step: 48242 IPS: 18.28
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 47       |
| exploration (current)   | 97.695 % |
| exploration (target)    | 99.083 % |
| mean 100 episode reward | 3.745    |
| steps                   | 48241    |
--------------------------------------
step: 48385 IPS: 12.61
--------------------------------------
| episodes                | 48       |
| exploration (current)   | 98.126 % |
| exploration (target)    | 99.081 % |
| mean 100 episode reward | 3.667    |
| steps                   | 48384    |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0006.000_distance_864_steps_48242_episode_47.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0006.000_distance_864_steps_48242_episode_47.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0006.000_distance_864_steps_48242_episo

--------------------------------------
| episodes                | 68       |
| exploration (current)   | 98.537 % |
| exploration (target)    | 98.607 % |
| mean 100 episode reward | 4.066    |
| steps                   | 73318    |
--------------------------------------
--------------------------------------
| episodes                | 69       |
| exploration (current)   | 98.180 % |
| exploration (target)    | 98.593 % |
| mean 100 episode reward | 4.022    |
| steps                   | 74069    |
--------------------------------------
step: 74413 IPS: 19.28
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 70       |
| exploration (current)   | 99.293 % |
| exploration (target)    | 98.586 % |
| mean 100 episode reward | 3.993    |
| steps                   | 74412    |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0002.000_distance_286_steps_74413_episode_70

--------------------------------------
| episodes                | 94       |
| exploration (current)   | 97.762 % |
| exploration (target)    | 98.016 % |
| mean 100 episode reward | 3.702    |
| steps                   | 104409   |
--------------------------------------
--------------------------------------
| episodes                | 95       |
| exploration (current)   | 99.299 % |
| exploration (target)    | 98.012 % |
| mean 100 episode reward | 3.663    |
| steps                   | 104645   |
--------------------------------------
--------------------------------------
| episodes                | 96       |
| exploration (current)   | 99.480 % |
| exploration (target)    | 98.009 % |
| mean 100 episode reward | 3.625    |
| steps                   | 104789   |
--------------------------------------
--------------------------------------
| episodes                | 97       |
| exploration (current)   | 98.833 % |
| exploration (target)    | 97.963 % |
| mean 100 episode reward

created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0000.000_distance_720_steps_130494_episode_120.mp4
created visits numpy file pytorch_results/07_Jan_2019_04_22_34/visits/score_0000.000_distance_720_steps_130494_episode_120.npy
created coordinates numpy file pytorch_results/07_Jan_2019_04_22_34/coords/score_0000.000_distance_720_steps_130494_episode_120.npy
--------------------------------------
| episodes                | 121      |
| exploration (current)   | 96.932 % |
| exploration (target)    | 97.475 % |
| mean 100 episode reward | 3.240    |
| steps                   | 132895   |
--------------------------------------
--------------------------------------
| episodes                | 122      |
| exploration (current)   | 98.315 % |
| exploration (target)    | 97.429 % |
| mean 100 episode reward | 3.300    |
| steps                   | 135297   |
--------------------------------------
--------------------------------------
| episodes                | 123      |
| 

--------------------------------------
| episodes                | 145      |
| exploration (current)   | 96.326 % |
| exploration (target)    | 96.898 % |
| mean 100 episode reward | 3.530    |
| steps                   | 163284   |
--------------------------------------
--------------------------------------
| episodes                | 146      |
| exploration (current)   | 97.239 % |
| exploration (target)    | 96.861 % |
| mean 100 episode reward | 3.520    |
| steps                   | 165228   |
--------------------------------------
--------------------------------------
| episodes                | 147      |
| exploration (current)   | 95.662 % |
| exploration (target)    | 96.852 % |
| mean 100 episode reward | 3.500    |
| steps                   | 165696   |
--------------------------------------
--------------------------------------
| episodes                | 148      |
| exploration (current)   | 93.004 % |
| exploration (target)    | 96.849 % |
| mean 100 episode reward

--------------------------------------
| episodes                | 171      |
| exploration (current)   | 97.882 % |
| exploration (target)    | 96.394 % |
| mean 100 episode reward | 3.190    |
| steps                   | 189773   |
--------------------------------------
--------------------------------------
| episodes                | 172      |
| exploration (current)   | 98.222 % |
| exploration (target)    | 96.391 % |
| mean 100 episode reward | 3.140    |
| steps                   | 189958   |
--------------------------------------
--------------------------------------
| episodes                | 173      |
| exploration (current)   | 95.248 % |
| exploration (target)    | 96.390 % |
| mean 100 episode reward | 3.110    |
| steps                   | 190010   |
--------------------------------------
--------------------------------------
| episodes                | 174      |
| exploration (current)   | 89.929 % |
| exploration (target)    | 96.344 % |
| mean 100 episode reward

--------------------------------------
| episodes                | 198      |
| exploration (current)   | 92.248 % |
| exploration (target)    | 95.861 % |
| mean 100 episode reward | 3.590    |
| steps                   | 217868   |
--------------------------------------
--------------------------------------
| episodes                | 199      |
| exploration (current)   | 90.018 % |
| exploration (target)    | 95.857 % |
| mean 100 episode reward | 3.540    |
| steps                   | 218070   |
--------------------------------------
step: 219876 IPS: 19.35
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 200      |
| exploration (current)   | 88.010 % |
| exploration (target)    | 95.822 % |
| mean 100 episode reward | 3.580    |
| steps                   | 219875   |
--------------------------------------
--------------------------------------
| episodes                | 201      |
| exploration (current)   | 9

--------------------------------------
| episodes                | 222      |
| exploration (current)   | 95.179 % |
| exploration (target)    | 95.230 % |
| mean 100 episode reward | 4.470    |
| steps                   | 251077   |
--------------------------------------
--------------------------------------
| episodes                | 223      |
| exploration (current)   | 93.265 % |
| exploration (target)    | 95.184 % |
| mean 100 episode reward | 4.570    |
| steps                   | 253494   |
--------------------------------------
--------------------------------------
| episodes                | 224      |
| exploration (current)   | 98.002 % |
| exploration (target)    | 95.138 % |
| mean 100 episode reward | 4.610    |
| steps                   | 255896   |
--------------------------------------
--------------------------------------
| episodes                | 225      |
| exploration (current)   | 86.630 % |
| exploration (target)    | 95.092 % |
| mean 100 episode reward

--------------------------------------
| episodes                | 249      |
| exploration (current)   | 90.923 % |
| exploration (target)    | 94.503 % |
| mean 100 episode reward | 4.875    |
| steps                   | 289341   |
--------------------------------------
step: 291772 IPS: 19.66
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 250      |
| exploration (current)   | 95.452 % |
| exploration (target)    | 94.456 % |
| mean 100 episode reward | 4.915    |
| steps                   | 291771   |
--------------------------------------
--------------------------------------
| episodes                | 251      |
| exploration (current)   | 90.983 % |
| exploration (target)    | 94.452 % |
| mean 100 episode reward | 4.905    |
| steps                   | 291995   |
--------------------------------------
created video pytorch_results/07_Jan_2019_04_22_34/videos/score_0017.000_distance_842_steps_291772_episode_

--------------------------------------
| episodes                | 275      |
| exploration (current)   | 94.920 % |
| exploration (target)    | 93.849 % |
| mean 100 episode reward | 5.350    |
| steps                   | 323723   |
--------------------------------------
--------------------------------------
| episodes                | 276      |
| exploration (current)   | 95.703 % |
| exploration (target)    | 93.832 % |
| mean 100 episode reward | 5.340    |
| steps                   | 324625   |
--------------------------------------
--------------------------------------
| episodes                | 277      |
| exploration (current)   | 89.784 % |
| exploration (target)    | 93.827 % |
| mean 100 episode reward | 5.340    |
| steps                   | 324907   |
--------------------------------------
--------------------------------------
| episodes                | 278      |
| exploration (current)   | 89.303 % |
| exploration (target)    | 93.826 % |
| mean 100 episode reward

--------------------------------------
| episodes                | 301      |
| exploration (current)   | 85.421 % |
| exploration (target)    | 93.312 % |
| mean 100 episode reward | 5.370    |
| steps                   | 351997   |
--------------------------------------
--------------------------------------
| episodes                | 302      |
| exploration (current)   | 92.985 % |
| exploration (target)    | 93.267 % |
| mean 100 episode reward | 5.360    |
| steps                   | 354381   |
--------------------------------------
--------------------------------------
| episodes                | 303      |
| exploration (current)   | 86.537 % |
| exploration (target)    | 93.263 % |
| mean 100 episode reward | 5.320    |
| steps                   | 354595   |
--------------------------------------
--------------------------------------
| episodes                | 304      |
| exploration (current)   | 90.684 % |
| exploration (target)    | 93.217 % |
| mean 100 episode reward

--------------------------------------
| episodes                | 328      |
| exploration (current)   | 90.784 % |
| exploration (target)    | 92.728 % |
| mean 100 episode reward | 4.795    |
| steps                   | 382756   |
--------------------------------------
--------------------------------------
| episodes                | 329      |
| exploration (current)   | 88.437 % |
| exploration (target)    | 92.723 % |
| mean 100 episode reward | 4.775    |
| steps                   | 383010   |
--------------------------------------
step: 384635 IPS: 19.35
[renderer] preparing next video... 0 in the queue
--------------------------------------
| episodes                | 330      |
| exploration (current)   | 88.589 % |
| exploration (target)    | 92.692 % |
| mean 100 episode reward | 4.965    |
| steps                   | 384634   |
--------------------------------------
--------------------------------------
| episodes                | 331      |
| exploration (current)   | 9

--------------------------------------
| episodes                | 354      |
| exploration (current)   | 84.048 % |
| exploration (target)    | 92.341 % |
| mean 100 episode reward | 4.575    |
| steps                   | 403100   |
--------------------------------------
--------------------------------------
| episodes                | 355      |
| exploration (current)   | 92.239 % |
| exploration (target)    | 92.295 % |
| mean 100 episode reward | 4.710    |
| steps                   | 405517   |
--------------------------------------
--------------------------------------
| episodes                | 356      |
| exploration (current)   | 80.939 % |
| exploration (target)    | 92.249 % |
| mean 100 episode reward | 4.665    |
| steps                   | 407947   |
--------------------------------------
--------------------------------------
| episodes                | 357      |
| exploration (current)   | 95.235 % |
| exploration (target)    | 92.217 % |
| mean 100 episode reward