In [1]:
import argparse
import gym
from gym import wrappers
import os.path as osp
import random
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers

import dqn
from dqn_utils import *
from atari_wrappers import *

%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)


In [7]:
#a = -2 % 5
#print(a)

3


In [12]:
#a = [3, 8]
#print(*a)

3 8


In [2]:
def atari_model(img_in, num_actions, scope, reuse=False):
    """Build the Q-value network: three ReLU conv layers, flatten, two dense layers.

    Architecture as described in
    https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf

    Args:
        img_in: input fed to the first convolution
            (presumably a batch of stacked frames — TODO confirm against caller).
        num_actions: number of units in the final, activation-free layer.
        scope: name of the outer variable scope that holds all layers.
        reuse: forwarded to ``tf.variable_scope`` for the outer scope.

    Returns:
        The output of the final fully connected layer.
    """
    # (num_outputs, kernel_size, stride) of each convolution, in order
    # (original architecture).
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    with tf.variable_scope(scope, reuse=reuse):
        with tf.variable_scope("convnet"):
            features = img_in
            for n_filters, kernel, step in conv_specs:
                features = layers.convolution2d(
                    features,
                    num_outputs=n_filters,
                    kernel_size=kernel,
                    stride=step,
                    activation_fn=tf.nn.relu,
                )

        flat = layers.flatten(features)

        with tf.variable_scope("action_value"):
            hidden = layers.fully_connected(flat, num_outputs=512, activation_fn=tf.nn.relu)
            q_values = layers.fully_connected(hidden, num_outputs=num_actions, activation_fn=None)

    return q_values

def atari_learn(env,
                session,
                num_timesteps):
    """Configure schedules and the optimizer, then hand training off to ``dqn.learn``.

    The environment is always closed before this function returns or raises,
    so an interrupted run (e.g. ``KeyboardInterrupt`` in the notebook) does not
    leave the wrapped environment open.

    Args:
        env: environment to train on; must have a wrapper that
            ``get_wrapper_by_name(env, "Monitor")`` can find, exposing
            ``get_total_steps()``.
        session: TensorFlow session forwarded to ``dqn.learn``.
        num_timesteps: step budget; training stops once the Monitor wrapper
            reports at least this many total steps.
    """
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
                                         (0,                   1e-4 * lr_multiplier),
                                         (num_iterations / 10, 1e-4 * lr_multiplier),
                                         (num_iterations / 2,  5e-5 * lr_multiplier),
                                    ],
                                    outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # NOTE(review): the middle breakpoint is fixed at 1e6 while the last one
    # scales with num_timesteps; for num_timesteps below 8e6 the breakpoints
    # are no longer increasing — confirm PiecewiseSchedule tolerates that.
    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    try:
        dqn.learn(
            env,
            q_func=atari_model,
            optimizer_spec=optimizer,
            session=session,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10
        )
    finally:
        # Close even when training is interrupted or fails.
        env.close()

def get_available_gpus():
    """Return the ``physical_device_desc`` of every local device whose type is 'GPU'."""
    from tensorflow.python.client import device_lib

    gpu_descriptions = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_descriptions.append(device.physical_device_desc)
    return gpu_descriptions

def set_global_seeds(i):
    """Seed TensorFlow (only if it can be imported), NumPy and Python's ``random`` with ``i``."""
    try:
        import tensorflow as tf
        tf_importable = True
    except ImportError:
        # TensorFlow is optional here; the other generators are still seeded.
        tf_importable = False

    if tf_importable:
        tf.set_random_seed(i)

    np.random.seed(i)
    random.seed(i)

def get_session():
    """Reset the default graph and return a new ``tf.Session`` limited to one thread per pool.

    Also prints the GPU descriptions reported by ``get_available_gpus()``.
    """
    tf.reset_default_graph()

    # One thread for both inter-op and intra-op parallelism.
    thread_limits = dict(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
    )
    new_session = tf.Session(config=tf.ConfigProto(**thread_limits))

    print("AVAILABLE GPUS: ", get_available_gpus())
    return new_session

def get_env(task, seed, expt_dir='/tmp/hw3_vid_dir2/'):
    """Create, seed and wrap the Gym environment for ``task``.

    Args:
        task: object whose ``env_id`` attribute names the Gym environment.
        seed: seed applied to the global RNGs (via ``set_global_seeds``) and
            to the environment itself.
        expt_dir: directory under which the Monitor wrapper writes its output
            (in a ``gym`` subdirectory). Defaults to the previously hard-coded
            location so existing callers are unaffected.

    Returns:
        The environment wrapped first in ``wrappers.Monitor`` and then in
        ``wrap_deepmind``.
    """
    env = gym.make(task.env_id)

    set_global_seeds(seed)
    env.seed(seed)

    # force=True is passed through to the Monitor wrapper; the recorded run
    # logs previous monitor files being cleared because of it.
    monitor_dir = osp.join(expt_dir, "gym")
    env = wrappers.Monitor(env, monitor_dir, force=True)
    env = wrap_deepmind(env)

    return env


In [3]:
def main_Atari():
    """Pick one task from the 'Atari40M' benchmark and train on it.

    The session is created *before* the environment: ``get_session`` resets
    the default TensorFlow graph, and ``get_env`` seeds TensorFlow on the
    current default graph. Seeding first and resetting afterwards would throw
    the TensorFlow seed away together with the old graph.
    """
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    # (In the run recorded in this notebook, index 3 produced PongNoFrameskip-v4.)
    task = benchmark.tasks[3]

    # Run training
    seed = 0 # Use a seed of zero (you may want to randomize the seed!)
    session = get_session()      # resets the default graph, so do this first
    env = get_env(task, seed)    # seeds the graph the session will run
    atari_learn(env, session, num_timesteps=task.max_timesteps)

In [8]:
# Launch the full training run.
# NOTE(review): the recorded output below spans several hours and ends in a
# KeyboardInterrupt — presumably the run was stopped by hand.
main_Atari()

[2018-06-02 14:02:06,080] Making new env: PongNoFrameskip-v4
[2018-06-02 14:02:06,474] Clearing 14 monitor files from previous run (because force=True was provided)


AVAILABLE GPUS:  ['device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:06:00.0, compute capability: 6.1']


[2018-06-02 14:02:06,863] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000000.mp4
[2018-06-02 14:02:10,064] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000001.mp4
[2018-06-02 14:02:21,084] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000008.mp4
[2018-06-02 14:02:44,206] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000027.mp4
[2018-06-02 14:03:58,817] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000064.mp4


Timestep 60000
mean reward (100 episodes) -20.292308
best mean reward -inf
episodes 65
exploration 0.946000
learning_rate 0.000100
Timestep 70000
mean reward (100 episodes) -20.302632
best mean reward -inf
episodes 76
exploration 0.937000
learning_rate 0.000100
Timestep 80000
mean reward (100 episodes) -20.220930
best mean reward -inf
episodes 86
exploration 0.928000
learning_rate 0.000100
Timestep 90000
mean reward (100 episodes) -20.206186
best mean reward -inf
episodes 97
exploration 0.919000
learning_rate 0.000100
Timestep 100000
mean reward (100 episodes) -20.270000
best mean reward -20.200000
episodes 108
exploration 0.910000
learning_rate 0.000100
Timestep 110000
mean reward (100 episodes) -20.260000
best mean reward -20.200000
episodes 120
exploration 0.901000
learning_rate 0.000100


[2018-06-02 14:08:22,368] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000125.mp4


Timestep 120000
mean reward (100 episodes) -20.240000
best mean reward -20.200000
episodes 130
exploration 0.892000
learning_rate 0.000100
Timestep 130000
mean reward (100 episodes) -20.230000
best mean reward -20.160000
episodes 140
exploration 0.883000
learning_rate 0.000100
Timestep 140000
mean reward (100 episodes) -20.250000
best mean reward -20.160000
episodes 152
exploration 0.874000
learning_rate 0.000100
Timestep 150000
mean reward (100 episodes) -20.240000
best mean reward -20.160000
episodes 163
exploration 0.865000
learning_rate 0.000100
Timestep 160000
mean reward (100 episodes) -20.170000
best mean reward -20.160000
episodes 173
exploration 0.856000
learning_rate 0.000100
Timestep 170000
mean reward (100 episodes) -20.270000
best mean reward -20.160000
episodes 184
exploration 0.847000
learning_rate 0.000100
Timestep 180000
mean reward (100 episodes) -20.320000
best mean reward -20.160000
episodes 195
exploration 0.838000
learning_rate 0.000100
Timestep 190000
mean reward

[2018-06-02 14:15:18,063] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000216.mp4


Timestep 200000
mean reward (100 episodes) -20.250000
best mean reward -20.160000
episodes 216
exploration 0.820000
learning_rate 0.000100
Timestep 210000
mean reward (100 episodes) -20.170000
best mean reward -20.160000
episodes 226
exploration 0.811000
learning_rate 0.000100
Timestep 220000
mean reward (100 episodes) -20.250000
best mean reward -20.160000
episodes 237
exploration 0.802000
learning_rate 0.000100
Timestep 230000
mean reward (100 episodes) -20.240000
best mean reward -20.160000
episodes 248
exploration 0.793000
learning_rate 0.000100
Timestep 240000
mean reward (100 episodes) -20.160000
best mean reward -20.160000
episodes 259
exploration 0.784000
learning_rate 0.000100
Timestep 250000
mean reward (100 episodes) -20.230000
best mean reward -20.150000
episodes 270
exploration 0.775000
learning_rate 0.000100
Timestep 260000
mean reward (100 episodes) -20.210000
best mean reward -20.150000
episodes 281
exploration 0.766000
learning_rate 0.000100
Timestep 270000
mean reward

[2018-06-02 14:24:29,461] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000343.mp4


Timestep 320000
mean reward (100 episodes) -20.090000
best mean reward -20.040000
episodes 343
exploration 0.712000
learning_rate 0.000100
Timestep 330000
mean reward (100 episodes) -20.040000
best mean reward -20.020000
episodes 353
exploration 0.703000
learning_rate 0.000100
Timestep 340000
mean reward (100 episodes) -19.940000
best mean reward -19.940000
episodes 363
exploration 0.694000
learning_rate 0.000100
Timestep 350000
mean reward (100 episodes) -19.890000
best mean reward -19.870000
episodes 373
exploration 0.685000
learning_rate 0.000100
Timestep 360000
mean reward (100 episodes) -19.880000
best mean reward -19.870000
episodes 384
exploration 0.676000
learning_rate 0.000100
Timestep 370000
mean reward (100 episodes) -19.870000
best mean reward -19.860000
episodes 394
exploration 0.667000
learning_rate 0.000100
Timestep 380000
mean reward (100 episodes) -19.890000
best mean reward -19.840000
episodes 404
exploration 0.658000
learning_rate 0.000100
Timestep 390000
mean reward

[2018-06-02 14:39:46,967] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000512.mp4


Timestep 500000
mean reward (100 episodes) -19.710000
best mean reward -19.710000
episodes 514
exploration 0.550000
learning_rate 0.000100
Timestep 510000
mean reward (100 episodes) -19.600000
best mean reward -19.600000
episodes 523
exploration 0.541000
learning_rate 0.000100
Timestep 520000
mean reward (100 episodes) -19.550000
best mean reward -19.540000
episodes 531
exploration 0.532000
learning_rate 0.000100
Timestep 530000
mean reward (100 episodes) -19.520000
best mean reward -19.520000
episodes 538
exploration 0.523000
learning_rate 0.000100
Timestep 540000
mean reward (100 episodes) -19.400000
best mean reward -19.390000
episodes 546
exploration 0.514000
learning_rate 0.000100
Timestep 550000
mean reward (100 episodes) -19.310000
best mean reward -19.290000
episodes 553
exploration 0.505000
learning_rate 0.000100
Timestep 560000
mean reward (100 episodes) -19.160000
best mean reward -19.160000
episodes 559
exploration 0.496000
learning_rate 0.000100
Timestep 570000
mean reward

[2018-06-02 15:14:38,308] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video000729.mp4


Timestep 880000
mean reward (100 episodes) -15.090000
best mean reward -15.090000
episodes 731
exploration 0.208000
learning_rate 0.000100
Timestep 890000
mean reward (100 episodes) -14.970000
best mean reward -14.970000
episodes 736
exploration 0.199000
learning_rate 0.000100
Timestep 900000
mean reward (100 episodes) -14.670000
best mean reward -14.670000
episodes 739
exploration 0.190000
learning_rate 0.000100
Timestep 910000
mean reward (100 episodes) -14.500000
best mean reward -14.500000
episodes 744
exploration 0.181000
learning_rate 0.000100
Timestep 920000
mean reward (100 episodes) -14.390000
best mean reward -14.390000
episodes 748
exploration 0.172000
learning_rate 0.000100
Timestep 930000
mean reward (100 episodes) -14.400000
best mean reward -14.370000
episodes 753
exploration 0.163000
learning_rate 0.000100
Timestep 940000
mean reward (100 episodes) -14.340000
best mean reward -14.340000
episodes 757
exploration 0.154000
learning_rate 0.000100
Timestep 950000
mean reward

Timestep 1470000
mean reward (100 episodes) -8.810000
best mean reward -8.810000
episodes 956
exploration 0.089425
learning_rate 0.000094
Timestep 1480000
mean reward (100 episodes) -8.560000
best mean reward -8.560000
episodes 959
exploration 0.089200
learning_rate 0.000094
Timestep 1490000
mean reward (100 episodes) -8.100000
best mean reward -8.100000
episodes 962
exploration 0.088975
learning_rate 0.000094
Timestep 1500000
mean reward (100 episodes) -7.950000
best mean reward -7.950000
episodes 964
exploration 0.088750
learning_rate 0.000094
Timestep 1510000
mean reward (100 episodes) -7.500000
best mean reward -7.500000
episodes 968
exploration 0.088525
learning_rate 0.000094
Timestep 1520000
mean reward (100 episodes) -7.070000
best mean reward -7.070000
episodes 970
exploration 0.088300
learning_rate 0.000094
Timestep 1530000
mean reward (100 episodes) -6.580000
best mean reward -6.580000
episodes 973
exploration 0.088075
learning_rate 0.000093
Timestep 1540000
mean reward (100 

[2018-06-02 16:30:51,417] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video001000.mp4


Timestep 1630000
mean reward (100 episodes) -3.150000
best mean reward -3.150000
episodes 1002
exploration 0.085825
learning_rate 0.000092
Timestep 1640000
mean reward (100 episodes) -3.070000
best mean reward -3.060000
episodes 1004
exploration 0.085600
learning_rate 0.000092
Timestep 1650000
mean reward (100 episodes) -2.680000
best mean reward -2.680000
episodes 1008
exploration 0.085375
learning_rate 0.000092
Timestep 1660000
mean reward (100 episodes) -2.500000
best mean reward -2.500000
episodes 1010
exploration 0.085150
learning_rate 0.000092
Timestep 1670000
mean reward (100 episodes) -1.960000
best mean reward -1.960000
episodes 1013
exploration 0.084925
learning_rate 0.000092
Timestep 1680000
mean reward (100 episodes) -1.600000
best mean reward -1.600000
episodes 1016
exploration 0.084700
learning_rate 0.000092
Timestep 1690000
mean reward (100 episodes) -1.150000
best mean reward -1.150000
episodes 1019
exploration 0.084475
learning_rate 0.000091
Timestep 1700000
mean rewar

Timestep 2230000
mean reward (100 episodes) 16.100000
best mean reward 16.320000
episodes 1241
exploration 0.072325
learning_rate 0.000085
Timestep 2240000
mean reward (100 episodes) 16.140000
best mean reward 16.320000
episodes 1246
exploration 0.072100
learning_rate 0.000085
Timestep 2250000
mean reward (100 episodes) 16.080000
best mean reward 16.320000
episodes 1250
exploration 0.071875
learning_rate 0.000084
Timestep 2260000
mean reward (100 episodes) 16.190000
best mean reward 16.320000
episodes 1255
exploration 0.071650
learning_rate 0.000084
Timestep 2270000
mean reward (100 episodes) 16.200000
best mean reward 16.320000
episodes 1259
exploration 0.071425
learning_rate 0.000084
Timestep 2280000
mean reward (100 episodes) 16.280000
best mean reward 16.320000
episodes 1264
exploration 0.071200
learning_rate 0.000084
Timestep 2290000
mean reward (100 episodes) 16.490000
best mean reward 16.490000
episodes 1270
exploration 0.070975
learning_rate 0.000084
Timestep 2300000
mean rewar

Timestep 2820000
mean reward (100 episodes) 18.280000
best mean reward 18.330000
episodes 1524
exploration 0.059050
learning_rate 0.000077
Timestep 2830000
mean reward (100 episodes) 18.320000
best mean reward 18.340000
episodes 1529
exploration 0.058825
learning_rate 0.000077
Timestep 2840000
mean reward (100 episodes) 18.300000
best mean reward 18.340000
episodes 1533
exploration 0.058600
learning_rate 0.000077
Timestep 2850000
mean reward (100 episodes) 18.340000
best mean reward 18.340000
episodes 1538
exploration 0.058375
learning_rate 0.000077
Timestep 2860000
mean reward (100 episodes) 18.350000
best mean reward 18.360000
episodes 1543
exploration 0.058150
learning_rate 0.000077
Timestep 2870000
mean reward (100 episodes) 18.330000
best mean reward 18.360000
episodes 1547
exploration 0.057925
learning_rate 0.000077
Timestep 2880000
mean reward (100 episodes) 18.220000
best mean reward 18.370000
episodes 1552
exploration 0.057700
learning_rate 0.000077
Timestep 2890000
mean rewar

Timestep 3410000
mean reward (100 episodes) 18.600000
best mean reward 18.660000
episodes 1814
exploration 0.045775
learning_rate 0.000070
Timestep 3420000
mean reward (100 episodes) 18.620000
best mean reward 18.660000
episodes 1819
exploration 0.045550
learning_rate 0.000070
Timestep 3430000
mean reward (100 episodes) 18.560000
best mean reward 18.660000
episodes 1824
exploration 0.045325
learning_rate 0.000070
Timestep 3440000
mean reward (100 episodes) 18.550000
best mean reward 18.660000
episodes 1829
exploration 0.045100
learning_rate 0.000070
Timestep 3450000
mean reward (100 episodes) 18.530000
best mean reward 18.660000
episodes 1834
exploration 0.044875
learning_rate 0.000069
Timestep 3460000
mean reward (100 episodes) 18.670000
best mean reward 18.670000
episodes 1840
exploration 0.044650
learning_rate 0.000069
Timestep 3470000
mean reward (100 episodes) 18.690000
best mean reward 18.690000
episodes 1844
exploration 0.044425
learning_rate 0.000069
Timestep 3480000
mean rewar

[2018-06-02 20:13:26,846] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.4.3028.video002000.mp4


Timestep 3770000
mean reward (100 episodes) 19.430000
best mean reward 19.610000
episodes 2004
exploration 0.037675
learning_rate 0.000065
Timestep 3780000
mean reward (100 episodes) 19.500000
best mean reward 19.610000
episodes 2010
exploration 0.037450
learning_rate 0.000065
Timestep 3790000
mean reward (100 episodes) 19.500000
best mean reward 19.610000
episodes 2015
exploration 0.037225
learning_rate 0.000065
Timestep 3800000
mean reward (100 episodes) 19.530000
best mean reward 19.610000
episodes 2020
exploration 0.037000
learning_rate 0.000065
Timestep 3810000
mean reward (100 episodes) 19.530000
best mean reward 19.610000
episodes 2026
exploration 0.036775
learning_rate 0.000065
Timestep 3820000
mean reward (100 episodes) 19.510000
best mean reward 19.610000
episodes 2031
exploration 0.036550
learning_rate 0.000065
Timestep 3830000
mean reward (100 episodes) 19.490000
best mean reward 19.610000
episodes 2037
exploration 0.036325
learning_rate 0.000065
Timestep 3840000
mean rewar

Timestep 4360000
mean reward (100 episodes) 20.110000
best mean reward 20.170000
episodes 2334
exploration 0.024400
learning_rate 0.000058
Timestep 4370000
mean reward (100 episodes) 20.190000
best mean reward 20.190000
episodes 2340
exploration 0.024175
learning_rate 0.000058
Timestep 4380000
mean reward (100 episodes) 20.240000
best mean reward 20.240000
episodes 2346
exploration 0.023950
learning_rate 0.000058
Timestep 4390000
mean reward (100 episodes) 20.230000
best mean reward 20.270000
episodes 2352
exploration 0.023725
learning_rate 0.000058
Timestep 4400000
mean reward (100 episodes) 20.240000
best mean reward 20.270000
episodes 2357
exploration 0.023500
learning_rate 0.000058
Timestep 4410000
mean reward (100 episodes) 20.230000
best mean reward 20.280000
episodes 2363
exploration 0.023275
learning_rate 0.000057
Timestep 4420000
mean reward (100 episodes) 20.230000
best mean reward 20.280000
episodes 2369
exploration 0.023050
learning_rate 0.000057
Timestep 4430000
mean rewar

Timestep 4950000
mean reward (100 episodes) 20.210000
best mean reward 20.450000
episodes 2678
exploration 0.011125
learning_rate 0.000051
Timestep 4960000
mean reward (100 episodes) 20.250000
best mean reward 20.450000
episodes 2685
exploration 0.010900
learning_rate 0.000051
Timestep 4970000
mean reward (100 episodes) 20.270000
best mean reward 20.450000
episodes 2691
exploration 0.010675
learning_rate 0.000050
Timestep 4980000
mean reward (100 episodes) 20.300000
best mean reward 20.450000
episodes 2697
exploration 0.010450
learning_rate 0.000050
Timestep 4990000
mean reward (100 episodes) 20.380000
best mean reward 20.450000
episodes 2703
exploration 0.010225
learning_rate 0.000050
Timestep 5000000
mean reward (100 episodes) 20.340000
best mean reward 20.450000
episodes 2709
exploration 0.010000
learning_rate 0.000050
Timestep 5010000
mean reward (100 episodes) 20.320000
best mean reward 20.450000
episodes 2715
exploration 0.010000
learning_rate 0.000050
Timestep 5020000
mean rewar

KeyboardInterrupt: 