In [1]:
from player.player import Player
from environments.simulator import Atari
import numpy as np
import datetime
from utils import HandleResults
import numba


# Identifier of the game to train on. It is passed both to HandleResults below
# and to results_handler.load_default_settings() in a later cell.
# Alternative games are kept commented out so they can be swapped in by hand.
GAME_ENV = 'BreakoutDeterministic-v4'
# GAME_ENV = 'SpaceInvaders-v4' # 758 frames
# GAME_ENV = 'Alien-v4' # 948 frames
# GAME_ENV = 'Amidar-v4' # 812 frames
# GAME_ENV = 'Venture-v4'
# GAME_ENV = 'Assault-v4' # 876 frames
# GAME_ENV = 'RoadRunner-v4' # 437 frames
# GAME_ENV = 'PongDeterministic-v4'
# GAME_ENV = 'Asterix-v4'
# GAME_ENV = 'MontezumaRevenge-v4'
# GAME_ENV = 'ChopperCommand-v4'
# Output folder handed to HandleResults; the commented-out paths are other
# experiment variants (presumably one folder per configuration -- TODO confirm).
# OUT_FOLDER = './output/Punish_0_No_Reward_exploration/'
# OUT_FOLDER = './output/Punish_1_No_Reward_exploration/'
OUT_FOLDER = './output/Punish_1_Reward_exploration_2/'

# Helper used by later cells to load settings and to save settings/results.
results_handler = HandleResults(GAME_ENV, OUT_FOLDER)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
def run_episode(max_episode_length, episode, game_env, player, total_frames, evaluation=False):
    """Play a single episode and return its reward together with the frame counter.

    Each iteration reads the current state from ``game_env``, asks ``player``
    for an action, steps the environment and, unless ``evaluation`` is set,
    forwards the transition to ``player.updates``.

    Args:
        max_episode_length: Step cap for the episode. The cap is checked
            *after* the environment step against the number of steps taken
            before it, so an episode can run for ``max_episode_length + 1``
            steps before being cut off.
        episode: Index of the episode; only forwarded to ``player.updates``.
        game_env: Object providing ``reset()``, ``get_current_state()`` and
            ``step(action)``; ``step`` must return a 5-tuple
            ``(processed_frame, reward, terminal, terminal_life_lost, original_frame)``.
        player: Object providing ``take_action(state, total_frames, evaluation)``
            and ``updates(...)``.
        total_frames: Running frame counter across episodes; incremented once
            per step taken here.
        evaluation: When true, ``player.updates`` is never called.

    Returns:
        Tuple ``(episode_reward, total_frames)`` with the summed reward of the
        episode and the advanced frame counter.
    """
    # reset() is called for its side effect only; its return value is not needed.
    game_env.reset()
    episode_reward = 0
    life_seq = 0  # steps taken since the last life loss (or episode start)
    frame_number = 0

    while True:
        current_state = game_env.get_current_state()
        action = player.take_action(current_state, total_frames, evaluation)
        # The fifth element (the original frame) is not used in this function.
        processed_new_frame, reward, terminal, terminal_life_lost, _ = game_env.step(action)

        # Force termination once the step cap is reached; the forced step is
        # still reported to the player as a life loss.
        if frame_number >= max_episode_length:
            terminal = True
            terminal_life_lost = True

        if not evaluation:
            # life_seq is passed as it stood *before* this step is counted.
            player.updates(total_frames, episode, action, processed_new_frame, reward, terminal_life_lost, life_seq)

        episode_reward += reward
        life_seq = 0 if terminal_life_lost else life_seq + 1

        total_frames += 1
        frame_number += 1

        if terminal:
            break

    return episode_reward, total_frames


In [5]:
# Leave load_folder empty to start from the default settings for GAME_ENV;
# set it to a folder to restore settings via results_handler.load_settings.
load_folder=''
# Forwarded to load_settings together with load_folder.
load_model=False

# Compare by value: `is not ''` tests object identity, which depends on string
# interning and raises a SyntaxWarning on Python 3.8+.
if load_folder != '':
    player, game_env, max_episode_length, max_number_of_episodes, all_settings = \
        results_handler.load_settings(load_folder, load_model)
else:
    player, game_env, max_episode_length, max_number_of_episodes, all_settings = \
        results_handler.load_default_settings(GAME_ENV)

# Echo every setting so the run configuration is visible in the cell output.
for k, v in all_settings.items():
    print(k, ': ', v)

print('****************************')

# Persist the effective settings alongside the results.
results_handler.save_settings(all_settings, player)

The environment has the following 4 actions: ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
norm (Lambda)                   (None, 84, 84, 4)    0           input_5[0][0]                    
__________________________________________________________________________________________________
conv2d_13 (Conv2D)              (None, 20, 20, 32)   8224        norm[0][0]                       
__________________________________________________________________________________________________
conv2d_14 (Conv2D)              (None, 9, 9, 64)     32832       conv2d_13[0][0]                  
______________________________

In [4]:
# Statistics snapshot; its entries are overwritten and saved every 10th episode.
res_dict = {}

# Start below any reachable score so that games whose episode rewards are all
# negative still report their true best episode instead of a spurious 0.
highest_reward = float('-inf')
total_frames = 0.0
prev_frames = 0.0
all_rewards = []
time = datetime.datetime.now()  # training start; used for the elapsed-time entry
prev_time = time
best_evaluation = 0

for episode in range(max_number_of_episodes):
    episode_reward, total_frames = run_episode(max_episode_length, episode, game_env, player, total_frames)

    # all_rewards[episode] = episode_reward
    all_rewards.append(episode_reward)

    if episode_reward > highest_reward:
        highest_reward = episode_reward

    if episode % 10 == 0:
        # evaluation_reward, _ = run_episode(max_episode_length, episode, game_env, player, 0, evaluation=True)

        # if evaluation_reward > best_evaluation:
        #     best_evaluation = evaluation_reward
            # print('Best eval: ', str(best_evaluation))

        now = datetime.datetime.now()

        # np.mean of an empty sequence emits RuntimeWarnings before returning
        # nan; short-circuit to nan directly while no losses are recorded yet.
        recent_losses = player.losses[-10:]
        mean_loss = np.mean(recent_losses) if len(recent_losses) > 0 else float('nan')

        res_dict['time'] = str(now - time)
        res_dict['episode'] = episode
        res_dict['total_frames'] = total_frames
        res_dict['epsilon'] = format(player.epsilon, '.3f')
        res_dict['highest_reward'] = highest_reward
        # res_dict['best_eval'] = best_evaluation
        res_dict['mean_rewards'] = np.mean(all_rewards[-10:])
        res_dict['mean_loss'] = format(mean_loss, '.5f')
        # res_dict['memory_vol'] = player.memory.count
        # res_dict['fps'] = (total_frames - prev_frames) / ((now - prev_time).total_seconds())
        # res_dict['sparsity'] = np.mean(player.memory.sparsity_lengths[-10:])
        res_dict['estimating_reward'] = player.memory.use_estimated_reward
        res_dict['reward_exponent'] = player.memory.reward_extrapolation_exponent

        results_handler.save_res(res_dict)

        # Only consumed by the commented-out fps computation above.
        prev_time = now
        prev_frames = total_frames

  warn("The default mode, 'constant', will be changed to 'reflect' in "


{'time': '0:00:01.563906', 'episode': 0, 'total_frames': 199.0, 'epsilon': '1.000', 'highest_reward': 1.0, 'mean_rewards': 1.0, 'mean_loss': 'nan', 'sparsity': 31.333333333333332}


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [19]:
from gym import envs
# Survey of the registered 'v4' environments under a uniformly random policy.
# For each 'Deterministic' variant whose observation has at least 200 rows it
# prints: id, action space, mean steps per episode, mean reward per episode,
# and the largest single-step reward seen.
all_envs = list(envs.registry.all())

n_episodes = 10               # random-policy episodes per environment
warmup_steps = 5              # initial steps taken with action 1 after each reset
max_steps_per_episode = 6000  # an episode is cut off once it exceeds this many steps

for env in all_envs:
    if 'v4' not in env.id:
        continue

    m = env.make()
    try:
        if (m.observation_space.shape[0] >= 200) and ('Deterministic' in env.id):
            total_steps = 0
            total_reward = 0
            max_r = float('-inf')
            for episode_idx in range(n_episodes):
                m.reset()
                # Warm-up results are ignored and not counted in the statistics.
                for warmup_idx in range(warmup_steps):
                    m.step(1)
                episode_steps = 0
                while True:
                    action = m.action_space.sample()
                    observation, reward, done, info = m.step(action)
                    total_steps += 1
                    total_reward += reward
                    episode_steps += 1
                    max_r = max(max_r, reward)
                    if done or (episode_steps > max_steps_per_episode):
                        break

            print(env.id, ', ', m.action_space, ', ', total_steps / n_episodes,
                  ', ', total_reward / n_episodes, ', ', max_r)
    finally:
        # Every created environment is closed, including the ones that do not
        # pass the filter above and any that raise midway through a rollout.
        m.close()


i=0




AdventureDeterministic-v4 ,  Discrete(18) ,  5095.7 ,  -0.2 ,  0.0
AirRaidDeterministic-v4 ,  Discrete(6) ,  580.0 ,  595.0 ,  100.0
AlienDeterministic-v4 ,  Discrete(18) ,  723.5 ,  221.0 ,  20.0
AmidarDeterministic-v4 ,  Discrete(10) ,  612.1 ,  2.9 ,  5.0
AssaultDeterministic-v4 ,  Discrete(7) ,  538.1 ,  199.5 ,  21.0
AsterixDeterministic-v4 ,  Discrete(9) ,  273.1 ,  300.0 ,  100.0
AsteroidsDeterministic-v4 ,  Discrete(14) ,  1086.1 ,  884.0 ,  150.0
AtlantisDeterministic-v4 ,  Discrete(4) ,  1286.8 ,  17510.0 ,  3500.0
BankHeistDeterministic-v4 ,  Discrete(18) ,  538.6 ,  17.0 ,  20.0
BattleZoneDeterministic-v4 ,  Discrete(18) ,  1173.5 ,  2300.0 ,  6000.0
BeamRiderDeterministic-v4 ,  Discrete(9) ,  1507.7 ,  369.6 ,  44.0
BerzerkDeterministic-v4 ,  Discrete(18) ,  220.3 ,  150.0 ,  50.0
BowlingDeterministic-v4 ,  Discrete(6) ,  2104.9 ,  22.7 ,  7.0
BoxingDeterministic-v4 ,  Discrete(18) ,  1781.0 ,  3.4 ,  2.0
BreakoutDeterministic-v4 ,  Discrete(4) ,  194.6 ,  1.7 ,  1.0
Carni

In [10]:
# Show the action space of `m`, the environment object bound by `m = env.make()`
# in the environment-survey cell. NOTE(review): this cell depends on that cell
# having been run first -- it fails on a fresh kernel otherwise.
m.action_space

Discrete(18)