In [1]:
from player.player import Player
from environments.simulator import Atari
import numpy as np
import datetime
from utils import HandleResults
import numba

'''
Set the main settings in the default_settings.jsn
* PUNISH controls the positive punishment; it must be set to a non-zero value to take effect
* REWARD_EXTRAPOLATION_EXPONENT controls the exponent for backfilling. Set to -1.0 to turn this off (i.e., use the 
actual reward values only)

Some settings are in the memory class:
* START_EPISODE: This will be the start episode of the linear increase. 
* END_EPISODE: This will be the end episode of the linear increase.
* START_EXPONENT: The start exponent (1.0)
* END_EXPONENT: Final exponent value (10.0)
* IGNORE_EXPONENT_EPISODE: The episode at which to stop using the exponent (only the punishment applies, if it is on)
 '''


def run_episode(max_episode_length, episode, game_env, player, total_frames, evaluation=False):
    """Play one episode in ``game_env`` and return its reward and the frame counter.

    Each iteration reads the current state, asks the player for an action,
    steps the environment and, unless ``evaluation`` is set, forwards the
    transition to ``player.updates``.

    Args:
        max_episode_length: Frame index at which the episode is forcibly ended.
            The check runs after the step and before the per-episode frame
            counter is incremented, so at most ``max_episode_length + 1``
            steps are taken.
        episode: Index of the current episode; passed through to
            ``player.updates``.
        game_env: Environment exposing ``reset()``, ``get_current_state()`` and
            ``step(action)``; ``step`` must return a 5-tuple
            ``(processed_frame, reward, terminal, terminal_life_lost, original_frame)``.
        player: Agent exposing ``take_action(state, total_frames, evaluation)``
            and ``updates(...)``.
        total_frames: Running frame counter carried across episodes.
        evaluation: When true, ``player.updates`` is never called.

    Returns:
        Tuple ``(episode_reward, total_frames)`` where ``total_frames`` has been
        advanced by the number of steps taken in this episode.
    """
    # The value returned by reset() was never used before being overwritten by
    # the first step, so only the call itself is kept.
    game_env.reset()

    episode_reward = 0
    life_seq = 0  # steps taken since the last life-lost signal
    frame_number = 0  # steps taken in this episode

    while True:
        current_state = game_env.get_current_state()
        action = player.take_action(current_state, total_frames, evaluation)
        processed_new_frame, reward, terminal, terminal_life_lost, _ = game_env.step(action)

        # Force the end of the episode (and of the current life) once the
        # length cap is reached, so the player sees it as terminal too.
        if frame_number >= max_episode_length:
            terminal = True
            terminal_life_lost = True

        if not evaluation:
            # life_seq is passed before it is incremented for this step.
            player.updates(total_frames, episode, action, processed_new_frame, reward, terminal_life_lost, life_seq)

        episode_reward += reward
        life_seq = 0 if terminal_life_lost else life_seq + 1

        total_frames += 1
        frame_number += 1

        if terminal:
            break

    return episode_reward, total_frames


def learn_by_game(results_handler, load_folder='', load_model=False):
    """Train a player on one game and record progress every 10 episodes.

    The player, environment and run limits are obtained from
    ``results_handler``: from ``load_settings_folder`` when ``load_folder`` is
    given, otherwise from ``load_settings_default`` using the module-level
    ``GAME_ENV`` name. All settings are printed and saved, then
    ``run_episode`` is called ``max_number_of_episodes`` times.

    Args:
        results_handler: Object providing ``load_settings_folder``,
            ``load_settings_default``, ``save_settings`` and ``save_res``.
        load_folder: Folder to load settings from; the empty string selects
            the default settings.
        load_model: Passed through to ``load_settings_folder``.

    Returns:
        None. Results are reported through ``results_handler.save_res``.
    """
    # Compare by value: `is not ''` tests object identity, which depends on
    # string interning and raises a SyntaxWarning on Python 3.8+.
    if load_folder != '':
        player, game_env, max_episode_length, max_number_of_episodes, all_settings = \
            results_handler.load_settings_folder(load_folder, load_model)
    else:
        # NOTE: GAME_ENV is a module-level global that must be set by the caller's
        # script before this function is invoked without a load_folder.
        player, game_env, max_episode_length, max_number_of_episodes, all_settings = \
            results_handler.load_settings_default(GAME_ENV)

    for k, v in all_settings.items():
        print(k, ': ', v)

    print('****************************')

    results_handler.save_settings(all_settings, player)

    # The same dict object is refreshed and handed to save_res on every report.
    res_dict = {}

    highest_reward = 0
    total_frames = 0.0  # kept as a float so reported frame counts keep their format
    all_rewards = []
    start_time = datetime.datetime.now()

    for episode in range(max_number_of_episodes):
        episode_reward, total_frames = run_episode(max_episode_length, episode, game_env, player, total_frames)

        all_rewards.append(episode_reward)
        highest_reward = max(highest_reward, episode_reward)

        if episode % 10 != 0:
            continue

        res_dict['time'] = str(datetime.datetime.now() - start_time)
        res_dict['episode'] = episode
        res_dict['total_frames'] = total_frames
        res_dict['epsilon'] = format(player.epsilon, '.3f')
        res_dict['highest_reward'] = highest_reward
        res_dict['mean_rewards'] = np.mean(all_rewards[-10:])

        # No losses are recorded before training starts; avoid numpy's
        # "mean of empty slice" warning while still reporting 'nan'.
        recent_losses = player.losses[-10:]
        mean_loss = np.mean(recent_losses) if len(recent_losses) else float('nan')
        res_dict['mean_loss'] = format(mean_loss, '.5f')

        res_dict['estimating_reward'] = player.memory.use_estimated_reward
        res_dict['reward_exponent'] = player.memory.reward_extrapolation_exponent

        results_handler.save_res(res_dict)

# GAME_ENV = 'BreakoutDeterministic-v4'
# GAME_ENV = 'BerzerkDeterministic-v4'
# GAME_ENV = 'QbertDeterministic-v4'
# GAME_ENV = 'SpaceInvaders-v4' # 758 frames
# GAME_ENV = 'Alien-v4' # 948 frames
# GAME_ENV = 'Amidar-v4' # 812 frames
# GAME_ENV = 'Venture-v4'
# GAME_ENV = 'Assault-v4' # 876 frames
# GAME_ENV = 'RoadRunner-v4' # 437 frames
# GAME_ENV = 'PongDeterministic-v4'
# GAME_ENV = 'AsterixDeterministic-v4'
# GAME_ENV = 'MontezumaRevenge-v4'
# GAME_ENV = 'ChopperCommand-v4'
# OUT_FOLDER = './output/Punish_0_No_Reward_exploration/'
# OUT_FOLDER = './output/Punish_1_No_Reward_exploration/'
# OUT_FOLDER = './output/Punish_1_Reward_exploration_linear/'
# Root folder handed to HandleResults for every game in this run.
OUT_FOLDER = './output/punish/'

# games = [, 'QbertDeterministic-v4']
# Environment ids to train on, one after another.
games = ['BreakoutDeterministic-v4', 'AsterixDeterministic-v4', 'CarnivalDeterministic-v4', 'MsPacmanDeterministic-v4', 
         'UpNDownDeterministic-v4', 'AssaultDeterministic-v4']


# GAME_ENV is deliberately a module-level name: learn_by_game reads it as a
# global when it is called without a load_folder.
for GAME_ENV in games:
    handler = HandleResults(GAME_ENV, OUT_FOLDER)
    learn_by_game(handler)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


The environment has the following 4 actions: ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
norm (Lambda)                   (None, 84, 84, 4)    0           input_2[0][0]                    
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 20, 20, 32)   8224        norm[0][0]                       
__________________________________________________________________________________________________
conv2d_5 (Conv2D)               (None, 9, 9, 64)     32832       conv2d_4[0][0]                   
______________________________

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'time': '0:00:00.269540', 'episode': 0, 'total_frames': 125.0, 'epsilon': '1.000', 'highest_reward': 0, 'mean_rewards': 0.0, 'mean_loss': 'nan', 'estimating_reward': False, 'reward_exponent': 1.0}
{'time': '0:00:04.126378', 'episode': 10, 'total_frames': 1975.0, 'epsilon': '1.000', 'highest_reward': 8.0, 'mean_rewards': 1.7, 'mean_loss': 'nan', 'estimating_reward': False, 'reward_exponent': 1.0}
{'time': '0:00:07.303785', 'episode': 20, 'total_frames': 3494.0, 'epsilon': '1.000', 'highest_reward': 8.0, 'mean_rewards': 0.4, 'mean_loss': 'nan', 'estimating_reward': False, 'reward_exponent': 1.0}
{'time': '0:00:11.749363', 'episode': 30, 'total_frames': 5639.0, 'epsilon': '1.000', 'highest_reward': 8.0, 'mean_rewards': 2.2, 'mean_loss': 'nan', 'estimating_reward': False, 'reward_exponent': 1.0}
{'time': '0:00:15.257809', 'episode': 40, 'total_frames': 7301.0, 'epsilon': '1.000', 'highest_reward': 8.0, 'mean_rewards': 0.8, 'mean_loss': 'nan', 'estimating_reward': False, 'reward_exponent':

Updating the target network
{'time': '0:08:32.582710', 'episode': 400, 'total_frames': 70743.0, 'epsilon': '0.793', 'highest_reward': 8.0, 'mean_rewards': 1.4, 'mean_loss': '0.00245', 'estimating_reward': False, 'reward_exponent': 1.0}
{'time': '0:09:04.035005', 'episode': 410, 'total_frames': 72319.0, 'epsilon': '0.777', 'highest_reward': 8.0, 'mean_rewards': 0.8, 'mean_loss': '0.00161', 'estimating_reward': False, 'reward_exponent': 1.2785714285714285}
{'time': '0:09:41.901766', 'episode': 420, 'total_frames': 74195.0, 'epsilon': '0.758', 'highest_reward': 8.0, 'mean_rewards': 1.5, 'mean_loss': '0.00233', 'estimating_reward': False, 'reward_exponent': 1.5571428571428572}
{'time': '0:10:14.747048', 'episode': 430, 'total_frames': 75810.0, 'epsilon': '0.742', 'highest_reward': 8.0, 'mean_rewards': 0.9, 'mean_loss': '0.00148', 'estimating_reward': False, 'reward_exponent': 1.8357142857142859}
{'time': '0:10:53.399419', 'episode': 440, 'total_frames': 77695.0, 'epsilon': '0.723', 'highes

Updating the target network
{'time': '0:35:14.258819', 'episode': 770, 'total_frames': 142057.0, 'epsilon': '0.100', 'highest_reward': 11.0, 'mean_rewards': 2.9, 'mean_loss': '0.00100', 'estimating_reward': False, 'reward_exponent': 11.307142857142857}
{'time': '0:36:04.339276', 'episode': 780, 'total_frames': 144122.0, 'epsilon': '0.100', 'highest_reward': 11.0, 'mean_rewards': 2.1, 'mean_loss': '0.00218', 'estimating_reward': False, 'reward_exponent': 11.585714285714285}
{'time': '0:37:13.429023', 'episode': 790, 'total_frames': 146948.0, 'epsilon': '0.100', 'highest_reward': 11.0, 'mean_rewards': 4.9, 'mean_loss': '0.00079', 'estimating_reward': False, 'reward_exponent': 11.864285714285714}
{'time': '0:38:11.496807', 'episode': 800, 'total_frames': 149305.0, 'epsilon': '0.100', 'highest_reward': 11.0, 'mean_rewards': 2.9, 'mean_loss': '0.00037', 'estimating_reward': False, 'reward_exponent': 12.142857142857142}
Updating the target network
{'time': '0:39:08.470404', 'episode': 810, '

Updating the target network
{'time': '1:24:37.995018', 'episode': 1130, 'total_frames': 262426.0, 'epsilon': '0.100', 'highest_reward': 36.0, 'mean_rewards': 17.9, 'mean_loss': '0.00123', 'estimating_reward': False, 'reward_exponent': 21.335714285714285}
{'time': '1:27:08.256590', 'episode': 1140, 'total_frames': 268540.0, 'epsilon': '0.100', 'highest_reward': 36.0, 'mean_rewards': 18.2, 'mean_loss': '0.00146', 'estimating_reward': False, 'reward_exponent': 21.614285714285714}
Updating the target network
{'time': '1:29:27.571427', 'episode': 1150, 'total_frames': 274216.0, 'epsilon': '0.100', 'highest_reward': 36.0, 'mean_rewards': 16.8, 'mean_loss': '0.00164', 'estimating_reward': False, 'reward_exponent': 21.892857142857142}
Updating the target network
{'time': '1:31:58.905322', 'episode': 1160, 'total_frames': 280391.0, 'epsilon': '0.100', 'highest_reward': 36.0, 'mean_rewards': 20.0, 'mean_loss': '0.00143', 'estimating_reward': False, 'reward_exponent': 22.17142857142857}
{'time': 

Updating the target network
{'time': '3:11:16.063723', 'episode': 1470, 'total_frames': 522632.0, 'epsilon': '0.100', 'highest_reward': 72.0, 'mean_rewards': 32.1, 'mean_loss': '0.00203', 'estimating_reward': False, 'reward_exponent': 30.807142857142857}


KeyboardInterrupt: 