In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
from imagenet_dataset import get_dataset
from retina_env import RetinaEnv, calculate_retinal_filter
from rl_networks import create_actor_model, create_critic_model, policy
from rl_core import Buffer, update_target
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import types

# Experiment configuration. The same namespace is handed to get_dataset (batch
# and image sizes) and to RetinaEnv further down in this cell.
config = types.SimpleNamespace(
    batch_size=32,
    margin=20,
    image_h=224,
    image_w=224,
)
# Padded image size: the margin is added twice to each dimension.
config.image_hm = 2*config.margin + config.image_h
config.image_wm = 2*config.margin + config.image_w
vars(config).update(
    foveate=None,
    do_grayscale=True,
    history_length=16,
    t_ignore=16,
    t_max=50,
    motion_mode='velocity',
    # False -> sizes/bounds are read from the env's own attributes rather than
    # from gym-style observation_space / action_space (see the branch below).
    gym_mode=False,
)

# Filter returned by calculate_retinal_filter on a 16-point grid over [0, 150],
# reshaped to 4-D with singleton axes everywhere except the third one.
# NOTE(review): the grid length equals config.history_length (16) -- presumably
# they must stay in sync; confirm against RetinaEnv.
t_vec = np.linspace(0, 150, num=16)
balanced_filter = calculate_retinal_filter(t_vec, R=1.0)
config.filter = balanced_filter.reshape([1, 1, -1, 1])
config.min_freq = 1
config.max_freq = 13
config.action_upper_bound = np.array([2.0, 2.0])

dataset_dir = '/home/bnapp/datasets/tensorflow_datasets/imagenet2012/5.0.0/'

def epsilon_scheduler(episode, floor_episode=200, epsilon_floor=0.1):
    """Linearly anneal the exploration rate from 1.0 down to ``epsilon_floor``.

    Args:
        episode: Index of the current episode.
        floor_episode: Episode index from which the rate stays at the floor.
        epsilon_floor: Final (minimum) exploration rate.

    Returns:
        ``epsilon_floor`` once ``episode`` is no longer below ``floor_episode``;
        otherwise the linear interpolation between 1.0 (at episode 0) and
        ``epsilon_floor`` (at episode ``floor_episode``).
    """
    if not episode < floor_episode:
        return epsilon_floor
    anneal_range = 1. - epsilon_floor
    return 1. - anneal_range*episode/floor_episode

# Image source: the 'validation' split under dataset_dir, batched with
# config.batch_size and sized to the margin-padded height/width.
dataset = get_dataset(
    dataset_dir,
    'validation',
    config.batch_size,
    image_h=config.image_hm,
    image_w=config.image_wm,
    preprocessing='identity',
    rggb_mode=False,
    central_squeeze_and_pad_factor=-1,
)

env = RetinaEnv(config, image_generator=dataset)

# Observation/action sizes and action bounds. With gym_mode off they come from
# the env's own attributes; otherwise from the gym-style spaces.
if not config.gym_mode:
    num_states, num_actions = env.observation_size, env.action_size
    upper_bound, lower_bound = env.action_upper_bound, env.action_lower_bound
else:
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Hyperparameters (you might want to adjust these).
actor_lr, critic_lr = 0.001, 0.002
# gamma and tau are passed to buffer.learn / update_target by the training loop
# (presumably discount factor and target soft-update rate -- confirm in rl_core).
gamma = 0.99
tau = 0.005

buffer_capacity = 100000
# NOTE(review): batch_size is not used in this cell -- the Buffer below is built
# with config.batch_size (32). Confirm which value is intended.
batch_size = 64

# All four networks are built from the same size arguments.
network_dims = (
    env.image_h,
    env.image_w,
    env.spectral_density_size,
    env.location_history_size,
    env.timestep_size,
    env.action_size,
)

# Online actor/critic, then their target counterparts.
actor_model = create_actor_model(*network_dims)
critic_model = create_critic_model(*network_dims)
target_actor = create_actor_model(*network_dims)
target_critic = create_critic_model(*network_dims)

# Targets start as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# One Adam optimizer per network, each with its own learning rate.
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

# Experience replay buffer; flat states are reshaped via env.unflatten_observation.
buffer = Buffer(
    buffer_capacity,
    config.batch_size,
    num_states=num_states,
    num_actions=num_actions,
    state_reshape_fn=env.unflatten_observation,
)

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.


In [2]:
# Training loop: one environment roll-out per episode, with the exploration
# rate epsilon taken from epsilon_scheduler.
reward_records, epsilon_records = [], []
episodes = 1000
for ep in range(episodes):
    prev_state = env.reset()
    # Starts as int 0; becomes whatever type `reward` has once the first reward
    # is added (the .numpy() call after the loop relies on that).
    episodic_reward = 0
    epsilon = epsilon_scheduler(ep)

    while True:
        # Actor's proposal for the current observation, and a uniform random
        # action in [-2, 2) of shape (config.batch_size, 2).
        deterministic_action = policy(env.unflatten_observation(prev_state), actor_model, lower_bound, upper_bound)
        random_action = 4*np.random.uniform(size=(config.batch_size, 2)) - 2

        # Purely random actions until the env reports that warm-up is done;
        # afterwards an epsilon-weighted blend of random and actor actions.
        action = random_action
        if env.warmup_done:
            action = epsilon*random_action + (1-epsilon)*deterministic_action

        # Receive state and reward from the environment
        state, reward, done, info = env.step(action)

        # warmup_done is read again after the step: only post-warm-up
        # transitions are stored, rewarded and learned from.
        if env.warmup_done:
            buffer.record((prev_state, action, reward, state))
            episodic_reward += reward

            buffer.learn(actor_model, target_actor, critic_model, target_critic, actor_optimizer, critic_optimizer, gamma, tau)
            update_target(target_actor.variables, actor_model.variables, tau)
            update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break

        # Copy rather than alias, so later changes to `state` cannot leak into
        # `prev_state`.
        prev_state = np.copy(state)

    mean_episodic_reward = episodic_reward.numpy().mean()
    print(f"Episode * {ep} * exploration epsilon {epsilon} * Episodic Reward is ==> {mean_episodic_reward}")
    reward_records.append(mean_episodic_reward)
    epsilon_records.append(epsilon)

Episode * 0 * exploration epsilon 1.0 * Episodic Reward is ==> -12.714249610900879
Episode * 1 * exploration epsilon 0.9955 * Episodic Reward is ==> -11.251128196716309
Episode * 2 * exploration epsilon 0.991 * Episodic Reward is ==> -9.977132797241211
Episode * 3 * exploration epsilon 0.9865 * Episodic Reward is ==> -11.792316436767578
Episode * 4 * exploration epsilon 0.982 * Episodic Reward is ==> -10.736371994018555
Episode * 5 * exploration epsilon 0.9775 * Episodic Reward is ==> -10.642513275146484
Episode * 6 * exploration epsilon 0.973 * Episodic Reward is ==> -11.361754417419434
Episode * 7 * exploration epsilon 0.9685 * Episodic Reward is ==> -10.637805938720703
Episode * 8 * exploration epsilon 0.964 * Episodic Reward is ==> -11.512472152709961
Episode * 9 * exploration epsilon 0.9595 * Episodic Reward is ==> -13.279670715332031
Episode * 10 * exploration epsilon 0.955 * Episodic Reward is ==> -11.39126968383789
Episode * 11 * exploration epsilon 0.9505 * Episodic Reward is 

Episode * 24 * exploration epsilon 0.892 * Episodic Reward is ==> -10.672374725341797
Episode * 25 * exploration epsilon 0.8875 * Episodic Reward is ==> -11.165483474731445
Episode * 26 * exploration epsilon 0.883 * Episodic Reward is ==> -12.978475570678711
Episode * 27 * exploration epsilon 0.8785000000000001 * Episodic Reward is ==> -11.276407241821289
Episode * 28 * exploration epsilon 0.874 * Episodic Reward is ==> -11.513321876525879
Episode * 29 * exploration epsilon 0.8694999999999999 * Episodic Reward is ==> -9.960307121276855
Episode * 30 * exploration epsilon 0.865 * Episodic Reward is ==> -13.189055442810059
Episode * 31 * exploration epsilon 0.8605 * Episodic Reward is ==> -11.737686157226562
Episode * 32 * exploration epsilon 0.856 * Episodic Reward is ==> -12.038890838623047
Episode * 33 * exploration epsilon 0.8515 * Episodic Reward is ==> -12.37426471710205
Episode * 34 * exploration epsilon 0.847 * Episodic Reward is ==> -13.726275444030762
Episode * 35 * exploration 

Episode * 48 * exploration epsilon 0.784 * Episodic Reward is ==> -12.468976974487305
Episode * 49 * exploration epsilon 0.7795 * Episodic Reward is ==> -11.01706314086914
Episode * 50 * exploration epsilon 0.775 * Episodic Reward is ==> -12.709945678710938
Episode * 51 * exploration epsilon 0.7705 * Episodic Reward is ==> -13.433777809143066
Episode * 52 * exploration epsilon 0.766 * Episodic Reward is ==> -11.55752944946289
Episode * 53 * exploration epsilon 0.7615 * Episodic Reward is ==> -11.912851333618164
Episode * 54 * exploration epsilon 0.757 * Episodic Reward is ==> -11.99441909790039
Episode * 55 * exploration epsilon 0.7525 * Episodic Reward is ==> -12.880372047424316
Episode * 56 * exploration epsilon 0.748 * Episodic Reward is ==> -14.45364761352539
Episode * 57 * exploration epsilon 0.7435 * Episodic Reward is ==> -12.182701110839844
Episode * 58 * exploration epsilon 0.739 * Episodic Reward is ==> -10.817619323730469
Episode * 59 * exploration epsilon 0.7344999999999999

Episode * 72 * exploration epsilon 0.6759999999999999 * Episodic Reward is ==> -15.45425033569336
Episode * 73 * exploration epsilon 0.6715 * Episodic Reward is ==> -12.716184616088867
Episode * 74 * exploration epsilon 0.667 * Episodic Reward is ==> -12.622576713562012
Episode * 75 * exploration epsilon 0.6625 * Episodic Reward is ==> -13.735642433166504
Episode * 76 * exploration epsilon 0.6579999999999999 * Episodic Reward is ==> -11.71760082244873
Episode * 77 * exploration epsilon 0.6535 * Episodic Reward is ==> -12.875324249267578
Episode * 78 * exploration epsilon 0.649 * Episodic Reward is ==> -13.273508071899414
Episode * 79 * exploration epsilon 0.6445 * Episodic Reward is ==> -13.404001235961914
Episode * 80 * exploration epsilon 0.64 * Episodic Reward is ==> -12.134263038635254
Episode * 81 * exploration epsilon 0.6355 * Episodic Reward is ==> -13.513896942138672
Episode * 82 * exploration epsilon 0.631 * Episodic Reward is ==> -11.187618255615234
Episode * 83 * exploration

Episode * 96 * exploration epsilon 0.568 * Episodic Reward is ==> -15.440045356750488
Episode * 97 * exploration epsilon 0.5635 * Episodic Reward is ==> -13.384469985961914
Episode * 98 * exploration epsilon 0.5589999999999999 * Episodic Reward is ==> -12.557055473327637
Episode * 99 * exploration epsilon 0.5545 * Episodic Reward is ==> -13.523968696594238
Episode * 100 * exploration epsilon 0.55 * Episodic Reward is ==> -13.072657585144043
Episode * 101 * exploration epsilon 0.5455 * Episodic Reward is ==> -12.458277702331543
Episode * 102 * exploration epsilon 0.541 * Episodic Reward is ==> -12.816003799438477
Episode * 103 * exploration epsilon 0.5365 * Episodic Reward is ==> -13.718799591064453
Episode * 104 * exploration epsilon 0.532 * Episodic Reward is ==> -13.14460563659668
Episode * 105 * exploration epsilon 0.5275000000000001 * Episodic Reward is ==> -15.435712814331055
Episode * 106 * exploration epsilon 0.5229999999999999 * Episodic Reward is ==> -13.422088623046875
Episod

Episode * 120 * exploration epsilon 0.45999999999999996 * Episodic Reward is ==> -15.964147567749023
Episode * 121 * exploration epsilon 0.4555 * Episodic Reward is ==> -12.929402351379395
Episode * 122 * exploration epsilon 0.45100000000000007 * Episodic Reward is ==> -13.916473388671875
Episode * 123 * exploration epsilon 0.4465 * Episodic Reward is ==> -12.992618560791016
Episode * 124 * exploration epsilon 0.44199999999999995 * Episodic Reward is ==> -14.595807075500488
Episode * 125 * exploration epsilon 0.4375 * Episodic Reward is ==> -14.487719535827637
Episode * 126 * exploration epsilon 0.43299999999999994 * Episodic Reward is ==> -14.424210548400879
Episode * 127 * exploration epsilon 0.4285 * Episodic Reward is ==> -13.861406326293945
Episode * 128 * exploration epsilon 0.42399999999999993 * Episodic Reward is ==> -14.23365306854248
Episode * 129 * exploration epsilon 0.4195 * Episodic Reward is ==> -14.096843719482422
Episode * 130 * exploration epsilon 0.41500000000000004 

Episode * 143 * exploration epsilon 0.35649999999999993 * Episodic Reward is ==> -14.90014934539795
Episode * 144 * exploration epsilon 0.352 * Episodic Reward is ==> -15.618305206298828
Episode * 145 * exploration epsilon 0.34750000000000003 * Episodic Reward is ==> -15.893184661865234
Episode * 146 * exploration epsilon 0.34299999999999997 * Episodic Reward is ==> -16.0598087310791
Episode * 147 * exploration epsilon 0.3384999999999999 * Episodic Reward is ==> -16.403247833251953
Episode * 148 * exploration epsilon 0.33399999999999996 * Episodic Reward is ==> -14.314970970153809
Episode * 149 * exploration epsilon 0.3295 * Episodic Reward is ==> -13.997321128845215
Episode * 150 * exploration epsilon 0.32499999999999996 * Episodic Reward is ==> -14.87734603881836
Episode * 151 * exploration epsilon 0.3205 * Episodic Reward is ==> -14.183988571166992
Episode * 152 * exploration epsilon 0.31599999999999995 * Episodic Reward is ==> -14.434392929077148
Episode * 153 * exploration epsilon

Episode * 167 * exploration epsilon 0.24849999999999994 * Episodic Reward is ==> -14.550599098205566
Episode * 168 * exploration epsilon 0.24399999999999988 * Episodic Reward is ==> -15.833627700805664
Episode * 169 * exploration epsilon 0.23950000000000005 * Episodic Reward is ==> -15.408834457397461
Episode * 170 * exploration epsilon 0.235 * Episodic Reward is ==> -15.988094329833984
Episode * 171 * exploration epsilon 0.23049999999999993 * Episodic Reward is ==> -15.690065383911133
Episode * 172 * exploration epsilon 0.22599999999999998 * Episodic Reward is ==> -15.981084823608398
Episode * 173 * exploration epsilon 0.22149999999999992 * Episodic Reward is ==> -16.099285125732422
Episode * 174 * exploration epsilon 0.21700000000000008 * Episodic Reward is ==> -17.01306915283203
Episode * 175 * exploration epsilon 0.21250000000000002 * Episodic Reward is ==> -15.582632064819336
Episode * 176 * exploration epsilon 0.20799999999999996 * Episodic Reward is ==> -14.487226486206055
Episo

Episode * 190 * exploration epsilon 0.14500000000000002 * Episodic Reward is ==> -15.675548553466797
Episode * 191 * exploration epsilon 0.14049999999999996 * Episodic Reward is ==> -17.023279190063477
Episode * 192 * exploration epsilon 0.1359999999999999 * Episodic Reward is ==> -16.821746826171875
Episode * 193 * exploration epsilon 0.13149999999999995 * Episodic Reward is ==> -18.111347198486328
Episode * 194 * exploration epsilon 0.127 * Episodic Reward is ==> -16.88080406188965
Episode * 195 * exploration epsilon 0.12250000000000005 * Episodic Reward is ==> -15.289530754089355
Episode * 196 * exploration epsilon 0.118 * Episodic Reward is ==> -15.692169189453125
Episode * 197 * exploration epsilon 0.11349999999999993 * Episodic Reward is ==> -16.64896011352539
Episode * 198 * exploration epsilon 0.10899999999999987 * Episodic Reward is ==> -18.18808364868164
Episode * 199 * exploration epsilon 0.10450000000000004 * Episodic Reward is ==> -17.233034133911133


NameError: name 'floor_epsilon' is not defined

In [None]:
# Resume training from the first episode that has no recorded reward yet.
#
# `reward_records` gains exactly one entry per completed episode, so its length
# is the index of the next episode to run. The original start index was a
# hard-coded 200, which only matched the kernel state after one particular
# crash; on a clean Restart & Run All it would retrain episodes 200..episodes-1
# and append duplicate records. With this start index the cell is a no-op when
# the previous cell already ran to completion, and continues from the right
# place when it was interrupted.
for ep in range(len(reward_records), episodes):
    prev_state = env.reset()
    # Starts as int 0; becomes whatever type `reward` has once the first reward
    # is added (the .numpy() call after the loop relies on that).
    episodic_reward = 0
    epsilon = epsilon_scheduler(ep)

    while True:
        # Actor's proposal for the current observation, and a uniform random
        # action in [-2, 2) of shape (config.batch_size, 2).
        deterministic_action = policy(env.unflatten_observation(prev_state), actor_model, lower_bound, upper_bound)
        random_action = -2+4*np.random.uniform(size=(config.batch_size,2))

        # Purely random actions until the env reports that warm-up is done;
        # afterwards an epsilon-weighted blend of random and actor actions.
        if env.warmup_done:
            action = epsilon*random_action + (1-epsilon)*deterministic_action
        else:
            action = random_action

        # Receive state and reward from the environment
        state, reward, done, info = env.step(action)

        # warmup_done is read again after the step: only post-warm-up
        # transitions are stored, rewarded and learned from.
        if env.warmup_done:
            buffer.record((prev_state, action, reward, state))
            episodic_reward += reward

            buffer.learn(actor_model, target_actor, critic_model, target_critic, actor_optimizer, critic_optimizer, gamma, tau)
            update_target(target_actor.variables, actor_model.variables, tau)
            update_target(target_critic.variables, critic_model.variables, tau)
        # End this episode when `done` is True
        if done:
            break

        # Copy rather than alias, so later changes to `state` cannot leak into
        # `prev_state`.
        prev_state = np.copy(state)

    print(f"Episode * {ep} * exploration epsilon {epsilon} * Episodic Reward is ==> {episodic_reward.numpy().mean()}")
    reward_records.append(episodic_reward.numpy().mean())
    epsilon_records.append(epsilon)

Episode * 200 * exploration epsilon 0.1 * Episodic Reward is ==> -16.84507942199707
Episode * 201 * exploration epsilon 0.1 * Episodic Reward is ==> -14.915166854858398
Episode * 202 * exploration epsilon 0.1 * Episodic Reward is ==> -15.410022735595703
Episode * 203 * exploration epsilon 0.1 * Episodic Reward is ==> -17.91499900817871
Episode * 204 * exploration epsilon 0.1 * Episodic Reward is ==> -16.736223220825195
Episode * 205 * exploration epsilon 0.1 * Episodic Reward is ==> -16.607158660888672
Episode * 206 * exploration epsilon 0.1 * Episodic Reward is ==> -15.879140853881836
Episode * 207 * exploration epsilon 0.1 * Episodic Reward is ==> -17.5517635345459
Episode * 208 * exploration epsilon 0.1 * Episodic Reward is ==> -15.541314125061035
Episode * 209 * exploration epsilon 0.1 * Episodic Reward is ==> -18.054807662963867
Episode * 210 * exploration epsilon 0.1 * Episodic Reward is ==> -15.08384895324707
Episode * 211 * exploration epsilon 0.1 * Episodic Reward is ==> -15.9

Episode * 224 * exploration epsilon 0.1 * Episodic Reward is ==> -17.357616424560547
Episode * 225 * exploration epsilon 0.1 * Episodic Reward is ==> -16.284320831298828
Episode * 226 * exploration epsilon 0.1 * Episodic Reward is ==> -17.490570068359375
Episode * 227 * exploration epsilon 0.1 * Episodic Reward is ==> -17.420940399169922
Episode * 228 * exploration epsilon 0.1 * Episodic Reward is ==> -16.302509307861328
Episode * 229 * exploration epsilon 0.1 * Episodic Reward is ==> -15.150619506835938
Episode * 230 * exploration epsilon 0.1 * Episodic Reward is ==> -17.426910400390625
Episode * 231 * exploration epsilon 0.1 * Episodic Reward is ==> -16.229726791381836
Episode * 232 * exploration epsilon 0.1 * Episodic Reward is ==> -17.651538848876953
Episode * 233 * exploration epsilon 0.1 * Episodic Reward is ==> -16.541759490966797
Episode * 234 * exploration epsilon 0.1 * Episodic Reward is ==> -16.690689086914062
Episode * 235 * exploration epsilon 0.1 * Episodic Reward is ==> 

Episode * 248 * exploration epsilon 0.1 * Episodic Reward is ==> -14.962408065795898
Episode * 249 * exploration epsilon 0.1 * Episodic Reward is ==> -17.868289947509766
Episode * 250 * exploration epsilon 0.1 * Episodic Reward is ==> -16.443082809448242
Episode * 251 * exploration epsilon 0.1 * Episodic Reward is ==> -16.873844146728516
Episode * 252 * exploration epsilon 0.1 * Episodic Reward is ==> -17.11786651611328
Episode * 253 * exploration epsilon 0.1 * Episodic Reward is ==> -16.947025299072266
Episode * 254 * exploration epsilon 0.1 * Episodic Reward is ==> -18.069406509399414
Episode * 255 * exploration epsilon 0.1 * Episodic Reward is ==> -16.22552490234375
Episode * 256 * exploration epsilon 0.1 * Episodic Reward is ==> -16.839765548706055
Episode * 257 * exploration epsilon 0.1 * Episodic Reward is ==> -16.30531883239746
Episode * 258 * exploration epsilon 0.1 * Episodic Reward is ==> -18.103565216064453
Episode * 259 * exploration epsilon 0.1 * Episodic Reward is ==> -17

Episode * 272 * exploration epsilon 0.1 * Episodic Reward is ==> -14.934148788452148
Episode * 273 * exploration epsilon 0.1 * Episodic Reward is ==> -15.491436004638672
Episode * 274 * exploration epsilon 0.1 * Episodic Reward is ==> -16.6632137298584
Episode * 275 * exploration epsilon 0.1 * Episodic Reward is ==> -17.394386291503906
Episode * 276 * exploration epsilon 0.1 * Episodic Reward is ==> -15.860472679138184
Episode * 277 * exploration epsilon 0.1 * Episodic Reward is ==> -16.392240524291992
Episode * 278 * exploration epsilon 0.1 * Episodic Reward is ==> -15.534549713134766
Episode * 279 * exploration epsilon 0.1 * Episodic Reward is ==> -17.44216537475586
Episode * 280 * exploration epsilon 0.1 * Episodic Reward is ==> -16.154579162597656
Episode * 281 * exploration epsilon 0.1 * Episodic Reward is ==> -17.717487335205078
Episode * 282 * exploration epsilon 0.1 * Episodic Reward is ==> -18.179668426513672
Episode * 283 * exploration epsilon 0.1 * Episodic Reward is ==> -16

Episode * 296 * exploration epsilon 0.1 * Episodic Reward is ==> -16.90894317626953
Episode * 297 * exploration epsilon 0.1 * Episodic Reward is ==> -16.801502227783203
Episode * 298 * exploration epsilon 0.1 * Episodic Reward is ==> -17.248144149780273
Episode * 299 * exploration epsilon 0.1 * Episodic Reward is ==> -16.42922592163086
Episode * 300 * exploration epsilon 0.1 * Episodic Reward is ==> -16.993125915527344
Episode * 301 * exploration epsilon 0.1 * Episodic Reward is ==> -15.339662551879883
Episode * 302 * exploration epsilon 0.1 * Episodic Reward is ==> -17.059860229492188
Episode * 303 * exploration epsilon 0.1 * Episodic Reward is ==> -14.396623611450195
Episode * 304 * exploration epsilon 0.1 * Episodic Reward is ==> -17.142980575561523
Episode * 305 * exploration epsilon 0.1 * Episodic Reward is ==> -15.968297004699707
Episode * 306 * exploration epsilon 0.1 * Episodic Reward is ==> -15.544940948486328
Episode * 307 * exploration epsilon 0.1 * Episodic Reward is ==> -1

In [None]:
# Debug: show the unflattened form of the last observation held in `prev_state`
# (a global left behind by the training loop).
env.unflatten_observation(prev_state)

In [None]:
# Debug: show two of the environment's size attributes as a tuple.
env.retinal_view_size, env.spectral_density_size

In [None]:
# Debug: show the reward accumulated during the most recent episode
# (a global left behind by the training loop).
episodic_reward

In [None]:
# Debug: show the reward returned by the most recent env.step call.
reward

In [None]:
# Debug: show the accumulated episode reward again (same expression as an
# earlier scratch cell).
episodic_reward

In [3]:
def epsilon_scheduler(episode, floor_episode=200, epsilon_floor=0.1):
    """Linearly anneal the exploration rate from 1.0 down to ``epsilon_floor``.

    NOTE(review): this re-defines the function with the same signature and
    schedule as the definition in the first cell of the notebook; one of the
    two is redundant.

    Args:
        episode: Index of the current episode.
        floor_episode: Episode index from which the rate stays at the floor.
        epsilon_floor: Final (minimum) exploration rate.

    Returns:
        ``epsilon_floor`` once ``episode`` is no longer below ``floor_episode``;
        otherwise the linear interpolation between 1.0 (at episode 0) and
        ``epsilon_floor`` (at episode ``floor_episode``).
    """
    if not episode < floor_episode:
        return epsilon_floor
    anneal_range = 1. - epsilon_floor
    return 1. - anneal_range*episode/floor_episode