In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4'  # or any {'0', '1', '2'}
from imagenet_dataset import get_dataset
from retina_env import RetinaEnv, calculate_retinal_filter
from rl_networks import create_actor_model, create_critic_model, policy
from rl_core import Buffer, update_target
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import types
# ---- Experiment configuration ------------------------------------------------
# `config` is a plain attribute bag; it is handed to RetinaEnv and also read
# directly by this notebook (batch size, gym_mode, use_dones).
config = types.SimpleNamespace()
config.batch_size = 32

# Image geometry: the "m" sizes add `margin` pixels on each side of the
# nominal height/width and are the sizes requested from the dataset loader.
config.margin = 20
config.image_h = 224
config.image_w = 224
config.image_hm = config.image_h+2*config.margin
config.image_wm = config.image_w+2*config.margin
config.foveate = None
config.do_grayscale = True

# Episode / history settings consumed by the environment.
# NOTE(review): the exact meaning of history_length, t_ignore and t_max is
# defined in retina_env, not here — confirm before changing them.
config.history_length = 16
config.t_ignore = 16
config.t_max =50
config.motion_mode = 'velocity'
config.use_dones = True

# False selects the env.observation_size / env.action_size code path below
# instead of the gym-style observation_space / action_space attributes.
config.gym_mode = False

# Temporal filter sampled at 16 points between 0 and 150 and reshaped to
# (1, 1, N, 1).
# NOTE(review): 16 equals config.history_length — presumably these must match; verify.
t_vec = np.linspace(0,150,16)

balanced_filter = calculate_retinal_filter(t_vec, R=1.0)
config.filter = balanced_filter.reshape([1,1,-1,1])
config.min_freq = 1
config.max_freq = 13
config.action_upper_bound = np.array([2.0, 2.0])

# Dataset location. The default is the original machine-specific path; set the
# IMAGENET_TFDS_DIR environment variable to run the notebook elsewhere without
# editing this cell.
dataset_dir = os.environ.get('IMAGENET_TFDS_DIR',
                             '/home/bnapp/datasets/tensorflow_datasets/imagenet2012/5.0.0/')

def epsilon_scheduler(episode, floor_episode=200, epsilon_floor=0.1):
    """Linearly decaying exploration rate.

    Starts at 1.0 for episode 0 and falls along a straight line until it
    reaches `epsilon_floor` at `floor_episode`; from then on it stays at
    `epsilon_floor`.

    Args:
        episode: index of the current episode.
        floor_episode: episode at which the floor value is reached.
        epsilon_floor: value returned once `episode` is no longer below
            `floor_episode`.

    Returns:
        The exploration rate for `episode`.
    """
    decay_range = 1. - epsilon_floor
    return (1. - decay_range*episode/floor_episode) if episode < floor_episode else epsilon_floor

# Image source: the 'validation' split, batched to config.batch_size and
# requested at the margin-padded size (image_hm x image_wm).
dataset_options = dict(
    image_h=config.image_hm,
    image_w=config.image_wm,
    preprocessing='identity',
    rggb_mode=False,
    central_squeeze_and_pad_factor=-1,
)
dataset = get_dataset(dataset_dir, 'validation', config.batch_size, **dataset_options)

# The environment draws its images from the dataset built above.
env = RetinaEnv(config, image_generator=dataset)

# Read observation/action sizes and action bounds from the environment.
# The native RetinaEnv attributes are used unless gym_mode asks for the
# gym-style observation_space / action_space interface.
if not config.gym_mode:
    num_states, num_actions = env.observation_size, env.action_size
    upper_bound, lower_bound = env.action_upper_bound, env.action_lower_bound
else:
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # Only the first component of the gym bounds is used.
    upper_bound = env.action_space.high[0]
    lower_bound = env.action_space.low[0]

# You might want to adjust the hyperparameters
actor_lr = 0.0001  # learning rate given to the actor's Adam optimizer
critic_lr = 0.001  # learning rate given to the critic's Adam optimizer
gamma = 15./16.  # passed to buffer.learn; presumably the reward discount factor — TODO confirm in rl_core
tau = 0.005  # rate passed to update_target (and buffer.learn) for moving the target networks

buffer_capacity = 10000  # first argument of Buffer(...), the replay buffer's capacity



# Every network is built from the same environment-derived dimensions, so they
# are collected once and reused for all four constructor calls.
network_dims = (
    env.image_h,
    env.image_w,
    env.spectral_density_size,
    env.location_history_size,
    env.timestep_size,
    env.action_size,
)

# Online networks (the ones being optimized)
actor_model = create_actor_model(*network_dims)
critic_model = create_critic_model(*network_dims)

# Target networks: same constructors, tracked towards the online ones later
target_actor = create_actor_model(*network_dims)
target_critic = create_critic_model(*network_dims)

# Targets start out as exact copies of the online networks
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# One Adam optimizer per network, each with its own learning rate
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

# Replay buffer for transitions. Stored states are turned back into structured
# observations with env.unflatten_observation; `use_dones` controls whether
# the `done` flag is part of each stored record.
buffer = Buffer(
    buffer_capacity,
    config.batch_size,
    num_states=num_states,
    num_actions=num_actions,
    state_reshape_fn=env.unflatten_observation,
    use_dones=config.use_dones,
)

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.


In [None]:
# Training loop
#
# Per episode: reset the environment, then step until `done`. While the
# environment is still warming up only random actions are applied; afterwards
# the applied action is an epsilon-weighted blend of a uniform random action
# and the actor's deterministic action, each transition is recorded, and one
# learning step plus a target-network update is performed.
reward_records = []        # mean (over the batch) episodic reward, one entry per episode
epsilon_records = []       # exploration epsilon used in each episode
action_mean_records = []   # per-step batch mean of the deterministic action (post warm-up only)
action_var_records = []    # per-step batch variance of the deterministic action (post warm-up only)
episodes = 10000
for ep in range(episodes):
    prev_state = env.reset()
    episodic_reward = 0
    epsilon = epsilon_scheduler(ep, floor_episode=1000)

    while True:
        deterministic_action = policy(env.unflatten_observation(prev_state), actor_model, lower_bound, upper_bound)
        # Uniform exploration over the configured action range. The range and
        # action width come from the same bounds the policy uses rather than
        # being hard-coded.
        random_action = np.random.uniform(low=lower_bound, high=upper_bound,
                                          size=(config.batch_size, num_actions))

        if env.warmup_done:
            action = epsilon*random_action + (1-epsilon)*deterministic_action
        else:
            action = random_action

        # Receive state and reward from environment
        state, reward, done, info = env.step(action)

        if env.warmup_done:
            if config.use_dones:
                buffer.record((prev_state, action, reward, state, done))
            else:
                buffer.record((prev_state, action, reward, state))

            episodic_reward += reward

            buffer.learn(actor_model, target_actor, critic_model, target_critic, actor_optimizer, critic_optimizer, gamma, tau)
            # NOTE(review): buffer.learn also receives the target networks and
            # tau — confirm it does not already update the targets, otherwise
            # they are moved twice per step.
            update_target(target_actor.variables, actor_model.variables, tau)
            update_target(target_critic.variables, critic_model.variables, tau)

            # Convert once and reuse for both statistics
            deterministic_action_np = deterministic_action.numpy()
            action_mean_records.append(deterministic_action_np.mean(axis=0))
            action_var_records.append(deterministic_action_np.var(axis=0))

        # End this episode when `done` is True
        if done:
            break

        # Copy instead of `prev_state = state` to avoid assignment by reference
        prev_state = np.copy(state)

    # np.asarray handles both the accumulated reward tensor and the initial
    # int 0 (an episode that ended before any post-warm-up step).
    mean_episodic_reward = np.asarray(episodic_reward).mean()
    print(f"Episode * {ep} * exploration epsilon {epsilon} * Episodic Reward is ==> {mean_episodic_reward}")
    # Diagnostics from a few steps back; skipped while too few steps are recorded
    for lag in (10, 5):
        if len(action_mean_records) >= lag:
            print(f"action means and variances at step -{lag}:", action_mean_records[-lag], action_var_records[-lag])
    reward_records.append(mean_episodic_reward)
    epsilon_records.append(epsilon)

Episode * 0 * exploration epsilon 1.0 * Episodic Reward is ==> -10.267127990722656
action means and variances at step -10: [-0.29886293 -0.17157948] [3.5023353e-05 3.9776714e-05]
action means and variances at step -5: [-0.33278316 -0.23370846] [3.8726874e-05 6.1590814e-05]
Episode * 1 * exploration epsilon 0.9991 * Episodic Reward is ==> -12.52334213256836
action means and variances at step -10: [-0.43175566 -0.48382324] [1.0600996e-04 8.6193380e-05]
action means and variances at step -5: [-0.4351316  -0.52111036] [1.1364900e-04 7.0178554e-05]
Episode * 2 * exploration epsilon 0.9982 * Episodic Reward is ==> -11.890865325927734
action means and variances at step -10: [-0.50586444 -0.6570226 ] [0.00020049 0.00015529]
action means and variances at step -5: [-0.5121412 -0.6762837] [0.00017603 0.00013429]
Episode * 3 * exploration epsilon 0.9973 * Episodic Reward is ==> -11.537487030029297
action means and variances at step -10: [-0.5765384 -0.7722436] [0.00018495 0.00022735]
action means 

Episode * 16 * exploration epsilon 0.9856 * Episodic Reward is ==> -11.262582778930664
action means and variances at step -10: [-1.5837533 -2.       ] [0.00052653 0.        ]
action means and variances at step -5: [-1.6016713 -2.       ] [0.00039591 0.        ]
Episode * 17 * exploration epsilon 0.9847 * Episodic Reward is ==> -11.720897674560547
action means and variances at step -10: [-1.6446528 -2.       ] [0.00051649 0.        ]
action means and variances at step -5: [-1.6431963 -2.       ] [0.00044367 0.        ]
Episode * 18 * exploration epsilon 0.9838 * Episodic Reward is ==> -12.152669906616211
action means and variances at step -10: [-1.6418291 -2.       ] [0.0002905 0.       ]
action means and variances at step -5: [-1.6648573 -2.       ] [0.00030846 0.        ]
Episode * 19 * exploration epsilon 0.9829 * Episodic Reward is ==> -10.948974609375
action means and variances at step -10: [-1.7986763 -2.       ] [0.00051524 0.        ]
action means and variances at step -5: [-1.8

Episode * 33 * exploration epsilon 0.9703 * Episodic Reward is ==> -12.1904878616333
action means and variances at step -10: [-2. -2.] [0. 0.]
action means and variances at step -5: [-2. -2.] [0. 0.]
Episode * 34 * exploration epsilon 0.9694 * Episodic Reward is ==> -12.689095497131348
action means and variances at step -10: [-2. -2.] [0. 0.]
action means and variances at step -5: [-2. -2.] [0. 0.]
Episode * 35 * exploration epsilon 0.9685 * Episodic Reward is ==> -9.759008407592773
action means and variances at step -10: [-2. -2.] [0. 0.]
action means and variances at step -5: [-2. -2.] [0. 0.]
Episode * 36 * exploration epsilon 0.9676 * Episodic Reward is ==> -12.093253135681152
action means and variances at step -10: [-2. -2.] [0. 0.]
action means and variances at step -5: [-2. -2.] [0. 0.]
Episode * 37 * exploration epsilon 0.9667 * Episodic Reward is ==> -10.820388793945312
action means and variances at step -10: [-2. -2.] [0. 0.]
action means and variances at step -5: [-2. -2.] [

In [None]:
# Raw learning curve: mean episodic reward, one point per training episode.
plt.plot(reward_records)

In [None]:
import misc

In [None]:
# Learning curve with a smoothed overlay (misc.smooth, second argument 100).
# Labels and a legend make the two curves distinguishable.
plt.plot(reward_records, label='mean episodic reward')
plt.plot(misc.smooth(reward_records,100), label='smoothed (100)')
plt.xlabel('episode')
plt.ylabel('mean episodic reward')
plt.legend()
plt.grid()


In [None]:
# Learning curve with a heavier smoothed overlay (misc.smooth, second argument 1000).
# Labels and a legend make the two curves distinguishable.
plt.plot(reward_records, label='mean episodic reward')
plt.plot(misc.smooth(reward_records,1000), label='smoothed (1000)')
plt.xlabel('episode')
plt.ylabel('mean episodic reward')
plt.legend()
plt.grid()

In [None]:
# Critic loss history with the first 100 entries skipped, plus a smoothed overlay.
plt.plot(buffer.critic_loss_buffer[100:], label='critic loss')
plt.plot(misc.smooth(buffer.critic_loss_buffer[100:],100), label='smoothed (100)')
plt.ylabel('critic loss')
plt.legend();

In [None]:
# First 10 entries of the critic loss history (the part skipped by the [100:] plots).
plt.plot(buffer.critic_loss_buffer[:10])


In [None]:
# Critic loss (first 100 entries skipped) at three smoothing settings, zoomed
# to the 0..0.05 range. Labels and a legend identify each curve.
plt.plot(misc.smooth(buffer.critic_loss_buffer[100:],1), label='smoothed (1)')
plt.plot(misc.smooth(buffer.critic_loss_buffer[100:],100), label='smoothed (100)')
plt.plot(misc.smooth(buffer.critic_loss_buffer[100:],1000), label='smoothed (1000)')
plt.ylim([0,0.05])
plt.ylabel('critic loss')
plt.legend()
plt.grid()

In [None]:
# Last 500 critic loss entries at three smoothing settings.
# NOTE(review): the largest smoothing argument (1000) exceeds the 500 samples
# plotted here — check how misc.smooth behaves in that case.
plt.plot(misc.smooth(buffer.critic_loss_buffer[-500:],1), label='smoothed (1)')
plt.plot(misc.smooth(buffer.critic_loss_buffer[-500:],100), label='smoothed (100)')
plt.plot(misc.smooth(buffer.critic_loss_buffer[-500:],1000), label='smoothed (1000)')
plt.ylabel('critic loss')
plt.legend()
plt.grid()

In [None]:
# Actor loss history with the first 100 entries skipped.
plt.plot(buffer.actor_loss_buffer[100:])


In [None]:
# Display the currently bound deterministic action batch as a NumPy array.
deterministic_action.numpy()

In [None]:
# Variance of each action component across the batch (axis 0); values near zero
# mean the actor outputs almost the same action for every batch element.
deterministic_action.numpy().var(axis=0)

In [None]:
# Display the environment's `timestep` attribute in its current state.
env.timestep

In [None]:
# Manually step the environment once with the currently bound `action`.
# NOTE(review): this overwrites state/reward/done/info and presumably advances
# the env, so re-running the cell changes what later cells see.
state, reward, done, info = env.step(action)


In [None]:
# Re-evaluate the actor on `prev_state` with the current weights, stored under
# a separate name so it can be compared with `deterministic_action`.
deterministic_action2 = policy(env.unflatten_observation(prev_state), actor_model, lower_bound, upper_bound)


In [None]:
# Display the re-evaluated action batch as a NumPy array.
deterministic_action2.numpy()

In [None]:
# Actor output for an all-zero observation (prev_state scaled by 0.0), to see
# whether the action depends on the input at all.
policy(env.unflatten_observation(0.0*prev_state), actor_model, lower_bound, upper_bound)

In [None]:
# Critic sensitivity sweep around `prev_state`: scale the flat observation and
# the deterministic action by integer factors -3..3 and print the critic output
# for the first five batch elements (transposed onto one row).
for k_state in range(-3,4):
    for k_act in range(-3,4):
        critic_inputs = [env.unflatten_observation(k_state*prev_state),
                         k_act*deterministic_action]
        critic_out = critic_model(critic_inputs).numpy()[:5].T
        print(k_state,k_act,critic_out)

In [None]:
# Critic sensitivity sweep around `state`: scale the flat observation and the
# deterministic action by integer factors -3..3 and print the critic output
# for the first five batch elements (transposed onto one row).
for k_state in range(-3,4):
    for k_act in range(-3,4):
        critic_inputs = [env.unflatten_observation(k_state*state),
                         k_act*deterministic_action]
        critic_out = critic_model(critic_inputs).numpy()[:5].T
        print(k_state,k_act,critic_out)

In [None]:
# Compare the critic output for the deterministic action and for its negation
# on the current `state` (first five batch elements each).
print('----------')
for probe_action in (deterministic_action, -deterministic_action):
    critic_out = critic_model([env.unflatten_observation(state), probe_action])
    print(critic_out.numpy()[:5].T)

In [None]:
# Step the environment 15 times with the same `action`; after each step print
# the first five rewards and the critic output for the deterministic action
# and for its negation.
for ii in range(15):
    state, reward, done, info = env.step(action)
    print('----------')
    print(reward.numpy()[:5])
    for probe_action in (deterministic_action, -deterministic_action):
        critic_out = critic_model([env.unflatten_observation(state), probe_action])
        print(critic_out.numpy()[:5].T)

In [None]:
# Rewards of the first five batch elements from the most recent env.step call.
reward.numpy()[:5]

In [None]:
# Last entry along the first axis of the most recent flat `state`.
state[-1]

In [None]:
# Fresh-episode probe: reset the environment, drive it with uniform random
# actions and, after every step, print the first five rewards and the critic
# output for the (fixed) `deterministic_action`.
# NOTE(review): 55 steps exceeds config.t_max (50) and `done` is ignored here —
# confirm that stepping past the end of an episode is intended.
env.reset()
for ii in range(55):
    # Sample over the configured action range instead of a hard-coded [-2, 2) x 2
    random_action = np.random.uniform(low=lower_bound, high=upper_bound,
                                      size=(config.batch_size, num_actions))
    state, reward, done, info = env.step(random_action)
    print('----------')
    print(reward.numpy()[:5])
    print(critic_model([env.unflatten_observation(state),
                          deterministic_action]).numpy()[:5].T)