In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

import numpy as np
import pandas as pd
import tensorflow as tf
import gym
import random
import os
import sys

from utils.epsilon_decay import linearly_decaying_epsilon
from models.box2d_models import DQNNetwork, MultiHeadQNetwork
from replay_buffers.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer

from multi_head_dqn import MultiHeadDQNAgent

## Online REM

In [28]:
# Online multi-head agent on LunarLander-v2 (online=True).
agent = MultiHeadDQNAgent(
    name='LunarLander-v2',
    network=MultiHeadQNetwork,
    num_actions=4,
    hiddens=[64, 64],
    activation='relu',
    num_heads=5,
    num_convex_combinations=100,
    double=True,
    gamma=0.99,
    # optimizer: Adam with an inverse-time schedule, lr = 5e-4 / (1 + step / 100000)
    # (the training log prints 0.000495 at timestep 1000, matching this formula)
    optimizer=tf.keras.optimizers.Adam(
        tf.keras.optimizers.schedules.InverseTimeDecay(
            5e-4, decay_steps=100000, decay_rate=1
        )
    ),
    # replay buffer
    buffer_size=1000000,
    min_replay_history=1000,
    prioritized_replay=False,
    prioritized_replay_alpha=0.6,
    prioritized_replay_beta=0.4,
    online=True,
    persistent_directory='./trajs/multi_head_dqn/test/',
    # the constructor prints "Save buffer every 100 episodes!" for this value
    episode_counts_to_save=100,
    sample_steps_to_refresh=10000,
    # training params
    max_training_steps=200000,
    training_steps_to_eval=1000,
    batch_size=64,
    max_episode_steps=1000,
    reward_clip=200,
    grad_clip=40,
    # stopping criteria
    target_mean_episode_reward=500,
    # target model update params
    tau=0.999,
    update_period=1,
    target_update_period=1,
    # exploration params: start and end are both 0.1, and the training log
    # reports exploration 0.100000 at every checkpoint
    epsilon_fn=linearly_decaying_epsilon,
    epsilon_start=0.1,
    epsilon_decay_period=200000,
    epsilon_end=0.1,
    eval_mode=False,
    epsilon_eval=0.001,
)

Save buffer every 100 episodes!




In [29]:
# Display the `max_episode_steps` value stored on the agent's environment spec.
agent.env.spec.max_episode_steps

1000

In [None]:
# Run the agent's training loop; it prints periodic statistics
# (episodes, timestep, exploration, learning_rate, reward and step summaries).
agent.learn()

------------------------------------------------
episodes 11
timestep 1000
exploration 0.100000
learning_rate 0.000495
mean reward (100 episodes) -1232.616104
max reward (100 episodes) -240.500045
mean step (100 episodes) 164.600000
max step (100 episodes) 272.000000
------------------------------------------------
episodes 26
timestep 2000
exploration 0.100000
learning_rate 0.000490
mean reward (100 episodes) -691.820608
max reward (100 episodes) -123.150411
mean step (100 episodes) 120.000000
max step (100 episodes) 272.000000
------------------------------------------------
episodes 37
timestep 3000
exploration 0.100000
learning_rate 0.000485
mean reward (100 episodes) -666.565548
max reward (100 episodes) -123.150411
mean step (100 episodes) 272.200000
max step (100 episodes) 747.000000
------------------------------------------------
episodes 39
timestep 4000
exploration 0.100000
learning_rate 0.000481
mean reward (100 episodes) -536.344680
max reward (100 episodes) -52.943780
mea

------------------------------------------------
episodes 104
timestep 32000
exploration 0.100000
learning_rate 0.000379
mean reward (100 episodes) -112.946332
max reward (100 episodes) 211.704709
mean step (100 episodes) 719.930000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 106
timestep 33000
exploration 0.100000
learning_rate 0.000376
mean reward (100 episodes) -104.357434
max reward (100 episodes) 211.704709
mean step (100 episodes) 723.340000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 108
timestep 34000
exploration 0.100000
learning_rate 0.000373
mean reward (100 episodes) -90.438301
max reward (100 episodes) 211.704709
mean step (100 episodes) 730.780000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 110
timestep 35000
exploration 0.100000
learning_rate 0.000370
mean reward (100 episodes) -85.188464
max reward (100 episodes) 211.70470

------------------------------------------------
episodes 175
timestep 63000
exploration 0.100000
learning_rate 0.000307
mean reward (100 episodes) -14.639735
max reward (100 episodes) 271.225236
mean step (100 episodes) 711.360000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 177
timestep 64000
exploration 0.100000
learning_rate 0.000305
mean reward (100 episodes) -14.203390
max reward (100 episodes) 271.225236
mean step (100 episodes) 725.620000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 179
timestep 65000
exploration 0.100000
learning_rate 0.000303
mean reward (100 episodes) -15.546680
max reward (100 episodes) 271.225236
mean step (100 episodes) 745.680000
max step (100 episodes) 1000.000000
------------------------------------------------
episodes 181
timestep 66000
exploration 0.100000
learning_rate 0.000301
mean reward (100 episodes) -24.874921
max reward (100 episodes) 271.225236


In [None]:
# Rolling summaries of the agent's recorded evaluation rewards and step counts.
ROLLING_WINDOW = 100       # number of consecutive entries per rolling window
ROLLING_MIN_PERIODS = 20   # entries required before a rolling value is produced

rewards = pd.Series(agent.eval_episode_rewards)
steps = pd.Series(agent.eval_episode_steps)

fig, axes = plt.subplots(2, 2, figsize=(18, 8))

# One series per subplot row, one statistic per subplot column. This replaces
# four copy-pasted plot/title pairs, so the window settings live in one place.
panels = [(rewards, 'reward'), (steps, 'step')]
for ax_row, (series, label) in zip(axes, panels):
    rolling = series.rolling(ROLLING_WINDOW, min_periods=ROLLING_MIN_PERIODS)
    for ax, stat in zip(ax_row, ('mean', 'max')):
        ax.plot(getattr(rolling, stat)())
        ax.set_title('{} {}'.format(stat, label))
        ax.set_xlabel('eval episode index')
        ax.set_ylabel(label)

# Ending on a statement that returns None keeps a stray `Text(...)` repr out
# of the cell output and stops the added axis labels from overlapping.
fig.tight_layout()

In [None]:
# Call the agent's private `_eval` method with argument 100
# (presumably the number of evaluation episodes -- TODO confirm against MultiHeadDQNAgent._eval).
agent._eval(100)

## Offline REM

In [None]:
# Offline agent (online=False). The learning rate starts at 5e-5 and follows
# an inverse-time schedule: lr = 5e-5 / (1 + step / 100000).
lr = tf.keras.optimizers.schedules.InverseTimeDecay(
    5e-5, decay_steps=100000, decay_rate=1
)
optimizer = tf.keras.optimizers.Adam(lr)

# NOTE(review): unlike the online cell, `network`, `double`, `reward_clip`,
# `grad_clip` and `target_mean_episode_reward` are not passed here; presumably
# the constructor defaults apply -- confirm against MultiHeadDQNAgent.
# NOTE(review): the online cell uses './trajs/multi_head_dqn/test/' while this
# cell points at './trajs/multi_head_dqn' -- confirm this is the intended
# trajectory directory.
agent = MultiHeadDQNAgent(
    name='LunarLander-v2',
    num_actions=4,
    hiddens=[64, 64],
    activation='relu',
    num_heads=100,
    num_convex_combinations=100,
    gamma=0.99,
    # optimizers
    optimizer=optimizer,
    # replay buffer
    buffer_size=100000,
    min_replay_history=1000,
    prioritized_replay=False,
    prioritized_replay_alpha=0.6,
    prioritized_replay_beta=0.4,
    online=False,
    persistent_directory='./trajs/multi_head_dqn',
    episode_counts_to_save=100,
    sample_steps_to_refresh=500,
    # training params
    max_training_steps=500000,
    training_steps_to_eval=1000,
    batch_size=64,
    max_episode_steps=1000,
    # target model update params
    tau=0.999,
    update_period=1,
    target_update_period=1,
    # exploration params (start and end are both 0.1)
    epsilon_fn=linearly_decaying_epsilon,
    epsilon_start=0.1,
    epsilon_decay_period=100000,
    epsilon_end=0.1,
    eval_mode=False,
    epsilon_eval=0.001,
)

In [None]:
import numpy as np
import tensorflow as tf

class WeightNetwork(tf.keras.Model):
    """MLP mapping (state, action, future state) triples to one scalar each.

    The action is one-hot encoded and concatenated with the two state tensors
    along the last axis. The result passes through the hidden Dense layers and
    a final linear unit.

    Args:
        num_actions: Number of discrete actions; depth of the one-hot encoding.
        hiddens: Sequence of hidden-layer widths, one Dense layer per entry.
        activation: Activation passed to every hidden Dense layer.
        name: Name forwarded to the Keras model constructor.
    """

    def __init__(self, num_actions, 
                 hiddens=[16, 16], 
                 activation='relu', 
                 name='weight'):
        super().__init__(name=name)

        self.num_actions = num_actions
        # Keep a private copy. The default list is one object shared by every
        # call, so storing it by reference would let an in-place edit of one
        # instance's `hiddens` change the default of later instances.
        self.hiddens = list(hiddens)
        self.activation = activation
        # defining layers
        self.dense_layers = [tf.keras.layers.Dense(units=hidden, activation=activation)
                             for hidden in self.hiddens]
        # Linear output: one unconstrained scalar per input row.
        self.out = tf.keras.layers.Dense(units=1, activation=None)

    def call(self, states, actions, future_states):
        """Evaluate the network on a batch of triples.

        Args:
            states: Batch of states.
            actions: Batch of action indices, one-hot encoded with depth
                `num_actions`.
            future_states: Batch of states concatenated after the action.

        Returns:
            The output of the final Dense(1) layer, one value per input row.
        """
        one_hot_actions = tf.one_hot(actions, depth=self.num_actions, 
                                     on_value=1.0, off_value=0.0)
        x = tf.concat([states, one_hot_actions, future_states], axis=-1)
        for dense in self.dense_layers:
            x = dense(x)
        return self.out(x)

class VisitationRatioModel:
    """Trains a weight model on replay-buffer transitions with a kernel loss.

    `fit` repeatedly samples a batch, evaluates `_compute_loss` under a
    gradient tape and applies the gradients to `model` with `optimizer`.

    Args:
        model: Callable as `model(states, actions, future_states)`; must
            expose `trainable_variables`.
        optimizer: Object providing `apply_gradients`.
        replay_buffer: Object whose `sample(n)` result is indexable, with
            states at index 0, actions at index 1 and next states at index 3.
        target_policy: Callable `(states, actions)`; numerator of the policy
            ratio.
        behavior_policy: Callable `(states, actions)`; denominator of the
            policy ratio.
        medians: Optional per-feature kernel scales. When None they are
            estimated on the first call to `fit`.
    """

    def __init__(self, model, optimizer, replay_buffer,
                 target_policy, behavior_policy, medians=None):
        self.model = model
        self.optimizer = optimizer
        self.replay_buffer = replay_buffer
        self.target_policy = target_policy
        self.behavior_policy = behavior_policy
        self.medians = medians
        # One entry per `fit` iteration, each the result of `loss.numpy()`.
        self.losses = []

    def _compute_medians(self, n=100):
        """Estimate per-feature kernel scales from `n` sampled transitions.

        Forms all n*n pairwise differences between the concatenated
        (state, action, next state) rows and returns the per-column median of
        their absolute values plus 1e-2.
        """
        transitions = self.replay_buffer.sample(n)
        states, actions, next_states = transitions[0], transitions[1], transitions[3]
        mat = tf.concat([states, actions[:,np.newaxis], next_states], axis=-1)  # n x p feature rows
        dxx = tf.repeat(mat, n, axis=0) - tf.tile(mat, [n, 1])                  # n^2 x p: row i*n+j is mat[i] - mat[j]
        medians = np.median(tf.math.abs(dxx), axis=0) + 1e-2 # length p; the 1e-2 offset keeps every scale above zero
        return medians
    
    def _normalize(self, weights, batch_size):
        """Divide each group of `batch_size` consecutive weights by its sum.

        The flat input is viewed as a (batch_size, batch_size) matrix, each row
        is divided by its row sum plus 1e-6, and the result is flattened again.
        """
        weights = tf.reshape(weights, [batch_size, batch_size])
        weights_sum = tf.math.reduce_sum(weights, axis=1, keepdims=True) + 1e-6
        weights = weights / weights_sum
        return tf.reshape(weights, [batch_size**2])
        
    def _compute_loss(self, states, actions, next_states, gamma):
        """Kernel loss for one batch of n transitions.

        With n = states.shape[0], "repeat" tensors hold each input row n times
        in a row and "tile" tensors hold the whole batch n times, so flat index
        i*n + j pairs row i of a repeat tensor with row j of a tile tensor.

        Args:
            states: 2-D batch of states (it is tiled with multiples [n, 1]).
            actions: 1-D batch of action indices.
            next_states: 2-D batch of next states.
            gamma: Scalar multiplying the weighted policy ratio and appearing
                as (1 - gamma) in the second and third loss terms.

        Returns:
            Scalar loss: the sum of the three kernel terms below.

        Note:
            The largest intermediate (`dxx1`) has n^4 rows, so memory grows
            quickly with the batch size.
        """
        batch_size = states.shape[0]
        
        states_r = tf.repeat(states, batch_size, axis=0)      # n^2 x ...: states[i] at index i*n+j
        actions_r = tf.repeat(actions, batch_size)            # n^2: actions[i] at index i*n+j
        states_t = tf.tile(states, [batch_size, 1])           # n^2 x ...: states[j] at index i*n+j
        next_states_t = tf.tile(next_states, [batch_size, 1]) # n^2 x ...: next_states[j] at index i*n+j
        
        ### state visitation ratios & policy ratios & deltas
        weights = self.model(states_r, actions_r, states_t)           # model output for (states[i], actions[i], states[j])
        next_weights = self.model(states_r, actions_r, next_states_t) # model output for (states[i], actions[i], next_states[j])
        weights = self._normalize(weights, batch_size)                # n^2, each block of n normalised by its sum
        next_weights = self._normalize(next_weights, batch_size)      # n^2, same normalisation
        # The 1e-3 keeps the denominator away from zero.
        policy_ratios = self.target_policy(states, actions) / (self.behavior_policy(states, actions)+1e-3) # n
        policy_ratios = tf.tile(policy_ratios, [batch_size])          # n^2: ratio[j] at index i*n+j (tiled, not repeated)
        policy_ratios = tf.cast(policy_ratios, weights.dtype)
        deltas = gamma * weights * policy_ratios - next_weights       # n^2
        
        ### kernels
        # Every kernel value is exp(-sum_d |x_d - y_d| / medians_d) for a pair of feature rows x, y.
        actions_r = tf.cast(actions_r, states.dtype)
        mat1 = tf.concat([states, actions[:,None], next_states], axis=-1)        # n x p: (states[m], actions[m], next_states[m])
        mat2 = tf.concat([states_r, actions_r[:,None], next_states_t], axis=-1)  # n^2 x p: (states[i], actions[i], next_states[j])
        dxx1 = tf.repeat(mat2, batch_size**2, axis=0) - tf.tile(mat2, [batch_size**2, 1]) # n^4 x p: mat2[k] - mat2[l] at index k*n^2+l
        dxx2 = tf.repeat(mat1, batch_size**2, axis=0) - tf.tile(mat2, [batch_size, 1])    # n^3 x p: mat1[m] - mat2[k] at index m*n^2+k
        dxx3 = tf.repeat(mat1, batch_size, axis=0)    - tf.tile(mat1, [batch_size, 1])    # n^2 x p: mat1[m] - mat1[m'] at index m*n+m'
        dxx1 = tf.exp(-tf.math.reduce_sum(tf.math.abs(dxx1)/self.medians, axis=-1)) # n^4 kernel values
        dxx2 = tf.exp(-tf.math.reduce_sum(tf.math.abs(dxx2)/self.medians, axis=-1)) # n^3 kernel values
        dxx3 = tf.exp(-tf.math.reduce_sum(tf.math.abs(dxx3)/self.medians, axis=-1)) # n^2 kernel values
        
        ### final loss
        # Term 1 weights each kernel value by deltas[k] * deltas[l]; term 2 by deltas[k]; term 3 is unweighted.
        dxx1 = tf.repeat(deltas, batch_size**2) * tf.tile(deltas, [batch_size**2]) * dxx1
        dxx2 = tf.tile(deltas, [batch_size]) * dxx2
        loss = tf.reduce_sum(dxx1)/batch_size**4 + \
               2*(1-gamma)*tf.reduce_sum(dxx2)/batch_size**3 + \
               (1-gamma)**2*tf.reduce_sum(dxx3)/batch_size**2
        
        return loss
        
    def fit(self, batch_size=32, gamma=0.99, max_iter=100):
        """Run `max_iter` gradient steps on freshly sampled batches.

        Args:
            batch_size: Number of transitions sampled per step.
            gamma: Forwarded to `_compute_loss`.
            max_iter: Number of optimisation steps.

        Side effects:
            Sets `self.medians` when it is None, updates the model's trainable
            variables and appends `loss.numpy()` to `self.losses` every step.
        """
        if self.medians is None:
            self.medians = self._compute_medians()
            
        for i in range(max_iter):
            transitions = self.replay_buffer.sample(batch_size)
            states, actions, next_states = transitions[0], transitions[1], transitions[3]
            ##### compute loss function #####
            with tf.GradientTape() as tape:
                loss = self._compute_loss(states, actions, next_states, gamma)
            dw = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(dw, self.model.trainable_variables))
            
            # Stored as a NumPy value, so consumers need no further `.numpy()` call.
            self.losses.append(loss.numpy())

In [None]:
def behavior_policy(states, actions):
    """Return the constant probability 0.25 for every state in the batch.

    `actions` is accepted for signature parity with `target_policy` but is
    not used.
    """
    batch_len = len(states)
    action_prob = 0.25
    return action_prob * tf.ones(batch_len)

def target_policy(states, actions):
    """Indicate, per transition, whether the action is the agent's greedy one.

    Args:
        states: Batch of states passed to `agent.model`.
        actions: Batch of action indices, one per state.

    Returns:
        Integer NumPy array holding 1 where `actions` equals the argmax over
        axis 1 of `agent.model(states).q_values`, and 0 elsewhere.
    """
    greedy_actions = np.argmax(agent.model(states).q_values, axis=1)
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24, where it raises AttributeError. The builtin gives the same
    # dtype on every NumPy version. `np.equal` always returns an ndarray, so
    # `.astype` is available on the result.
    return np.equal(actions, greedy_actions).astype(int)

In [None]:
# Weight network for 4 discrete actions, optimised by Adam with an
# inverse-time learning-rate schedule starting at 1e-3.
model = WeightNetwork(num_actions=4)
lr = tf.keras.optimizers.schedules.InverseTimeDecay(
    1e-3, decay_steps=100000, decay_rate=1
)
optimizer = tf.keras.optimizers.Adam(lr)

# Estimator fed from the agent's replay buffer; `medians` keeps its default.
vrk = VisitationRatioModel(
    model=model,
    optimizer=optimizer,
    replay_buffer=agent.replay_buffer,
    target_policy=target_policy,
    behavior_policy=behavior_policy,
)

In [None]:
# Draw one batch of 32 transitions and unpack the fields at indices 0, 1 and 3.
transitions = agent.replay_buffer.sample(32)
states, actions, next_states = (transitions[idx] for idx in (0, 1, 3))

In [None]:
# Run 100 optimisation steps on batches of 32 sampled transitions with gamma=0.99.
vrk.fit(batch_size=32, gamma=0.99, max_iter=100)

In [None]:
# `VisitationRatioModel.fit` already appends `loss.numpy()`, so the entries of
# `vrk.losses` are NumPy values with no `.numpy()` method, and calling it
# again raises AttributeError. `float` converts each entry to a plain Python float.
losses = [float(loss) for loss in vrk.losses]