In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

import numpy as np
import pandas as pd
import tensorflow as tf
import gym
import random

import os
import sys

import pickle

from sklearn.model_selection import KFold


from utils.epsilon_decay import linearly_decaying_epsilon
from models.box2d_models import DQNNetwork
from replay_buffers.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer

from dqn import DQNAgent

## Online DQN

In [None]:
# Keyword arguments for the online agent, collected in one mapping (grouped by
# concern) and unpacked into the DQNAgent constructor below.
online_agent_kwargs = {
    'name': 'LunarLander-v2',

    # models
    'network': DQNNetwork,
    'num_actions': 4,
    'hiddens': [64, 64],
    'activation': 'relu',
    'dueling': False,
    'double': True,
    'gamma': 0.99,

    # optimizers: Adam driven by an inverse-time-decay learning-rate schedule
    'optimizer': tf.keras.optimizers.Adam(
        tf.keras.optimizers.schedules.InverseTimeDecay(5e-4, decay_steps=100000, decay_rate=1)
    ),

    # replay buffer
    'buffer_size': 100000,
    'min_replay_history': 1000,
    'prioritized_replay': True,
    'prioritized_replay_alpha': 0.6,
    'prioritized_replay_beta': 0.4,
    'online': True,
    'persistent_directory': './trajs/dqn/test/',
    'episode_counts_to_save': 100,
    'sample_steps_to_refresh': 500,

    # training params
    'max_training_steps': 1000000,
    'training_steps_to_eval': 1000,
    'batch_size': 64,
    'max_episode_steps': 1000,

    # target model update params
    'tau': 0.999,
    'update_period': 1,
    'target_update_period': 1,

    # exploration params
    # NOTE(review): epsilon_start and epsilon_end are both 0.1, so the start and
    # end values handed to epsilon_fn are identical -- confirm this is intended.
    'epsilon_fn': linearly_decaying_epsilon,
    'epsilon_start': 0.1,
    'epsilon_decay_period': 100000,
    'epsilon_end': 0.1,
    'eval_mode': False,
    'epsilon_eval': 0.001,
}

agent = DQNAgent(**online_agent_kwargs)

In [None]:
# Train the agent built in the previous cell.
# NOTE(review): the constructor was given max_training_steps=1000000, so this is
# presumably a long-running cell -- confirm before a Restart & Run All.
agent.learn()

In [None]:
# Plot rolling statistics of the evaluation metrics stored on the agent.
rewards = pd.Series(agent.eval_episode_rewards)
steps = pd.Series(agent.eval_episode_steps)

# Build each rolling window once and reuse it for both mean and max.
# Window of 100 entries; the first 19 positions are NaN (min_periods=20) and
# therefore do not appear in the plots.
reward_window = rewards.rolling(100, min_periods=20)
step_window = steps.rolling(100, min_periods=20)

fig, axes = plt.subplots(2, 2, figsize=(18, 8))

# (axis, curve, title) for every panel of the 2x2 grid.
panels = [
    (axes[0][0], reward_window.mean(), 'mean reward'),
    (axes[0][1], reward_window.max(), 'max reward'),
    (axes[1][0], step_window.mean(), 'mean step'),
    (axes[1][1], step_window.max(), 'max step'),
]
for ax, curve, title in panels:
    ax.plot(curve)
    ax.set_title(title)
    # x is the position within agent.eval_episode_* -- assumes one entry per
    # evaluation episode; TODO confirm against DQNAgent.
    ax.set_xlabel('evaluation episode')

# Keeps titles and x labels of stacked panels from overlapping; also ends the
# cell on a statement that returns None, so no stray Text(...) repr is echoed.
fig.tight_layout()

In [None]:
# Evaluate the trained agent.
# NOTE(review): `_eval` is underscore-prefixed (private by convention), and 100 is
# presumably the number of evaluation episodes -- confirm against DQNAgent.
agent._eval(100)

In [None]:
# Save the agent under ./saved/ddqn (what is written is defined by DQNAgent.save).
agent.save('./saved/ddqn')

In [None]:
# Build a fresh online agent and train it in the same cell. The keyword
# arguments are collected in one mapping (grouped by concern) and unpacked
# into the DQNAgent constructor.
online_agent_kwargs = {
    'name': 'LunarLander-v2',

    # models
    'network': DQNNetwork,
    'num_actions': 4,
    'hiddens': [64, 64],
    'activation': 'relu',
    'dueling': False,
    'double': True,
    'gamma': 0.99,

    # optimizers: Adam driven by an inverse-time-decay learning-rate schedule
    'optimizer': tf.keras.optimizers.Adam(
        tf.keras.optimizers.schedules.InverseTimeDecay(5e-4, decay_steps=100000, decay_rate=1)
    ),

    # replay buffer
    'buffer_size': 100000,
    'min_replay_history': 1000,
    'prioritized_replay': True,
    'prioritized_replay_alpha': 0.6,
    'prioritized_replay_beta': 0.4,
    'online': True,
    'persistent_directory': './trajs/dqn/test/',
    'episode_counts_to_save': 100,
    'sample_steps_to_refresh': 500,

    # training params
    'max_training_steps': 1000000,
    'training_steps_to_eval': 1000,
    'batch_size': 64,
    'max_episode_steps': 1000,

    # target model update params
    'tau': 0.999,
    'update_period': 1,
    'target_update_period': 1,

    # exploration params
    # NOTE(review): epsilon_start and epsilon_end are both 0.1, so the start and
    # end values handed to epsilon_fn are identical -- confirm this is intended.
    'epsilon_fn': linearly_decaying_epsilon,
    'epsilon_start': 0.1,
    'epsilon_decay_period': 100000,
    'epsilon_end': 0.1,
    'eval_mode': False,
    'epsilon_eval': 0.001,
}

agent = DQNAgent(**online_agent_kwargs)

# Rebinds `agent`, so the previously constructed agent object is no longer
# reachable through this name once the cell runs.
agent.learn()

In [None]:
# Plot rolling statistics of the evaluation metrics stored on the agent.
rewards = pd.Series(agent.eval_episode_rewards)
steps = pd.Series(agent.eval_episode_steps)

# Build each rolling window once and reuse it for both mean and max.
# Window of 100 entries; the first 19 positions are NaN (min_periods=20) and
# therefore do not appear in the plots.
reward_window = rewards.rolling(100, min_periods=20)
step_window = steps.rolling(100, min_periods=20)

fig, axes = plt.subplots(2, 2, figsize=(18, 8))

# (axis, curve, title) for every panel of the 2x2 grid.
panels = [
    (axes[0][0], reward_window.mean(), 'mean reward'),
    (axes[0][1], reward_window.max(), 'max reward'),
    (axes[1][0], step_window.mean(), 'mean step'),
    (axes[1][1], step_window.max(), 'max step'),
]
for ax, curve, title in panels:
    ax.plot(curve)
    ax.set_title(title)
    # x is the position within agent.eval_episode_* -- assumes one entry per
    # evaluation episode; TODO confirm against DQNAgent.
    ax.set_xlabel('evaluation episode')

# Keeps titles and x labels of stacked panels from overlapping; also ends the
# cell on a statement that returns None, so no stray Text(...) repr is echoed.
fig.tight_layout()

In [None]:
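# Disabled one-off step, kept for reference: loads every .pkl trajectory file found in
# `persistent_directory`, flattens them into a single list, and dumps that list to
# ./trajs/dqn/offline/trajs_dqn.pkl.
# NOTE(review): this reads ./trajs/dqn/online/, while the online agents in this notebook are
# configured with persistent_directory='./trajs/dqn/test/' -- confirm the path before re-enabling.
# persistent_directory = './trajs/dqn/online/'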
# files = os.listdir(persistent_directory)
# files = sorted([file for file in files if file.endswith('.pkl')])

# import pickle

# trajs = []
# for file in files:
#     path = persistent_directory + file
#     with open(path, 'rb') as f:
#         trajs.append(pickle.load(f))
        
# trajs = [traj for file in trajs for traj in file] 
# # random.shuffle(trajs)

# with open('./trajs/dqn/offline/trajs_dqn.pkl', 'wb') as f:
#     pickle.dump(trajs, f)

## Offline DQN

In [None]:
# Learning-rate schedule and optimizer for the offline run.
lr = tf.keras.optimizers.schedules.InverseTimeDecay(5e-4, decay_steps=100000, decay_rate=1)
optimizer = tf.keras.optimizers.Adam(lr)

# Keyword arguments for the offline agent (online=False), collected in one
# mapping and unpacked into the DQNAgent constructor below.
# NOTE(review): unlike the online cells, no `network=` argument is passed here;
# presumably DQNAgent falls back to a default -- confirm.
offline_agent_kwargs = {
    'name': 'LunarLander-v2',
    'num_actions': 4,
    'hiddens': [64, 64],
    'activation': 'relu',
    'dueling': False,
    'double': True,
    'gamma': 0.99,

    # optimizers
    'optimizer': optimizer,

    # replay buffer
    'buffer_size': 10000,
    'min_replay_history': 1000,
    'prioritized_replay': False,
    'prioritized_replay_alpha': 0.6,
    'prioritized_replay_beta': 0.4,
    'online': False,
    'persistent_directory': './trajs/dqn/offline/',
    'episode_counts_to_save': 100,
    'sample_steps_to_refresh': 1000000,

    # training params
    'max_training_steps': 500000,
    'training_steps_to_eval': 1000,
    'batch_size': 64,
    'max_episode_steps': 1000,

    # target model update params
    'tau': 0.999,
    'update_period': 1,
    'target_update_period': 1,

    # exploration params
    'epsilon_fn': linearly_decaying_epsilon,
    'epsilon_start': 0.1,
    'epsilon_decay_period': 100000,
    'epsilon_end': 0.1,
    'eval_mode': False,
    'epsilon_eval': 0.001,
}

agent = DQNAgent(**offline_agent_kwargs)

In [None]:
# Train the offline agent built in the previous cell (constructed with online=False).
# NOTE(review): the constructor was given max_training_steps=500000, so this is
# presumably a long-running cell -- confirm before a Restart & Run All.
agent.learn()

In [None]:
# Plot rolling statistics of the evaluation metrics stored on the agent.
rewards = pd.Series(agent.eval_episode_rewards)
steps = pd.Series(agent.eval_episode_steps)

# Build each rolling window once and reuse it for both mean and max.
# Window of 100 entries; the first 19 positions are NaN (min_periods=20) and
# therefore do not appear in the plots.
reward_window = rewards.rolling(100, min_periods=20)
step_window = steps.rolling(100, min_periods=20)

fig, axes = plt.subplots(2, 2, figsize=(18, 8))

# (axis, curve, title) for every panel of the 2x2 grid.
panels = [
    (axes[0][0], reward_window.mean(), 'mean reward'),
    (axes[0][1], reward_window.max(), 'max reward'),
    (axes[1][0], step_window.mean(), 'mean step'),
    (axes[1][1], step_window.max(), 'max step'),
]
for ax, curve, title in panels:
    ax.plot(curve)
    ax.set_title(title)
    # x is the position within agent.eval_episode_* -- assumes one entry per
    # evaluation episode; TODO confirm against DQNAgent.
    ax.set_xlabel('evaluation episode')

# Keeps titles and x labels of stacked panels from overlapping; also ends the
# cell on a statement that returns None, so no stray Text(...) repr is echoed.
fig.tight_layout()