In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

import numpy as np
import pandas as pd
import tensorflow as tf
import gym
import random

import os
import sys

import pickle

from sklearn.model_selection import KFold


from utils.epsilon_decay import linearly_decaying_epsilon
from models.box2d_models import DQNNetwork
from replay_buffers.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer

from default_config import DEFAULT_CONFIG as config

from dqn import DQNAgent
from qr_dqn import QuantileAgent

## SALE
- cross validator
- agent_1, ..., agent_K
- Q-value updater
- advantage learner

### cross validator

In [11]:
class KFoldCV:
    """K-fold cross-validation over a pickled list of trajectories.

    The trajectories are loaded once from ``path``.  ``split`` writes the
    per-fold train/test trajectory lists to disk next to that file, and
    ``update_q`` computes cross-fitted, propensity-corrected Q-values on the
    held-out trajectories of every fold.

    Parameters
    ----------
    path : str
        Location of the pickle file holding the list of trajectories.
    n_splits : int
        Number of folds (also the number of ``agent_k`` directories).
    shuffle : bool, default False
        Forwarded to ``sklearn.model_selection.KFold``.
    random_state : int or None, default None
        Forwarded to ``sklearn.model_selection.KFold``.
    """

    def __init__(self, path, n_splits, shuffle=False, random_state=None):
        self.path = path
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at trajectory files you produced yourself.
        with open(path, 'rb') as f:
            self.trajs = pickle.load(f)
        self.kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _folds(self):
        """Return the list of (train_index, test_index) pairs, computed once.

        Caching matters when ``shuffle=True`` and ``random_state=None``:
        calling ``self.kf.split`` twice would then produce different folds, so
        ``update_q`` would evaluate agents on trajectories they were trained on.
        """
        cached = getattr(self, '_fold_indices', None)
        if cached is None:
            cached = [(train_index, test_index)
                      for train_index, test_index in self.kf.split(self.trajs)]
            self._fold_indices = cached
        return cached

    def _dump_trajs(self, directory, filename, indices):
        """Pickle the trajectories selected by ``indices`` into ``directory``."""
        os.makedirs(directory, exist_ok=True)
        selected = [self.trajs[index] for index in indices]
        with open(os.path.join(directory, filename), 'wb') as f:
            pickle.dump(selected, f)

    def split(self):
        """Write the train/test trajectories of every fold to disk.

        For fold ``k`` the files ``agent_k/train/trajs_train_k.pkl`` and
        ``agent_k/test/trajs_test_k.pkl`` are created in the directory that
        contains ``self.path``.  The directory names are stored, each with a
        trailing path separator, in ``self.agent_paths``, ``self.train_paths``
        and ``self.test_paths``.
        """
        dirname = os.path.abspath(os.path.dirname(self.path))

        # A trailing '' component keeps the trailing separator on each path.
        agent_paths = [os.path.join(dirname, 'agent_{}'.format(k), '')
                       for k in range(self.n_splits)]
        train_paths = [os.path.join(path, 'train', '') for path in agent_paths]
        test_paths = [os.path.join(path, 'test', '') for path in agent_paths]

        for path in agent_paths:
            os.makedirs(path, exist_ok=True)

        for k, (train_index, test_index) in enumerate(self._folds()):
            self._dump_trajs(train_paths[k], 'trajs_train_{}.pkl'.format(k), train_index)
            self._dump_trajs(test_paths[k], 'trajs_test_{}.pkl'.format(k), test_index)

        self.agent_paths = agent_paths
        self.train_paths = train_paths
        self.test_paths = test_paths

    # agents, behavior cloning, density ratios
    def update_q(self, agents, bcs, density_ratios):
        """Compute propensity-corrected Q-values on each fold's held-out data.

        For every fold ``k`` the held-out transitions are scored with
        ``agents[k]``; the Q-value of the action actually taken is shifted by
        the one-step TD error divided by the behavior propensity, with the
        correction clipped to [-100, 100].

        Parameters
        ----------
        agents : sequence
            ``agents[k].model(x).q_values`` must give one row of Q-values per
            input state, and ``agents[k].config['gamma']`` the discount factor.
        bcs : sequence
            ``bcs[k].policy(states, actions)`` gives the propensity scores.
            Assumed to be one score per transition -- TODO confirm.
        density_ratios
            Currently unused; kept for interface compatibility.

        Returns
        -------
        list
            ``[states, actions, qvalues]`` concatenated over all folds, in fold
            order.

        Notes
        -----
        Each transition is read as ``(state, action, reward, next_state, ...)``.
        The bootstrap term is applied to every transition; no terminal flag is
        consulted.  NOTE(review): confirm whether terminal transitions should
        drop the bootstrap term.
        """
        states_, actions_, qvalues_ = [], [], []
        for k, (train_index, test_index) in enumerate(self._folds()):
            transitions = [transition
                           for index in test_index
                           for transition in self.trajs[index]]
            states = np.array([transition[0] for transition in transitions])
            actions = np.array([transition[1] for transition in transitions]).astype(int)
            rewards = np.array([transition[2] for transition in transitions], dtype=np.float64)
            next_states = np.array([transition[3] for transition in transitions])

            # Work on a writable NumPy copy: framework tensors do not support
            # the in-place indexed update performed below.
            q_vals = np.array(agents[k].model(states).q_values, dtype=np.float64)
            rows = np.arange(actions.shape[0])
            chosen_q_vals = q_vals[rows, actions]
            next_vals = np.max(np.asarray(agents[k].model(next_states).q_values), axis=1)
            # TD error of the taken action (not of the whole Q-value row).
            td_errors = rewards + agents[k].config['gamma'] * next_vals - chosen_q_vals

            pscores = np.asarray(bcs[k].policy(states, actions))
            # 1e-2 keeps the ratio finite when a propensity is close to zero.
            q_vals[rows, actions] += np.clip(td_errors / (pscores + 1e-2), -100, 100)

            states_.append(states)
            actions_.append(actions)
            qvalues_.append(q_vals)

        # Concatenate along the transition axis; folds may differ in size.
        states = np.concatenate(states_, axis=0)
        actions = np.concatenate(actions_, axis=0)
        qvalues = np.concatenate(qvalues_, axis=0)

        return [states, actions, qvalues]

In [12]:
# Pickled trajectories to be split into folds.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = '/data1/Prophet/sluo/projects/SALE/result/dqn/trajs_dqn_pr.pkl'

# Two shuffled folds with a fixed seed; split() writes the per-fold train/test pickles to disk.
kf = KFoldCV(path, n_splits=2, shuffle=True, random_state=123456789)
kf.split()

In [14]:
# Show the directories recorded by kf.split().
kf.agent_paths, kf.train_paths, kf.test_paths

(['/data1/Prophet/sluo/projects/SALE/result/dqn/agent_0',
  '/data1/Prophet/sluo/projects/SALE/result/dqn/agent_1'],
 ['/data1/Prophet/sluo/projects/SALE/result/dqn/agent_0/train/',
  '/data1/Prophet/sluo/projects/SALE/result/dqn/agent_1/train/'],
 ['/data1/Prophet/sluo/projects/SALE/result/dqn/agent_0/test/',
  '/data1/Prophet/sluo/projects/SALE/result/dqn/agent_1/test/'])

## test one case

In [37]:
# Settings for a single trial agent. These assignments mutate the shared
# DEFAULT_CONFIG object imported as `config`, so they persist into later cells.
# NOTE(review): 'online' = False presumably selects training from stored trajectories -- confirm.
config['online'] = False
config['max_training_steps'] = 500000
config['lr'] = 1e-3
config['decay_steps'] = 1000000

# Both settings point at the same directory.
config['persistent_directory'] = '/data1/Prophet/sluo/projects/SALE/result/dqn/agent/'
config['checkpoint_path'] = '/data1/Prophet/sluo/projects/SALE/result/dqn/agent/'

In [38]:
# Build the quantile-regression agent for LunarLander-v2 (4 discrete actions) from the config above.
agent = QuantileAgent(name='LunarLander-v2', num_actions=4, config=config)



Loaded trajectories from load path: /data1/Prophet/sluo/projects/SALE/result/dqn/agent/trajs_dqn_pr.pkl!
Refresh buffer every 1000000 sampling!


In [None]:
# Run training; progress is printed every 1000 timesteps (see the log below).
agent.learn()

------------------------------------------------
timestep 1000
learning_rate 0.000999
mean reward (100 episodes) -440.588287
max reward (100 episodes) 3.351513
mean step (100 episodes) 122.600000
max step (100 episodes) 173.000000
------------------------------------------------
timestep 2000
learning_rate 0.000998
mean reward (100 episodes) -307.685373
max reward (100 episodes) 222.296944
mean step (100 episodes) 165.000000
max step (100 episodes) 314.000000
------------------------------------------------
timestep 3000
learning_rate 0.000997
mean reward (100 episodes) -274.649304
max reward (100 episodes) 222.296944
mean step (100 episodes) 177.866667
max step (100 episodes) 314.000000
------------------------------------------------
timestep 4000
learning_rate 0.000996
mean reward (100 episodes) -254.249604
max reward (100 episodes) 222.296944
mean step (100 episodes) 204.800000
max step (100 episodes) 350.000000
------------------------------------------------
timestep 5000
learnin

------------------------------------------------
timestep 35000
learning_rate 0.000966
mean reward (100 episodes) 75.340695
max reward (100 episodes) 275.506534
mean step (100 episodes) 394.280000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 36000
learning_rate 0.000965
mean reward (100 episodes) 76.606297
max reward (100 episodes) 275.506534
mean step (100 episodes) 366.690000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 37000
learning_rate 0.000964
mean reward (100 episodes) 74.045190
max reward (100 episodes) 275.506534
mean step (100 episodes) 342.700000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 38000
learning_rate 0.000963
mean reward (100 episodes) 80.720583
max reward (100 episodes) 275.506534
mean step (100 episodes) 323.280000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 39000
lear

saving model weights at /data1/Prophet/sluo/projects/SALE/result/dqn/agent/dqn_70000.ckpt
------------------------------------------------
timestep 70000
learning_rate 0.000935
mean reward (100 episodes) -81.535343
max reward (100 episodes) 75.444684
mean step (100 episodes) 88.550000
max step (100 episodes) 191.000000
------------------------------------------------
timestep 71000
learning_rate 0.000934
mean reward (100 episodes) -86.765133
max reward (100 episodes) 75.444684
mean step (100 episodes) 86.850000
max step (100 episodes) 191.000000
------------------------------------------------
timestep 72000
learning_rate 0.000933
mean reward (100 episodes) -91.795952
max reward (100 episodes) 75.444684
mean step (100 episodes) 85.250000
max step (100 episodes) 191.000000
------------------------------------------------
timestep 73000
learning_rate 0.000932
mean reward (100 episodes) -96.440340
max reward (100 episodes) 75.444684
mean step (100 episodes) 84.000000
max step (100 episode

------------------------------------------------
timestep 104000
learning_rate 0.000906
mean reward (100 episodes) 115.908320
max reward (100 episodes) 298.320503
mean step (100 episodes) 264.770000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 105000
learning_rate 0.000905
mean reward (100 episodes) 124.155426
max reward (100 episodes) 298.320503
mean step (100 episodes) 288.050000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 106000
learning_rate 0.000904
mean reward (100 episodes) 129.664459
max reward (100 episodes) 298.320503
mean step (100 episodes) 293.570000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 107000
learning_rate 0.000903
mean reward (100 episodes) 137.626328
max reward (100 episodes) 298.320503
mean step (100 episodes) 308.020000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 10

------------------------------------------------
timestep 138000
learning_rate 0.000879
mean reward (100 episodes) 110.105396
max reward (100 episodes) 298.904891
mean step (100 episodes) 565.730000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 139000
learning_rate 0.000878
mean reward (100 episodes) 110.119609
max reward (100 episodes) 298.904891
mean step (100 episodes) 570.910000
max step (100 episodes) 1000.000000
saving model weights at /data1/Prophet/sluo/projects/SALE/result/dqn/agent/dqn_140000.ckpt
------------------------------------------------
timestep 140000
learning_rate 0.000877
mean reward (100 episodes) 106.925118
max reward (100 episodes) 286.325140
mean step (100 episodes) 568.740000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 141000
learning_rate 0.000876
mean reward (100 episodes) 103.069714
max reward (100 episodes) 286.325140
mean step (100 episodes) 565.090000
max s

------------------------------------------------
timestep 172000
learning_rate 0.000853
mean reward (100 episodes) 54.019665
max reward (100 episodes) 288.536793
mean step (100 episodes) 545.230000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 173000
learning_rate 0.000853
mean reward (100 episodes) 55.812389
max reward (100 episodes) 288.536793
mean step (100 episodes) 545.980000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 174000
learning_rate 0.000852
mean reward (100 episodes) 56.063156
max reward (100 episodes) 288.536793
mean step (100 episodes) 543.460000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 175000
learning_rate 0.000851
mean reward (100 episodes) 50.991514
max reward (100 episodes) 288.536793
mean step (100 episodes) 556.280000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 176000

------------------------------------------------
timestep 206000
learning_rate 0.000829
mean reward (100 episodes) 176.920928
max reward (100 episodes) 316.000206
mean step (100 episodes) 510.180000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 207000
learning_rate 0.000829
mean reward (100 episodes) 181.243080
max reward (100 episodes) 316.000206
mean step (100 episodes) 483.900000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 208000
learning_rate 0.000828
mean reward (100 episodes) 180.457500
max reward (100 episodes) 301.838766
mean step (100 episodes) 489.410000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 209000
learning_rate 0.000827
mean reward (100 episodes) 182.323483
max reward (100 episodes) 301.838766
mean step (100 episodes) 483.190000
max step (100 episodes) 1000.000000
saving model weights at /data1/Prophet/sluo/projects/SALE/re

saving model weights at /data1/Prophet/sluo/projects/SALE/result/dqn/agent/dqn_240000.ckpt
------------------------------------------------
timestep 240000
learning_rate 0.000806
mean reward (100 episodes) 42.043888
max reward (100 episodes) 311.257263
mean step (100 episodes) 321.000000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 241000
learning_rate 0.000806
mean reward (100 episodes) 40.519939
max reward (100 episodes) 311.257263
mean step (100 episodes) 319.960000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 242000
learning_rate 0.000805
mean reward (100 episodes) 34.060829
max reward (100 episodes) 311.257263
mean step (100 episodes) 317.370000
max step (100 episodes) 1000.000000
------------------------------------------------
timestep 243000
learning_rate 0.000805
mean reward (100 episodes) 35.087178
max reward (100 episodes) 311.257263
mean step (100 episodes) 292.310000
max step 

In [None]:
# Rolling summaries (window of 100 episodes, at least 20 required) of the
# evaluation rewards and episode lengths recorded by the agent.
rewards = pd.Series(agent.eval_episode_rewards)
steps = pd.Series(agent.eval_episode_steps)

fig, axes = plt.subplots(2, 2, figsize=(18, 8))

# (axis, series, rolling statistic, title) for each of the four panels.
panels = [
    (axes[0][0], rewards, 'mean', 'mean reward'),
    (axes[0][1], rewards, 'max', 'max reward'),
    (axes[1][0], steps, 'mean', 'mean step'),
    (axes[1][1], steps, 'max', 'max step'),
]
for ax, series, stat, title in panels:
    window = series.rolling(100, min_periods=20)
    ax.plot(getattr(window, stat)())
    ax.set_title(title)
    ax.set_xlabel('evaluation episode')

fig.tight_layout()

## agent_1, ..., agent_K

In [15]:
# Per-fold agent directories; used below as each agent's checkpoint path.
kf.agent_paths

['/data1/Prophet/sluo/projects/SALE/result/dqn/agent_0',
 '/data1/Prophet/sluo/projects/SALE/result/dqn/agent_1']

In [None]:
import copy

# Shared training settings for the per-fold agents (mutates the shared `config`).
config['online'] = False
config['max_training_steps'] = 200000
config['lr'] = 5e-4
config['decay_steps'] = 100000

# Train one agent per fold and keep it: the behavior-cloning and Q-value
# update cells below index into `agents`.
agents = []
for idx in range(kf.n_splits):
    # Deep copy so the per-fold paths do not leak into the shared config.
    config_idx = copy.deepcopy(config)
    config_idx['persistent_directory'] = kf.train_paths[idx]
    config_idx['checkpoint_path'] = kf.agent_paths[idx]
    # NOTE(review): kf.split() writes 'trajs_train_{idx}.pkl' into the train
    # directory; confirm the agent is configured to load that file name.

    agent_idx = QuantileAgent(name='LunarLander-v2', num_actions=4, config=config_idx)
    agent_idx.learn()
    agents.append(agent_idx)

## behavior cloning

In [None]:
# Fit one behavior-cloning model per fold on that fold agent's replay buffer.
# NOTE(review): `BehaviorCloning` is not imported anywhere in the visible
# notebook, and this cell requires a list `agents` holding the per-fold agents.
bcs = []

for k in range(kf.n_splits):
    bc = BehaviorCloning(num_actions=agents[k].num_actions, verbose=1)
    bc.train(agents[k].replay_buffer)
    bcs.append(bc)

## Q-value updater

In [None]:
# Compute the updated Q-values on each fold's held-out trajectories.
# The third argument (density ratios) is passed as None.
states, actions, qvalues = kf.update_q(agents, bcs, None)

## advantage learner

In [None]:
# NOTE(review): `AdvantageLearner` is neither defined nor imported in the visible notebook.
adv_learner = AdvantageLearner()