In [6]:
import os
import sys
sys.path.append('/'.join(os.getcwd().split('/')[:-1]))

import pathlib
import pickle

from Agents import QLearningAgent, BayesianQAgent, PSRLAgent, MomentMatchingAgent, UbeNoUnrollAgent
from Environments import DeepSea, WideNarrow, PriorMDP

from tqdm import tqdm_notebook as tqdm

import numpy as np
import matplotlib.pyplot as plt

TabError: inconsistent use of tabs and spaces in indentation (Agents.py, line 89)

In [21]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (8.0, 6.0)

plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=16)
plt.rc('legend', fontsize=16)
plt.rc('figure', titlesize=50)

figsave_loc = '/big/download_figures/QL-epsilon-greedy/'
method = 'Q-Learning'
!mkdir /big/download_figures/QL-epsilon-greedy

mkdir: cannot create directory ‘/big/download_figures/QL-epsilon-greedy’: File exists


# Experiment helper

In [22]:
def run_offline_experiment(environment, pi, num_offline_frames, seed, num_oracle_iter):
    
    np.random.seed(seed)
    
    # Save current environment state
    s0, t0 = environment.s, environment.t
    
    # Initial state
    environment.reset()
    s, t = 0, 0
    
    states, actions, rewards, states_ = [], [], [], []
    
    for n in range(num_offline_frames):
        
        a = pi[s]
        
        # Step environment
        s_, r, t, done = environment.step(a)
        
        states.append(s)
        actions.append(a)
        rewards.append(r)
        states_.append(s_)
        
        s = s_
        
    # Initial state
    environment.reset()
    s, t = 0, 0
    
    agent_sars_ = np.array(states), np.array(actions), np.array(rewards), np.array(states_)
    
    oracle_sars_ = run_oracle_experiment(environment,
                                         seed,
                                         gamma=agent.gamma, 
                                         num_iter=num_oracle_iter,
                                         num_episodes=1,
                                         num_frames_per_episode=num_offline_frames-1)
    
    offline_regret = np.sum((oracle_sars_[2] - agent_sars_[2]))
    
    # Restore environment state
    environment.s, environment.t = s0, t0
    
    return [offline_regret, oracle_sars_, agent_sars_]

In [23]:
def run_experiment(environment,
                   agent,
                   seed,
                   num_episodes,
                   num_frames_per_episode,
                   save_every,
                   num_offline_frames,
                   num_oracle_iter,
                   max_buffer_length=0):
    
    np.random.seed(seed)
    
    # Initial state
    environment.reset()
    s, t = 0, 0

    # Save location for agent
    save_loc = '/big/tabular_results/{}/{}/seed-{}/'.format(environment.get_name(), agent.get_name(), seed)
    pathlib.Path(save_loc).mkdir(parents=True, exist_ok=True)
    
    for n in range(num_episodes):
        for i in range(num_frames_per_episode + 1):
            
            # Take action
            a = agent.take_action(s, t)

            # Step environment
            s_, r, t, done = environment.step(a)

            # Update agent
            agent.observe([t, s, a, r, s_])
            agent.update_after_step(max_buffer_length)

            # Update current state (for agent)
            s = s_
            
            if i % save_every == 0:
                result = run_offline_experiment(environment,
                                                agent.get_greedy_policy(),
                                                num_offline_frames,
                                                np.random.randint(low=0, high=int(1e6)),
                                                num_oracle_iter)
                
                agent.offline_regret = result[0]
                agent.agent_sars_ = result[1]
                agent.oracle_sars_ = result[2]
                
                agent.save_copy(save_loc, '{}_{}'.format(n, i))

In [24]:
def run_oracle_experiment(environment, seed, gamma, num_iter, num_episodes, num_frames_per_episode):
    
    np.random.seed(seed * 10)
    
    # Initial state
    environment.reset()
    s, t = 0, 0
    
    pi, Q = environment.get_optimal_policy(gamma=gamma, num_iter=num_iter)
    
    states, actions, rewards, states_ = [], [], [], []
    
    for n in range(num_episodes):
        for i in range(num_frames_per_episode + 1):
            
            # Take action
            a = pi[s]

            # Step environment
            s_, r, t, done = environment.step(a)

            states.append(s)
            actions.append(a)
            rewards.append(r)
            states_.append(s_)

            # Update current state (for agent)
            s = s_
        
    return np.array(states), np.array(actions), np.array(rewards), np.array(states_)

In [25]:
def load_agent(environment, agent, seed, episode, frame):

    # Load location
    load_name = '/big/tabular_results/{}/{}/seed-{}/chkpt_{}_{}'.format(environment.get_name(),
                                                                        agent.get_name(),
                                                                        seed,
                                                                        episode,
                                                                        frame)
    
    # Load the agent
    fhandle = open(load_name, 'rb')
    agent = pickle.load(fhandle)
    
    return agent

# DeepSea

In [19]:
for anneal_timescale in tqdm([10000, float('inf')]):
    for dither_param in tqdm([1.0]):
        for L in tqdm([4, 8, 12, 16, 20]):
            for seed in tqdm(range(10)):

                # Environment constants
                rew_params = ((0., 0.), (-1e-1 * np.exp(- L / 4), 0.), (1., 0.))
                env_params = {'L'          :  L,
                              'episodic'   :  False,
                              'rew_params' :  rew_params}

                # Define environment
                environment = DeepSea(env_params)
                environment.reset()

                # Agent constants
                agent_params = {'gamma'            : 0.9,
                                'dither_mode'      : 'boltzmann',
                                'dither_param'     : dither_param,
                                'lr'               : 0.1,
                                'Q0'               : 0.0,
                                'T'                : float('inf'),
                                'anneal_timescale' : anneal_timescale,
                                'sa_list'          : environment.sa_list()}

                # Define agent
                agent = QLearningAgent(agent_params)

                # Run experiment
                run_experiment(environment,
                               agent,
                               seed=seed,
                               num_episodes=1,
                               num_frames_per_episode=30000,
                               num_offline_frames=1000,
                               num_oracle_iter=3*L,
                               save_every=500)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

# Plotting

In [7]:
env_name = 'DeepSea-{}'

num_frames = 30000
save_every = 500
color = 'orange'

for epsilon in [0.2]:
    for L in tqdm([4, 8, 12, 16, 20]):

        online_regrets = []
        offline_regrets = []

        for seed in tqdm(range(10)):

            offline_regrets.append([])

            # Environment constants
            rew_params = ((0., 0.), (-1e-1 * np.exp(- L / 4), 0.), (1., 0.))
            env_params = {'L'          :  L,
                          'episodic'   :  False,
                          'rew_params' :  rew_params}

            # Define environment
            environment = DeepSea(env_params)
            environment.reset()

            # Agent constants
            agent_params = {'gamma'            : 0.9,
                            'kappa'            : 1.0,
                            'mu0'              : 0.0,
                            'lamda'            : 1.0,
                            'alpha'            : 2.0,
                            'beta'             : 2.0,
                            'num_pi_iter'      : 40,
                            'T'                : float('inf'),
                            'num_dyn_samples'  : 100,
                            'sa_list'          : environment.sa_list()}

            # Define agent
            agent = PSRLAgent(agent_params)

            oracle_r = run_oracle_experiment(environment,
                                             seed=seed,
                                             gamma=0.9,
                                             num_iter=2*L,
                                             num_episodes=1,
                                             num_frames_per_episode=num_frames)[2]

            agent_ = load_agent(environment, agent, seed=seed, episode=0, frame=num_frames)

            online_regrets.append(np.cumsum(oracle_r - agent_.train_r))

            for frame in np.arange(0, num_frames+1, save_every):
                agent_ = load_agent(environment, agent, seed=seed, episode=0, frame=frame)
                offline_regrets[-1].append(agent_.offline_regret / 1000)

        online_regrets = np.array(online_regrets)
        online_mu = online_regrets.mean(axis=0)
        online_std = online_regrets.var(axis=0)**0.5

        offline_regrets = np.array(offline_regrets)
        offline_mu = offline_regrets.mean(axis=0)
        offline_std = offline_regrets.var(axis=0)**0.5

        plt.figure()
        plt.plot(online_mu, color=color, label='Mean')
        plt.fill_between(np.arange(len(online_mu)),
                         online_mu + online_std,
                         online_mu - online_std,
                         color=color,
                         alpha=0.2,
                         label='$\pm$ St. dev.')

        title = 'QL ($\epsilon = {}$) on DeepSea ($N$ = {})\nOnline oracle regret (cumulative)'.format(epsilon, L)
        plt.title(title.format(L), fontsize=22)
        plt.xlabel('# of timesteps', fontsize=20)
        plt.ylabel('Oracle regret (cumulative)', fontsize=20)
        plt.locator_params(axis='y', nbins=5)
        plt.locator_params(axis='x', nbins=5)
        plt.legend()
        plt.gca().set_xlim(left=0, right=num_frames)
        plt.tight_layout()
        plt.savefig(figsave_loc + 'online-epsilon-{}-'.format(str(epsilon).replace('.', '_')) \
                     + env_name.format(L) + '.pdf')
        plt.show()

        plt.figure()
        plt.plot(np.arange(len(offline_mu)) * save_every, offline_mu, color=color, label='Mean')
        plt.fill_between(np.arange(len(offline_mu)) * save_every,
                         offline_mu + offline_std,
                         offline_mu - offline_std,
                         color=color,
                         alpha=0.2,
                         label='$\pm$ St. dev.')

        title = 'PSRL on DeepSea ($N$ = {})\nOffline oracle regret'.format(epsilon, L)
        plt.title(title.format(L), fontsize=22)
        plt.xlabel('# of timesteps', fontsize=20)
        plt.ylabel('Oracle regret (per step)', fontsize=20)
        plt.locator_params(axis='y', nbins=5)
        plt.locator_params(axis='x', nbins=5)
        plt.gca().set_xlim(left=0, right=num_frames)
        plt.legend()
        plt.tight_layout()
        plt.savefig(figsave_loc + 'offline-epsilon-{}-'.format(str(epsilon).replace('.', '_')) \
                     + env_name.format(L) + '.pdf')
        plt.show()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




FileNotFoundError: [Errno 2] No such file or directory: '/big/tabular_results/DeepSea-E-L_4-mul_0.0-mur_-0.036787944117144235-mut_1.0-sigl_0.0-sigr_0.0-sigt_0.0/QLearningAgent_dither-epsilon-greedy_ditherparam-0.2_gamma-0.9_lr-0.1_Q0-0.0-tscale-10000/seed-1/chkpt_0_10000'