In [1]:
# imports
from OfflineSRL.MDPDataset.old_dataset import *
from OfflineSRL.MDP.old_MDP import MDP
from OfflineSRL.MDP.ChainBandit import ChainBanditMDP
from OfflineSRL.MDP.ChainBanditState import ChainBanditState
from OfflineSRL.BPolicy.ChainBanditPolicy import ChainBanditPolicy
from OfflineSRL.OfflineLearners.offlineLearners import VI, PVI, SPVI, PesBandit

import copy
import numpy as np

In [2]:
def get_dataset(horizon = 3, neps = 50):
    """Roll out the behaviour policy on a chain bandit and package the log.

    A fresh ``ChainBanditMDP(num_states=horizon)`` and a
    ``ChainBanditPolicy`` built on it are created, then ``neps`` episodes
    of ``horizon + 2`` steps each are simulated. Every step records the
    state's ``num_list``, the chosen action, the received reward and a
    terminal flag (1 on the last step of an episode, 0 otherwise).

    Args:
        horizon: passed to ``ChainBanditMDP`` as ``num_states``; also
            fixes the number of steps per episode (``horizon + 2``).
        neps: number of episodes to simulate.

    Returns:
        A ``(observations, policy, dataset)`` tuple: the ``np.ndarray`` of
        recorded observations, the behaviour policy that generated them,
        and an ``MDPDataset`` built from the four recorded arrays.
    """
    env = ChainBanditMDP(num_states = horizon)
    behaviour = ChainBanditPolicy(env)

    # NOTE(review): episodes are horizon + 2 steps long here — presumably
    # the chain has extra entry/exit steps; confirm against ChainBanditMDP.
    steps_per_episode = horizon + 2

    obs_log, act_log, rew_log, done_log = [], [], [], []
    for _episode in range(neps):
        for _step in range(steps_per_episode):
            # Snapshot the state before acting so later environment
            # updates cannot alter what was logged.
            state_now = copy.deepcopy(env.cur_state)
            obs_log.append(copy.deepcopy(state_now.num_list))

            chosen = behaviour._get_action(state = state_now)
            act_log.append(copy.deepcopy(chosen))

            step_reward, _ = env.execute_agent_action(chosen)
            rew_log.append(copy.deepcopy(step_reward))

            done_log.append(0)
        env.reset()
        # The most recently logged step closed this episode.
        done_log[-1] = 1

    # Convert the Python lists to arrays for MDPDataset.
    observations = np.array(obs_log)
    dataset = MDPDataset(
        observations=observations,
        actions=np.array(act_log),
        rewards=np.array(rew_log),
        terminals=np.array(done_log),
    )

    return observations, behaviour, dataset

In [3]:
def evaluate_learner(option, observations, policy, dataset, horizon, neps = 5000):
    """Fit one offline learner on ``dataset`` and estimate its test reward.

    The learner selected by ``option`` is constructed, fitted on
    ``dataset`` and then rolled out on a fresh
    ``ChainBanditMDP(num_states=horizon)`` for ``neps`` episodes of
    ``horizon`` steps each.

    Args:
        option: learner to evaluate; one of ``"VI"``, ``"PVI"``,
            ``"SPVI"`` or ``"PSL"``.
        observations: passed to the learner as ``states``.
        policy: behaviour policy; its ``actions`` attribute is passed to
            every learner, and the policy itself to ``SPVI`` as ``bpolicy``.
        dataset: offline dataset handed to the learner's ``fit``.
        horizon: passed to the learner as ``epLen`` and to the evaluation
            MDP as ``num_states``; also the number of steps per episode.
        neps: number of evaluation episodes.

    Returns:
        Sum of all collected rewards divided by ``neps``, i.e. the average
        reward per evaluation episode.

    Raises:
        ValueError: if ``option`` does not name a known learner.
    """
    # Reward bounds handed to the pessimistic learners (PVI / SPVI).
    max_step_reward = 1
    abs_max_ep_reward = 1
    min_step_reward = 0

    # Exactly one branch builds the learner; an unrecognised option is
    # rejected explicitly rather than surfacing later as an unbound `vi`.
    if option == "VI":
        vi = VI(name = "vi", states = observations, actions = policy.actions, epLen = horizon)
    elif option == "PVI":
        vi = PVI(name = "pvi", states = observations, actions = policy.actions, epLen = horizon,
                 max_step_reward = max_step_reward, min_step_reward = min_step_reward, abs_max_ep_reward = abs_max_ep_reward)
    elif option == "SPVI":
        vi = SPVI(name = "spvi", states = observations, actions = policy.actions, epLen = horizon, bpolicy = policy,
                  max_step_reward = max_step_reward, min_step_reward = min_step_reward, abs_max_ep_reward = abs_max_ep_reward)
    elif option == "PSL":
        vi = PesBandit(name = "psl", states = observations, actions = policy.actions, epLen = horizon)
    else:
        raise ValueError(
            "Unknown learner option " + repr(option)
            + "; expected one of 'VI', 'PVI', 'SPVI', 'PSL'"
        )

    vi.fit(dataset)

    # NOTE(review): evaluation episodes run for `horizon` steps whereas
    # get_dataset collects `horizon + 2` steps per episode — confirm this
    # difference is intended.
    mdp = ChainBanditMDP(num_states = horizon)
    virewards = []
    for _episode in range(neps):
        for timestep in range(horizon):
            # Give the learner its own copy of the state features so it
            # cannot alias the environment's internal state.
            cur_state = copy.deepcopy(mdp.cur_state)
            cur_action = vi.act(copy.deepcopy(cur_state.num_list), timestep)

            reward, _next_state = mdp.execute_agent_action(cur_action)
            virewards.append(copy.deepcopy(reward))
        mdp.reset()
    return np.sum(np.array(virewards))/neps

In [None]:
# Experiment configuration.
option_list = ["PSL","PVI","SPVI"]
n_runs = 5      # independent datasets drawn per training-set size
horizon = 3
# Training-set sizes (episodes): a few small values, then 100..5000 in
# steps of 100.
neps_list = [25, 50, 75] + list(range(100, 5001, 100))

# rew_dict[option][neps] -> list with one test reward per run.
rew_dict = {option: {} for option in option_list}

for neps in neps_list:
    print(neps)  # progress indicator
    for option in option_list:
        rew_dict[option][neps] = []
    for run in range(n_runs):
        # One dataset per run, shared by every learner so they are
        # compared on identical data.
        observations, policy, dataset = get_dataset(horizon = horizon, neps = neps)
        for option in option_list:
            # Each learner receives its own copy of the observations.
            rew_dict[option][neps].append(
                evaluate_learner(option, copy.deepcopy(observations), policy, dataset, horizon)
            )

25


In [None]:
# Uncomment the next line to inspect the raw per-run rewards.
# rew_dict

In [None]:
# Aggregate the per-run rewards into a mean and a standard error of the
# mean for every (learner, training-set size) pair. Both dicts map an
# option name to a list aligned with neps_list.
rew = {option: [] for option in option_list}
err = {option: [] for option in option_list}
for neps in neps_list:
    for option in option_list:
        run_rewards = np.asarray(rew_dict[option][neps], dtype=float)
        rew[option].append(np.mean(run_rewards))
        # Standard error of the mean: use the sample standard deviation
        # (ddof=1), since the population form (ddof=0) underestimates the
        # spread for a handful of runs. A single run has no spread, so it
        # falls back to ddof=0 and yields 0 rather than NaN. Divide by the
        # number of runs actually recorded, not the global n_runs, which
        # could be stale if cells were re-run out of order.
        ddof = 1 if run_rewards.size > 1 else 0
        err[option].append(np.std(run_rewards, ddof=ddof)/np.sqrt(run_rewards.size))

In [None]:
import matplotlib.pyplot as plt
# if using a Jupyter notebook, include:
%matplotlib inline

fig, ax = plt.subplots()
for option in option_list:
    x = neps_list
    y = rew[option]
    yerr = err[option]
    ax.errorbar(x, y,
                yerr=yerr,
                fmt='-o', label = option)


ax.set_xlabel('Number of training episodes')
ax.set_ylabel('Test reward')
ax.set_title('Chain Bandit: horizon = '+str(horizon))
plt.legend()

plt.savefig('chainbandit-h=3.png')
plt.show()