In [1]:
# imports
from MDPDataset.old_dataset import *
from MDP.old_MDP import MDP
from MDP.ChainBandit import ChainBanditMDP
from MDP.ChainBanditState import ChainBanditState
from BPolicy.ChainBanditPolicy import ChainBanditPolicy
from OfflineLearners.offlineLearners import VI, PVI, SPVI

import copy
import numpy as np

In [3]:
def get_dataset(horizon = 3, neps = 50):
    """Roll out the chain-bandit behaviour policy and package the transitions.

    A ``ChainBanditMDP`` with ``num_states = horizon`` is created together with
    a ``ChainBanditPolicy`` built on it. The policy is then run for ``neps``
    episodes of ``horizon + 1`` steps each; after every episode the MDP is
    reset and the last recorded step is flagged as terminal.

    Args:
        horizon: Passed to the MDP as ``num_states``; each episode records
            ``horizon + 1`` steps.
        neps: Number of episodes to record.

    Returns:
        A tuple ``(observations, policy, dataset)`` where ``observations`` is
        the NumPy array of recorded ``num_list`` values, ``policy`` is the
        behaviour policy used, and ``dataset`` is the ``MDPDataset`` built
        from the recorded observations, actions, rewards and terminal flags.
    """
    mdp = ChainBanditMDP(num_states = horizon)
    policy = ChainBanditPolicy(mdp)

    steps_per_episode = horizon + 1
    obs_log, act_log, rew_log, done_log = [], [], [], []

    for _episode in range(neps):
        for _step in range(steps_per_episode):
            # Snapshot the state so later MDP updates cannot alter the record.
            state = copy.deepcopy(mdp.cur_state)
            obs_log.append(copy.deepcopy(state.num_list))

            # Query the behaviour policy on the snapshot and log its choice.
            action = policy._get_action(state = state)
            act_log.append(copy.deepcopy(action))

            # Step the MDP; only the reward is recorded.
            reward, _next_state = mdp.execute_agent_action(action)
            rew_log.append(copy.deepcopy(reward))

            done_log.append(0)

        # Episode boundary: reset the MDP and mark the final step as terminal.
        mdp.reset()
        done_log[-1] = 1

    # Convert the logs to arrays and wrap them in the MDPDataset container.
    observations = np.array(obs_log)
    dataset = MDPDataset(
        observations=observations,
        actions=np.array(act_log),
        rewards=np.array(rew_log),
        terminals=np.array(done_log),
    )

    return observations, policy, dataset

In [9]:
def evaluate_learner(option, observations, policy, dataset, horizon, neps = 5000):
    """Fit an offline learner on ``dataset`` and estimate its average return.

    The learner selected by ``option`` is constructed, fit on ``dataset`` and
    then rolled out on a fresh ``ChainBanditMDP`` for ``neps`` episodes of
    ``horizon`` steps each.

    Args:
        option: Learner to build: ``"vi"``, ``"pvi"`` or ``"spvi"``.
        observations: Passed to the learner constructor as ``states``.
        policy: Behaviour policy; ``policy.actions`` is passed to the learner
            as ``actions`` and, for ``"spvi"``, the policy itself is passed as
            ``bpolicy``.
        dataset: Data the learner is fit on before it is evaluated.
        horizon: Used as the learner's ``epLen``, as ``num_states`` of the
            evaluation MDP, and as the number of steps per episode.
        neps: Number of evaluation episodes.

    Returns:
        The sum of all rewards collected during evaluation divided by ``neps``.

    Raises:
        ValueError: If ``option`` is not one of the supported learner names.
    """
    if option == "vi":
        learner = VI(name = "vi", states = observations, actions = policy.actions, epLen = horizon)
    elif option == "pvi":
        learner = PVI(name = "pvi", states = observations, actions = policy.actions, epLen = horizon)
    elif option == "spvi":
        # NOTE(review): the name string is "pvi" here, unchanged from the
        # original code -- confirm whether "spvi" was intended.
        learner = SPVI(name = "pvi", states = observations, actions = policy.actions, epLen = horizon, bpolicy = policy)
    else:
        # Fail early and clearly instead of hitting an unbound variable later.
        raise ValueError(
            "unknown learner option {!r}; expected 'vi', 'pvi' or 'spvi'".format(option)
        )

    # The learner must be fit on the offline data before it is asked to act;
    # previously `dataset` was accepted but never used.
    learner.fit(dataset)

    mdp = ChainBanditMDP(num_states = horizon)
    rewards = []
    for _episode in range(neps):
        for timestep in range(horizon):
            # Hand the learner a private copy of the current observation.
            obs = copy.deepcopy(mdp.cur_state.num_list)
            action = learner.act(obs, timestep)

            # Step the environment; only the reward is needed for the estimate.
            reward, _next_state = mdp.execute_agent_action(action)
            rewards.append(copy.deepcopy(reward))
        mdp.reset()

    return np.sum(np.array(rewards))/neps

In [None]:
# NOTE(review): no visible cell defines `horizon`, `observations`, `policy` or
# `dataset`, so on a fresh kernel this cell failed with NameError. They are
# created here using get_dataset's default horizon (3) and default episode
# count -- adjust if a different configuration was intended.
horizon = 3
observations, policy, dataset = get_dataset(horizon = horizon)

# Build the VI learner over the recorded observations and the behaviour
# policy's actions, then fit it on the generated dataset.
vi = VI(name = "vi", states = observations, actions = policy.actions, epLen = horizon)
vi.fit(dataset)

In [None]:
# Roll out the fitted `vi` learner on a fresh chain-bandit MDP for `neps`
# episodes of `horizon` steps and print the average reward per episode.
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
viobservations = []
viactions = []
virewards = []
viterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Snapshot the current state and record its observation.
        cur_state = copy.deepcopy(mdp.cur_state)
        viobservations.append(copy.deepcopy(cur_state.num_list))

        # Ask the learner for an action at this timestep and record it.
        cur_action = vi.act(copy.deepcopy(cur_state.num_list), timestep)
        viactions.append(copy.deepcopy(cur_action))

        # Execute the action and record the reward.
        reward, next_state = mdp.execute_agent_action(cur_action)
        virewards.append(copy.deepcopy(reward))

        viterminals.append(0)
    # Episode finished: reset the MDP and flag the last recorded step as terminal.
    mdp.reset()
    viterminals[-1] = 1
# Average the rewards collected in this rollout (`virewards`); the previous
# code summed an unrelated `rewards` name that this cell never fills.
print("vi rewards: ",np.sum(np.array(virewards))/neps)

In [None]:
# Build the PVI learner over the recorded observations and the behaviour
# policy's actions, then fit it on `dataset`.
pvi = PVI(
    name = "pvi",
    states = observations,
    actions = policy.actions,
    epLen = horizon,
)
pvi.fit(dataset)

In [None]:
# Roll out the fitted `pvi` learner on a fresh chain-bandit MDP for `neps`
# episodes of `horizon` steps and print the average reward per episode.
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
pviobservations = []
pviactions = []
pvirewards = []
pviterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Snapshot the current state and record its observation.
        cur_state = copy.deepcopy(mdp.cur_state)
        pviobservations.append(copy.deepcopy(cur_state.num_list))

        # Query the PVI learner (not `vi`) so the printed figure really
        # reflects PVI's behaviour, then record the action.
        cur_action = pvi.act(copy.deepcopy(cur_state.num_list), timestep)
        pviactions.append(copy.deepcopy(cur_action))

        # Execute the action and record the reward.
        reward, next_state = mdp.execute_agent_action(cur_action)
        pvirewards.append(copy.deepcopy(reward))

        pviterminals.append(0)
    # Episode finished: reset the MDP and flag the last recorded step as terminal.
    mdp.reset()
    pviterminals[-1] = 1
print("pvi rewards: ",np.sum(np.array(pvirewards))/neps)

In [None]:
# Build the SPVI learner; unlike the other learners it also receives the
# behaviour policy via `bpolicy`. Then fit it on `dataset`.
# NOTE(review): the name string is "pvi", not "spvi" -- confirm this is intended.
spvi = SPVI(
    name = "pvi",
    states = observations,
    actions = policy.actions,
    epLen = horizon,
    bpolicy = policy,
)
spvi.fit(dataset)

In [None]:
# Roll out the fitted `spvi` learner on a fresh chain-bandit MDP for `neps`
# episodes of `horizon` steps and print the average reward per episode.
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
spviobservations = []
spviactions = []
spvirewards = []
spviterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Snapshot the current state and record its observation.
        cur_state = copy.deepcopy(mdp.cur_state)
        spviobservations.append(copy.deepcopy(cur_state.num_list))

        # Query the SPVI learner (not `vi`) so the printed figure really
        # reflects SPVI's behaviour, then record the action.
        cur_action = spvi.act(copy.deepcopy(cur_state.num_list), timestep)
        spviactions.append(copy.deepcopy(cur_action))

        # Execute the action and record the reward.
        reward, next_state = mdp.execute_agent_action(cur_action)
        spvirewards.append(copy.deepcopy(reward))

        spviterminals.append(0)
    # Episode finished: reset the MDP and flag the last recorded step as terminal.
    mdp.reset()
    spviterminals[-1] = 1
# Label corrected from "pvi rewards: " so the output is not confused with the PVI cell.
print("spvi rewards: ",np.sum(np.array(spvirewards))/neps)

In [None]:
# Show the current value of `observations` as this cell's output.
# NOTE(review): this prints the whole object; consider a slice such as
# `observations[:5]` if it is large.
observations