In [1]:
from MDPDataset.old_dataset import *
from MDP.old_MDP import MDP
from MDP.ChainBandit import ChainBanditMDP
from MDP.ChainBanditState import ChainBanditState
from BPolicy.ChainBanditPolicy import ChainBanditPolicy
import copy

In [2]:
horizon = 3
mdp = ChainBanditMDP(num_states = horizon)
policy = ChainBanditPolicy(mdp)

In [3]:
neps = 50
observations = []
actions = []
rewards = []
terminals = []
for eps in range(neps):
    for timestep in range(horizon+1):
        # Get state.
        # Add state to list.
        cur_state = copy.deepcopy(mdp.cur_state)
        observations.append(copy.deepcopy(cur_state.num_list))

        # Get action
        # Add action to list
        cur_action = policy._get_action(state = cur_state)
        actions.append(copy.deepcopy(cur_action))

        # Execute action
        reward, next_state = mdp.execute_agent_action(cur_action)
        # Add reward
        rewards.append(copy.deepcopy(reward))
        
        terminals.append(0)
    mdp.reset()
    terminals[-1] = 1

In [4]:
import numpy as np

observations = np.array(observations)
actions = np.array(actions)
rewards = np.array(rewards)
terminals = np.array(terminals)

dataset = MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
)

In [5]:
# first episode
episode = dataset.episodes[20]

# access to episode data
print(episode.observations)
print(episode.actions)
print(episode.rewards)

[[ 1  0]
 [ 2  0]
 [ 3  1]
 [-1 -1]]
[[ 1]
 [-1]
 [-1]
 [ 0]]
[0.  0.5 0.  0. ]


In [6]:
# first transition
transition = episode.transitions[0]

# access to tuple
print(transition.observation)
print(transition.action)
print(transition.reward)
print(transition.next_observation)
print(transition.terminal)

[1 0]
[1]
0.0
[2 0]
0.0


In [8]:
episode.transitions[2].next_observation

array([-1, -1])

In [9]:
from OfflineLearners.offlineLearners import VI, PVI, SPVI

In [10]:
vi = VI(name = "vi", states = observations, actions = policy.actions, epLen = horizon)
vi.fit(dataset)

In [11]:
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
viobservations = []
viactions = []
virewards = []
viterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Get state.
        # Add state to list.
        cur_state = copy.deepcopy(mdp.cur_state)
        viobservations.append(copy.deepcopy(cur_state.num_list))

        # Get action
        # Add action to list
        cur_action = vi.act(copy.deepcopy(cur_state.num_list), timestep)
        viactions.append(copy.deepcopy(cur_action))

        # Execute action
        reward, next_state = mdp.execute_agent_action(cur_action)
        # Add reward
        virewards.append(copy.deepcopy(reward))
        
        viterminals.append(0)
    mdp.reset()
    viterminals[-1] = 1
print("vi rewards: ",np.sum(np.array(rewards))/neps)

vi rewards:  0.00175


In [12]:
pvi = PVI(name = "pvi", states = observations, actions = policy.actions, epLen = horizon)
pvi.fit(dataset)

In [13]:
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
pviobservations = []
pviactions = []
pvirewards = []
pviterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Get state.
        # Add state to list.
        cur_state = copy.deepcopy(mdp.cur_state)
        pviobservations.append(copy.deepcopy(cur_state.num_list))

        # Get action
        # Add action to list
        cur_action = vi.act(copy.deepcopy(cur_state.num_list), timestep)
        pviactions.append(copy.deepcopy(cur_action))

        # Execute action
        reward, next_state = mdp.execute_agent_action(cur_action)
        # Add reward
        pvirewards.append(copy.deepcopy(reward))
        
        pviterminals.append(0)
    mdp.reset()
    pviterminals[-1] = 1
print("pvi rewards: ",np.sum(np.array(pvirewards))/neps)

pvi rewards:  0.293


In [14]:
spvi = SPVI(name = "pvi", states = observations, actions = policy.actions, epLen = horizon, bpolicy = policy)
spvi.fit(dataset)

({(0, 2): array([0., 0., 0.], dtype=float32), (1, 2): array([0., 0., 0.], dtype=float32), (2, 2): array([0., 0., 0.], dtype=float32), (3, 2): array([0., 0., 0.], dtype=float32), (4, 2): array([0., 0., 0.], dtype=float32), (5, 2): array([0., 0., 0.], dtype=float32), (6, 2): array([0., 0., 0.], dtype=float32), (0, 1): array([0., 0., 0.], dtype=float32), (1, 1): array([0., 0., 0.], dtype=float32), (2, 1): array([0., 0., 0.], dtype=float32), (3, 1): array([0., 0., 0.], dtype=float32), (4, 1): array([0., 0., 0.], dtype=float32), (5, 1): array([0., 0., 0.], dtype=float32), (6, 1): array([0., 0., 0.], dtype=float32), (0, 0): array([0., 0., 0.], dtype=float32), (1, 0): array([0., 0., 0.], dtype=float32), (2, 0): array([0., 0., 0.], dtype=float32), (3, 0): array([0., 0., 0.], dtype=float32), (4, 0): array([0., 0., 0.], dtype=float32), (5, 0): array([0., 0., 0.], dtype=float32), (6, 0): array([0., 0., 0.], dtype=float32)}, {3: array([0., 0., 0., 0., 0., 0., 0.], dtype=float32), 2: array([0., 0.,

In [16]:
mdp = ChainBanditMDP(num_states = horizon)
neps = 10000
spviobservations = []
spviactions = []
spvirewards = []
spviterminals = []
for eps in range(neps):
    for timestep in range(horizon):
        # Get state.
        # Add state to list.
        cur_state = copy.deepcopy(mdp.cur_state)
        spviobservations.append(copy.deepcopy(cur_state.num_list))

        # Get action
        # Add action to list
        cur_action = vi.act(copy.deepcopy(cur_state.num_list), timestep)
        spviactions.append(copy.deepcopy(cur_action))

        # Execute action
        reward, next_state = mdp.execute_agent_action(cur_action)
        # Add reward
        spvirewards.append(copy.deepcopy(reward))
        
        spviterminals.append(0)
    mdp.reset()
    spviterminals[-1] = 1
print("spvi rewards: ",np.sum(np.array(spvirewards))/neps)

spvi rewards:  0.29025


In [None]:
observations