In [1]:
import itertools
from typing import Iterable, Iterator, Mapping, Set, Tuple, Callable
from rl.markov_process import NonTerminal
from rl.approximate_dynamic_programming import S, A

Question 2a. (Tabular SARSA with GLIE)

In [33]:
from operator import itemgetter
from rl.distribution import Categorical
from rl.markov_decision_process import MarkovDecisionProcess
from rl.approximate_dynamic_programming import NTStateDistribution
from collections import defaultdict
#Epsilon greedy for tabular case
def epsilon_greedy_action_Tabular(
    q: Mapping[Tuple[S, A],float],
    nt_state: NonTerminal[S],
    actions: Set[A],
    epsilon: float
) -> A:
    """Sample one action for ``nt_state`` from the ε-greedy policy implied by ``q``.

    Every action gets probability ``epsilon / len(actions)``; the greedy
    action (largest ``q[(nt_state, a)]``, first one met while iterating
    ``actions`` on ties) gets an additional ``1 - epsilon``.

    Args:
        q: Tabular action-value function keyed by ``(state, action)``. It is
            indexed for every action in ``actions``, so it must contain all
            of those pairs or supply a default (the callers in this notebook
            pass a ``defaultdict``).
        nt_state: Non-terminal state for which an action is chosen.
        actions: Actions available in ``nt_state``; must not be empty.
        epsilon: Exploration probability (presumably in [0, 1]; it is not
            validated here).

    Returns:
        The action drawn by ``Categorical(...).sample()``.

    Raises:
        ValueError: If ``actions`` is empty.
    """
    if not actions:
        # Fail with a clear message instead of the opaque error max() raises.
        raise ValueError(
            "epsilon_greedy_action_Tabular needs at least one action"
        )

    # max() keeps the first maximal element, so ties are resolved by the
    # iteration order of `actions`.
    greedy_action: A = max(actions, key=lambda a: q[(nt_state, a)])
    explore_prob: float = epsilon / len(actions)
    return Categorical(
        {a: explore_prob + (1 - epsilon if a == greedy_action else 0.)
         for a in actions}
    ).sample()

#implementation of tabular sarsa
def glie_sarsa_Tabular(
        mdp: MarkovDecisionProcess[S,A],
        ini_dist: NTStateDistribution[S],
        γ: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        max_episode_length: int,
        alph = 0.03,
        beta = 0.5,
        H = 1000
) -> Iterator[Mapping[Tuple[S,A], float]]:
    """Tabular SARSA control with an episode-dependent ε-greedy policy.

    Generator that never finishes on its own: it yields the Q-table once
    before any update and then once after every single update. The very same
    dict object is yielded each time and is mutated in place.

    Args:
        mdp: Process to learn on; ``mdp.actions`` and ``mdp.step`` are used.
        ini_dist: Distribution each episode's start state is sampled from.
        γ: Discount factor applied to the bootstrapped next Q-value.
        epsilon_as_func_of_episodes: Maps the 1-based episode number to the
            ε used for the whole episode.
        max_episode_length: Maximum number of transitions per episode.
        alph, beta, H: Step-size schedule parameters; update number ``n``
            (counted across all episodes) uses
            ``alph / (1 + ((n - 1) / H) ** beta)``.

    Yields:
        The Q-table, keyed by ``(state, action)`` with default value 0.
    """
    def step_size(n: int) -> float:
        # Decaying learning rate as a function of the global update count.
        return alph/(1+((n-1)/H)**beta)

    q_table: Mapping[Tuple[S,A], float] = defaultdict(lambda: 0.)
    yield q_table

    n_updates: int = 0
    episode_no: int = 0
    while True:
        # --- start a new episode ---
        episode_no += 1
        epsilon: float = epsilon_as_func_of_episodes(episode_no)
        state: NonTerminal[S] = ini_dist.sample()
        action: A = epsilon_greedy_action_Tabular(
            q = q_table,
            nt_state = state,
            actions = set(mdp.actions(state)),
            epsilon = epsilon
        )

        for _ in range(max_episode_length):
            n_updates += 1
            next_state, reward = mdp.step(state, action).sample()
            reached_terminal = not isinstance(next_state, NonTerminal)

            if reached_terminal:
                # Nothing to bootstrap from after the episode has ended.
                td_target = reward
            else:
                # On-policy: the action that will actually be taken next.
                next_action: A = epsilon_greedy_action_Tabular(
                    q = q_table,
                    nt_state = next_state,
                    actions = set(mdp.actions(next_state)),
                    epsilon = epsilon
                )
                td_target = reward + γ * q_table[(next_state, next_action)]

            lr: float = step_size(n_updates)
            old_value = q_table[(state, action)]
            q_table[(state, action)] = old_value + lr * (td_target - old_value)

            yield q_table

            if reached_terminal:
                break
            state, action = next_state, next_action

Question 3.a (Tabular Q-learning)

In [35]:
def q_learning_Tabular(
        mdp: MarkovDecisionProcess[S,A],
        #policy_from_q is ε-greedy
        ini_dist: NTStateDistribution[S],
        γ: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        max_episode_length: int,
        alph = 0.03,
        beta = 0.5,
        H = 1000
) -> Iterator[Mapping[Tuple[S,A],float]]:
    """Tabular Q-learning with an episode-dependent ε-greedy behaviour policy.

    Generator that never finishes on its own: it yields the Q-table once
    before any update and then once after every single update. The very same
    dict object is yielded each time and is mutated in place.

    Args:
        mdp: Process to learn on; ``mdp.actions`` and ``mdp.step`` are used.
        ini_dist: Distribution each episode's start state is sampled from.
        γ: Discount factor applied to the bootstrapped next-state value.
        epsilon_as_func_of_episodes: Maps the 1-based episode number to the
            ε used for the whole episode.
        max_episode_length: Maximum number of transitions per episode
            (passed to ``range``, so it must be an int).
        alph, beta, H: Step-size schedule parameters; update number ``n``
            (counted across all episodes) uses
            ``alph / (1 + ((n - 1) / H) ** beta)``.

    Yields:
        The Q-table, keyed by ``(state, action)`` with default value 0.
    """
    count_to_weight_func: Callable[[int], float] = lambda n: alph/(1+((n-1)/H)**beta)
    q: Mapping[Tuple[S,A], float] = defaultdict(lambda: 0.)

    yield q

    total_counts: int = 0
    epi_counts: int = 0
    while True: #loop for one episode
        epi_counts += 1
        epsilon: float = epsilon_as_func_of_episodes(epi_counts)
        state: NonTerminal[S] = ini_dist.sample()
        for _ in range(max_episode_length):
            total_counts += 1
            # Behaviour policy: ε-greedy with respect to the current table.
            action: A = epsilon_greedy_action_Tabular(
                q = q,
                nt_state = state,
                actions = set(mdp.actions(state)),
                epsilon = epsilon
            )
            next_state, reward = mdp.step(state, action).sample()

            episode_over = not isinstance(next_state, NonTerminal)
            if episode_over:
                # A terminal state has no actions to maximise over, so the
                # target for the transition that ended the episode is the
                # reward alone. Deciding this here, before touching
                # mdp.actions(next_state), keeps the update on the
                # (state, action) pair that was actually taken.
                target = reward
            else:
                # Off-policy target: best Q-value available in next_state.
                target = reward + γ * max(
                    q[(next_state, a)] for a in mdp.actions(next_state)
                )

            #Q-update
            weight: float = count_to_weight_func(total_counts)
            q[(state, action)] += weight * (target - q[(state, action)])

            yield q

            if episode_over:
                break
            state = next_state

Test against DP on `SimpleInventoryMDPCap` from `rl/chapter3/simple_inventory_mdp_cap.py`.

In [30]:
#construct test process
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
from pprint import pprint
from rl.markov_decision_process import FiniteMarkovDecisionProcess

# Discount factor shared by the DP and TD cells of this notebook.
user_gamma = 0.9

# Parameters handed to SimpleInventoryMDPCap below.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# The MDP on which the tabular SARSA / Q-learning results are compared with DP.
si_mdp: FiniteMarkovDecisionProcess[S, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

In [None]:
#DP from BOOK
from rl.dynamic_programming import policy_iteration_result
from rl.distribution import Choose
import itertools
import rl.iterate as iterate
from rl.policy import FiniteDeterministicPolicy

# Reference solution: policy iteration on the same MDP and discount factor.
# The result is unpacked as (value function, policy), per the variable names.
opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)

# Printed so the TD results in the cells below can be compared by eye.
print("DP from the book:\n")
pprint(opt_vf_pi)
print(opt_policy_pi)
print()

In [34]:
#self-made Tabular SARSA
max_episode_length: int = 100
# ε decays as 1/sqrt(k), where k is the episode number passed in by the learner.
epsilon_as_func_of_episodes: Callable[[int], float] = lambda k: k ** -0.5
qvfs_Tabular: Iterator[Mapping[Tuple[S, int], float]] = glie_sarsa_Tabular(
    mdp = si_mdp,
    ini_dist = Choose(si_mdp.non_terminal_states),
    γ = user_gamma,
    epsilon_as_func_of_episodes = epsilon_as_func_of_episodes,
    max_episode_length = max_episode_length
)

num_episodes: int = 10000
num_updates: int = num_episodes * max_episode_length
# The generator's first yield is the initial table, produced before any
# update, so num_updates + 1 items must be consumed to apply num_updates updates.
final_qvfs_Tabular: Mapping[Tuple[S,int], float] = iterate.last(
    itertools.islice(qvfs_Tabular, num_updates + 1)
)

# Greedy value per state: V(s) = max_a Q(s, a). Keys are the NonTerminal
# wrappers yielded by si_mdp.non_terminal_states.
opt_vf_sarsa_Tabular: Mapping[NonTerminal[S], float] = {
    s: max(final_qvfs_Tabular[(s, a)] for a in si_mdp.actions(s))
    for s in si_mdp.non_terminal_states
}
# Greedy policy: argmax_a Q(s, a), keyed by the unwrapped state.
opt_policy_sarsa_Tabular: FiniteDeterministicPolicy = \
    FiniteDeterministicPolicy({
        s.state: max(
            ((a, final_qvfs_Tabular[(s, a)]) for a in si_mdp.actions(s)),
            key = itemgetter(1)
        )[0]
        for s in si_mdp.non_terminal_states
    })

print("Self-made GLIE SARSA in Tabular version")
pprint(opt_vf_sarsa_Tabular)
pprint(opt_policy_sarsa_Tabular)
print()

Self-made GLIE SARSA in Tabular version
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.02253108881253,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.131535792838644,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.746767672058823,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.75600029052218,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.197936416749073,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.172902700207782}
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0




In [36]:
#self-made Tabular Q-learning
max_episode_length: int = 100
# ε decays as 1/sqrt(k), where k is the episode number passed in by the learner.
epsilon_as_func_of_episodes: Callable[[int], float] = lambda k: k ** -0.5
qvfs_qlearning_Tabular: Iterator[Mapping[Tuple[S, int], float]] = q_learning_Tabular(
    mdp = si_mdp,
    ini_dist = Choose(si_mdp.non_terminal_states),
    γ = user_gamma,
    epsilon_as_func_of_episodes = epsilon_as_func_of_episodes,
    max_episode_length = max_episode_length
)

num_episodes: int = 10000
num_updates: int = num_episodes * max_episode_length
# The generator's first yield is the initial table, produced before any
# update, so num_updates + 1 items must be consumed to apply num_updates updates.
final_qvfs_qlearning_Tabular: Mapping[Tuple[S,int], float] = iterate.last(
    itertools.islice(qvfs_qlearning_Tabular, num_updates + 1)
)

# Greedy value per state: V(s) = max_a Q(s, a). Keys are the NonTerminal
# wrappers yielded by si_mdp.non_terminal_states.
opt_vf_qlearning_Tabular: Mapping[NonTerminal[S], float] = {
    s: max(final_qvfs_qlearning_Tabular[(s, a)] for a in si_mdp.actions(s))
    for s in si_mdp.non_terminal_states
}
# Greedy policy: argmax_a Q(s, a), keyed by the unwrapped state.
opt_policy_qlearning_Tabular: FiniteDeterministicPolicy = \
    FiniteDeterministicPolicy({
        s.state: max(
            ((a, final_qvfs_qlearning_Tabular[(s, a)]) for a in si_mdp.actions(s)),
            key = itemgetter(1)
        )[0]
        for s in si_mdp.non_terminal_states
    })

print("Self-made Q-learning in Tabular version")
pprint(opt_vf_qlearning_Tabular)
pprint(opt_policy_qlearning_Tabular)
print()

Self-made Q-learning in Tabular version
{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.86137720551516,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.983249350738138,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.639134957723886,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.65995403367464,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.016427795889733,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.96233099982073}
For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0


