In [1]:
import os
import sys

# Make the RL-book checkout (which provides the `rl` package) importable.
# The location can be overridden with the RL_BOOK_PATH environment variable;
# it falls back to the original hard-coded path so existing setups keep working.
_rl_book_path = os.environ.get('RL_BOOK_PATH',
                               '/Users/mac/Downloads/RL-book-master')
# Guard against piling up duplicate entries when this cell is re-run in the
# same kernel.
if _rl_book_path not in sys.path:
    sys.path.append(_rl_book_path)

In [2]:
from collections import defaultdict
from typing import (Iterable, Iterator, List, Mapping, Optional, Tuple,
                    TypeVar)

import numpy as np

import rl.markov_decision_process as markov_decision_process
import rl.markov_process as mp
from rl.distribution import (Bernoulli, Constant, Categorical, Choose,
                             Distribution, FiniteDistribution)
from rl.function_approx import FunctionApprox
from rl.markov_decision_process import (MarkovDecisionProcess, Policy)
from rl.returns import returns
# Generic type variables used in the annotations below:
# S stands for the state type, A for the action type.
S = TypeVar('S')
A = TypeVar('A')

In [3]:
def my_policy_from_q(
        q: Mapping[Tuple[S,A],float],
        mdp: MarkovDecisionProcess[S, A],
        ϵ: float = 0.0
) -> Policy[S, A]:
    """Build an ϵ-greedy policy on top of a tabular Q-value mapping.

    Args:
        q: Mapping from (state, action) pairs to their estimated value. It is
            read with ``q[(s, a)]``, so it must either contain every pair the
            policy may query or supply its own default (e.g. a defaultdict).
            The mapping is captured by reference: later in-place changes to
            it are seen by the returned policy.
        mdp: Process used to test for terminal states and to enumerate the
            actions available in a state.
        ϵ: Parameter handed to ``Bernoulli`` to decide, on every call to
            ``act``, whether to explore (presumably the probability of
            exploring; confirm against rl.distribution).

    Returns:
        A policy whose ``act(s)`` returns ``None`` for terminal states, a
        ``Choose`` over the set of available actions when exploring, and
        otherwise a ``Constant`` holding the first action with the highest
        Q-value.
    """
    explore = Bernoulli(ϵ)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            if explore.sample():
                return Choose(set(mdp.actions(s)))

            # Manual arg-max; the strict '>' keeps the first action among
            # ties. float("-inf") is used instead of np.Inf, an alias that
            # NumPy 2.0 removed.
            best_action = None
            best_value = float("-inf")
            for a in mdp.actions(s):
                value = q[(s, a)]
                if value > best_value:
                    best_value = value
                    best_action = a
            return Constant(best_action)

    return QPolicy()

In [4]:
def tabular_mc_control_glie(
        mdp: MarkovDecisionProcess[S, A],
        states: Distribution[S],
        γ: float,
        episode_num : int = 10000,
        tolerance: float = 1e-6
) -> List[Mapping[Tuple[S, A], float]]:
    """Tabular Monte Carlo control with a decaying ϵ-greedy policy.

    Runs ``episode_num`` simulated episodes. After each episode, every
    (state, action) visit in it moves that pair's Q-value towards the observed
    return using a running average (step size 1 / visit count).

    The first episode uses ``my_policy_from_q`` with its default ϵ; after the
    episode with 0-based index ``i`` the policy is rebuilt with
    ϵ = 1 / (1 + i).

    Args:
        mdp: The decision process to simulate.
        states: Distribution passed to ``mdp.simulate_actions`` for the start
            of each episode.
        γ: Discount factor forwarded to ``returns``.
        episode_num: Number of episodes to run.
        tolerance: Forwarded to ``returns``.

    Returns:
        A list with one entry per episode: an independent snapshot of the
        Q-table taken right after that episode's updates.
    """
    history = []
    counts = defaultdict(int)
    q = defaultdict(float)
    # The policy keeps a reference to `q`, so `q` must only ever be mutated in
    # place (never rebound) for the policy to see the latest estimates.
    policy = my_policy_from_q(q, mdp)
    for i in range(episode_num):
        trace = mdp.simulate_actions(states, policy)
        for step in returns(trace, γ, tolerance):
            key = (step.state, step.action)
            counts[key] += 1
            weight = 1. / counts[key]
            # Running mean of the returns observed for this pair.
            q[key] = (1 - weight) * q.get(key, 0) + weight * step.return_
        # Store a copy: appending `q` itself would make every history entry
        # alias the same dict and end up equal to the final table.
        history.append(q.copy())
        policy = my_policy_from_q(q, mdp, 1. / (1 + i))
    return history

In [5]:
def mc_control_glie(
        mdp: MarkovDecisionProcess[S, A],
        states: Distribution[S],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float,
        tolerance: float = 1e-6
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    """Monte Carlo control with function approximation and a decaying ϵ.

    Endless generator. Each iteration simulates one episode under the current
    policy, feeds its ((state, action), return) pairs to the approximation's
    ``update``, rebuilds the policy from the updated approximation, and then
    yields that approximation.

    The first episode uses ``policy_from_q`` without an explicit ϵ (i.e.
    whatever that function's default is); after n completed episodes the
    policy is rebuilt with ϵ = 1 / (n + 1).

    Args:
        mdp: The decision process to simulate.
        states: Distribution passed to ``mdp.simulate_actions`` for the start
            of each episode.
        approx_0: Initial Q-value approximation.
        γ: Discount factor forwarded to ``returns``.
        tolerance: Forwarded to ``returns``.

    Yields:
        The Q-value approximation after each episode's update.
    """
    q_approx = approx_0
    policy = markov_decision_process.policy_from_q(q_approx, mdp)
    episode_index = 1
    while True:
        episode_steps = returns(
            mdp.simulate_actions(states, policy), γ, tolerance
        )
        training_pairs = (
            ((visited.state, visited.action), visited.return_)
            for visited in episode_steps
        )
        q_approx = q_approx.update(training_pairs)

        # Bump the counter first so the exploration rate used for the next
        # episode is 1 / (episodes completed + 1).
        episode_index += 1
        policy = markov_decision_process.policy_from_q(
            q_approx, mdp, 1./ episode_index
        )
        yield q_approx