In [70]:
import numpy as np
import scipy
import scipy.stats as st

from mdp import *

In [92]:
def random_mdp(ns, na, pvar=None, rvar=None):
    """Generate a random MDP with `ns` states and `na` actions.

    Parameters
    ----------
    ns : int
        Number of states.
    na : int
        Number of actions.
    pvar : frozen `scipy.stats` distribution, optional
        Source of the unnormalized transition weights; `pvar.rvs(ns)` is
        called once per (state, action) pair. Defaults to `uniform()` on
        [0, 1). The draws are divided by their sum, so they only form a
        valid probability vector when they are non-negative and not all zero.
    rvar : frozen `scipy.stats` distribution, optional
        Source of the rewards; `rvar.rvs()` is called once per
        (state, action) pair. Defaults to `uniform()` on [0, 1).

    Returns
    -------
    (P, R) : tuple of ndarray
        Both arrays have shape `(ns, na, ns)` and are indexed as
        `[state, action, next_state]`. `P[s, a]` sums to 1. `R[s, a]` holds
        the same value for every next state, because a single scalar is drawn
        per (state, action) pair and broadcast along the last axis.
    """
    if rvar is None:
        rvar = scipy.stats.uniform()
    if pvar is None:
        pvar = scipy.stats.uniform()

    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    P = np.zeros((ns, na, ns), dtype=float)
    R = np.zeros((ns, na, ns), dtype=float)
    for s, a in np.ndindex(ns, na):
        # Draw the reward first, then the weights, so the order in which the
        # random stream is consumed stays fixed for seeded runs.
        R[s, a] = rvar.rvs()
        weights = pvar.rvs(ns)
        P[s, a] = weights / np.sum(weights)
    return P, R

In [73]:
def uniform_policy(ns, na):
    """Return the uniform random policy as an `(ns, na)` matrix.

    Every entry equals `1 / na`, so each row (one per state) is a probability
    distribution that gives all `na` actions equal weight.
    """
    # The `np.float` alias was removed in NumPy 1.24; use the builtin `float`.
    return np.ones((ns, na), dtype=float) / na

In [76]:
# Uniform policy for 3 states and 4 actions.
# NOTE(review): the MDP sampled in the next cell has only 3 actions, and `pol`
# is reassigned to a 3x3 matrix in cell In [80] before it is used.
pol = uniform_policy(3, 4)

In [77]:
# Sample a random MDP with 3 states and 3 actions (default uniform draws).
# NOTE(review): no random seed is set in the visible cells, so P and R (and
# every number printed below) presumably change from run to run.
P, R = random_mdp(3, 3)

In [80]:
# Deterministic policy for 3 states and 3 actions: each row is [1, 0, 0], so
# every state selects action 0 with probability 1.
pol = np.tile([1.0, 0.0, 0.0], (3, 1))

In [82]:
# Sanity check of the deterministic policy: every row of `pol` is [1, 0, 0],
# so the product repeats row 0 of the right-hand matrix in each output row.
np.dot(pol, np.arange(9).reshape(3,3))

array([[ 0.,  1.,  2.],
       [ 0.,  1.,  2.],
       [ 0.,  1.,  2.]])

In [78]:
# FIXME(review): `np.dot` requires two array arguments, so this call raises a
# TypeError as written. The output recorded below must come from an earlier
# version of this cell; restore the intended arguments or delete the cell.
np.dot()

array([[ 0.83626862,  0.02681555,  0.86952559],
       [ 0.89067484,  0.24506923,  0.82270243],
       [ 0.54573822,  0.59448958,  0.6148822 ]])

In [91]:
def policy_transition_matrix(P, pol):
    """Get the state-to-state transition matrix under a fixed policy.

    Parameters
    ----------
    P : array_like, shape (ns, na, ns)
        Transition probabilities indexed as `[state, action, next_state]`.
    pol : array_like, shape (ns, na)
        Policy matrix; `pol[s, a]` is the probability of taking action `a`
        in state `s`.

    Returns
    -------
    ndarray, shape (ns, ns)
        `P_pi[s, sp] = sum_a pol[s, a] * P[s, a, sp]`.
    """
    # The `np.float` alias was removed in NumPy 1.24; use the builtin `float`.
    P = np.asarray(P, dtype=float)
    ns, na, _ = P.shape
    # Only the first `ns` rows and `na` columns of the policy take part, so a
    # policy matrix larger than the MDP is truncated rather than rejected.
    pol = np.asarray(pol, dtype=float)[:ns, :na]
    # Contract the action axis: weight each action's next-state distribution
    # by the probability of choosing that action in the current state.
    return np.einsum('sa,sat->st', pol, P)

In [100]:
def expected_reward(P, R, pol):
    """Calculate the expected one-step reward of each state under a policy.

    Parameters
    ----------
    P : array_like, shape (ns, na, ns)
        Transition probabilities indexed as `[state, action, next_state]`.
    R : array_like, shape (ns, na, ns)
        Rewards with the same indexing as `P`.
    pol : array_like, shape (ns, na)
        Policy matrix; `pol[s, a]` is the probability of taking action `a`
        in state `s`.

    Returns
    -------
    ndarray, shape (ns,)
        `r[s] = sum_a sum_sp pol[s, a] * P[s, a, sp] * R[s, a, sp]`.
    """
    # The `np.float` alias was removed in NumPy 1.24; use the builtin `float`.
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    pol = np.asarray(pol, dtype=float)
    ns, na, _ = P.shape
    # Restrict every operand to the index ranges implied by P's leading shape
    # so oversized inputs are truncated rather than rejected.
    return np.einsum(
        'sa,sat,sat->s',
        pol[:ns, :na],
        P[:, :, :ns],
        R[:ns, :na, :ns],
    )

In [101]:
# Replace the sampled rewards with the integers 0..26 laid out as
# [state, action, next_state], so the expected reward below is easy to verify.
R = np.arange(27).reshape(3,3,3)

In [102]:
# Expected one-step reward of each state under `pol`, using the integer
# rewards assigned to `R` in cell In [101].
expected_reward(P, R, pol)

array([  1.26258784,  10.10298962,  18.75942815])

In [88]:
# `unit` is not defined in this file; presumably it comes from
# `from mdp import *` and `unit(n, i)` is the i-th basis vector of length n
# (TODO confirm). If so, the inner dot contracts the action axis of P (np.dot
# sums over the second-to-last axis of an N-D second argument) and the outer
# dot selects state 0, yielding P[0, 0, :].
np.dot(unit(3, 0), np.dot(unit(3, 0), P))

array([ 0.01204339,  0.71332538,  0.27463123])

In [89]:
# Same quantity via the helper: assuming `unit(3, 0)` is the first basis
# vector (TODO confirm against `mdp`), this picks row 0 of the policy
# transition matrix. The recorded output matches the manual computation in
# cell In [88].
np.dot(unit(3, 0), policy_transition_matrix(P, pol))

array([ 0.01204339,  0.71332538,  0.27463123])

In [90]:
# Full state-to-state transition matrix under `pol`; each row of the recorded
# output sums to 1.
policy_transition_matrix(P, pol)

array([[ 0.01204339,  0.71332538,  0.27463123],
       [ 0.41115633,  0.07469773,  0.51414594],
       [ 0.41733229,  0.40590727,  0.17676044]])

In [51]:
# `unit` is not defined in this file (presumably provided by
# `from mdp import *`) — TODO confirm what it returns.
# NOTE(review): this cell consists only of assignments, so it displays
# nothing; the array recorded beneath it appears to be stale output.
s = unit(3,1)
a = unit(3,0)

array([ 3.,  4.,  5.])

In [52]:
# `rand_p` and `prob_next` are not defined in this file; presumably they come
# from `from mdp import *` (TODO confirm their contracts).
# Print the total of `prob_next(s, a, pmat)` for each of the 3 x 3 index
# pairs; the recorded output is 1.0 for every pair.
# NOTE: the loop rebinds `s` and `a` to integers, replacing the values
# assigned in cell In [51].
pmat = rand_p(3, 3)
for s in range(3):
    for a in range(3):
        total = np.sum(prob_next(s, a, pmat))
        print(total)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
