# Determine the value function for a given policy using Monte Carlo

In [0]:
import numpy as np

### Define the graph as an MDP with states, actions & transitions
![graph](https://drive.google.com/uc?id=11XU7Qm4jlOVhCf5q5126jmS6dKFcXBtP)

Definition of states and their respective types: 
 - `0`: absorbing state
 - `1`: regular state

In [0]:
# State id -> state type (1 = regular, 0 = absorbing).
# States 0..8 are regular; state 9 is the only absorbing state.
states = {state_id: (0 if state_id == 9 else 1) for state_id in range(10)}



Mapping of states to the list of actions that can be taken. Actions are considered in a clockwise order given a state in the graph. 

In [0]:
# State id -> tuple of action indices available in that state.
# The tuple below lists how many actions states 0..9 offer, in order;
# actions are always numbered 0..n-1.
actions = {
    state_id: tuple(range(action_count))
    for state_id, action_count in enumerate((3, 2, 2, 2, 2, 4, 2, 2, 2, 1))
}

Mapping of a state and an action to the resulting state and reward: `(s1, a) -> (s2, r)` where `s1` is the current state and `a` is the action taken, resulting in state `s2` and yielding reward `r`.

In [0]:
# (state, action) -> (next state, reward).
# The outcomes are grouped per state; the position of an outcome in its list
# is the action index that triggers it.
transitions = {
    (state_id, action): outcome
    for state_id, outcomes in {
        0: [(1, -5), (2, -2), (3, 1)],
        1: [(4, -2), (2, 1)],
        2: [(5, 3), (3, 2)],
        3: [(5, -1), (6, -5)],
        4: [(7, -2), (5, 5)],
        5: [(7, -2), (8, -4), (9, -7), (6, -2)],
        6: [(5, -3), (9, 1)],
        7: [(8, -4), (5, -2)],
        8: [(9, 10), (5, -4)],
        # state 9 (absorbing state): loops onto itself with zero reward
        9: [(9, 0)],
    }.items()
    for action, outcome in enumerate(outcomes)
}



`MDP` defines a Markov decision process, based on the graph above.

In [0]:
# Bundle the three tables describing the graph into a single MDP object.
MDP = dict(
    states=states,
    actions=actions,
    transitions=transitions,
)


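Some example policies.

A policy maps a state `s` to a list of probabilities `p`, in which the `i`*th* probability `p_i` is the probability that the `i`*th* action from `actions[s]` is taken in state `s`. The absorbing state `9` has no entry, because the rollouts below stop as soon as an absorbing state is reached and never sample an action there.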

In [0]:
# Deterministic policy: in every non-absorbing state exactly one action has
# probability 1. Each entry below is (index of the chosen action, number of
# actions in that state) and is expanded into a one-hot probability list.
policy1 = {
    state_id: [int(action == chosen) for action in range(action_count)]
    for state_id, (chosen, action_count) in {
        0: (1, 3),
        1: (1, 2),
        2: (1, 2),
        3: (0, 2),
        4: (0, 2),
        5: (1, 4),
        6: (1, 2),
        7: (0, 2),
        8: (0, 2),
    }.items()
}

# Policy that is stochastic only in state 0; every other state picks a single
# action with probability 1. The list position is the state id (0..8).
policy2 = dict(enumerate([
    [0.3, 0.5, 0.2],  # state 0
    [0, 1],           # state 1
    [0, 1],           # state 2
    [1, 0],           # state 3
    [1, 0],           # state 4
    [0, 1, 0, 0],     # state 5
    [0, 1],           # state 6
    [1, 0],           # state 7
    [1, 0],           # state 8
]))

# Policy that is stochastic in every state except state 6, which always takes
# action 0. The list position is the state id (0..8).
policy3 = dict(enumerate([
    [0.4, 0.2, 0.4],           # state 0
    [0.7, 0.3],                # state 1
    [0.4, 0.6],                # state 2
    [0.7, 0.3],                # state 3
    [0.2, 0.8],                # state 4
    [0.25, 0.25, 0.25, 0.25],  # state 5
    [1, 0],                    # state 6
    [0.5, 0.5],                # state 7
    [0.6, 0.4],                # state 8
]))


### Determine the value function for deterministic policy `policy1`.

In [0]:
# Monte Carlo policy evaluation for `policy1`.
#
# Every sweep starts one rollout from each state, follows the policy until an
# absorbing state is reached, and averages the undiscounted return G of that
# rollout into the estimate v[s] of the start state.

# Derived from the MDP so the arrays below cannot drift out of sync with the
# graph definition (states are numbered 0..number_states-1).
number_states = len(MDP['states'])
#value function v
v = np.zeros(number_states)
#number of rollouts done per state
numberRolloutsPerState = np.zeros(number_states)
#total number of sweeps
numberOfSweeps = 50
#seeded generator so that "Restart & Run All" reproduces the same estimates
rng = np.random.default_rng(0)


#start Monte Carlo
for sweep in range(numberOfSweeps):
    #start of a sweep = loop over all states
    for s in range(number_states):
        #set cumulative reward to zero
        G = 0
        currentState = s
        #roll out until an absorbing state (type 0) is reached; a rollout that
        #starts in the absorbing state ends immediately with G = 0
        while MDP['states'][currentState] != 0:
            #pick an action according to its probability
            actionProbabilities = policy1[currentState]
            a = int(rng.choice(len(actionProbabilities), p=actionProbabilities))
            #a single lookup yields both the next state and the reward
            nextState, r = MDP['transitions'][(currentState, a)]
            #cumulative reward
            G += r
            currentState = nextState

        #update value function in state s: incremental form of the running
        #mean over all returns observed from s so far
        numberRolloutsPerState[s] += 1
        v[s] += (G - v[s]) / numberRolloutsPerState[s]

In [0]:
# Show the estimated value of every state, one per line.
for state_id in range(number_states):
    state_value = v[state_id]
    print(f"State {state_id}: {state_value}")

State 0: 5.0
State 1: 8.0
State 2: 7.0
State 3: 5.0
State 4: 4.0
State 5: 6.0
State 6: 1.0
State 7: 6.0
State 8: 10.0
State 9: 0.0


![policy1](https://drive.google.com/uc?id=1UHIQdZI_aBsncB3KMhcrsbKJBPsCqdZG)

### Determine the value function for stochastic policy `policy2`.

In [0]:
# Monte Carlo policy evaluation for `policy2`.
#
# `policy2` is stochastic in state 0, so the estimate for that state depends
# on the sampled actions; a seeded generator keeps the result reproducible.

# Derived from the MDP so the arrays below cannot drift out of sync with the
# graph definition (states are numbered 0..number_states-1).
number_states = len(MDP['states'])
#value function v
v = np.zeros(number_states)
#number of rollouts done per state
numberRolloutsPerState = np.zeros(number_states)
#total number of sweeps
numberOfSweeps = 1000
#seeded generator so that "Restart & Run All" reproduces the same estimates
rng = np.random.default_rng(0)


#start Monte Carlo
for sweep in range(numberOfSweeps):
    #start of a sweep = loop over all states; the absorbing state is included,
    #its rollout ends immediately with G = 0
    for s in range(number_states):
        #set cumulative reward to zero
        G = 0
        currentState = s
        #roll out until an absorbing state (type 0) is reached
        while MDP['states'][currentState] != 0:
            #pick an action according to its probability
            actionProbabilities = policy2[currentState]
            a = int(rng.choice(len(actionProbabilities), p=actionProbabilities))
            #a single lookup yields both the next state and the reward
            nextState, r = MDP['transitions'][(currentState, a)]
            #cumulative reward
            G += r
            currentState = nextState

        #update value function in state s: incremental form of the running
        #mean over all returns observed from s so far
        numberRolloutsPerState[s] += 1
        v[s] += (G - v[s]) / numberRolloutsPerState[s]

In [0]:
# Show the estimated value of every state, one per line.
for state_id in range(number_states):
    state_value = v[state_id]
    print(f"State {state_id}: {state_value}")

State 0: 4.555000000000003
State 1: 8.0
State 2: 7.0
State 3: 5.0
State 4: 4.0
State 5: 6.0
State 6: 1.0
State 7: 6.0
State 8: 10.0
State 9: 0.0
