In [1]:
import numpy as np

In [2]:
# Initial state distribution as a 1x2 row vector: all probability mass on state 0.
v1 = np.array([[1.0, 0.0]])

# Transition matrix T of the two-state chain. Each row sums to 1, and the
# distribution is propagated later as a row vector times T (np.dot(v, T)).
T = np.array([
    [0.90, 0.10],
    [0.50, 0.50],
])

In [3]:
# Multi-step transition matrices: T raised to the 3rd, 50th and 100th power.
T_3, T_50, T_100 = (np.linalg.matrix_power(T, steps) for steps in (3, 50, 100))

In [4]:
# Print T and its 3-, 50- and 100-step powers, each under its own header.
# NOTE: the first header puts its space after the newline ("T:\n "), unlike the
# others, so the first row of T is printed with one extra leading space.
for header, matrix in (
    ("T:\n ", T),
    ("T_3: \n", T_3),
    ("T_50: \n", T_50),
    ("T_100: \n", T_100),
):
    print(header + str(matrix))

T:
 [[0.9 0.1]
 [0.5 0.5]]
T_3: 
[[0.844 0.156]
 [0.78  0.22 ]]
T_50: 
[[0.83333333 0.16666667]
 [0.83333333 0.16666667]]
T_100: 
[[0.83333333 0.16666667]
 [0.83333333 0.16666667]]


In [5]:
# Print the initial distribution v1, then the distribution reached from it
# after 1, 3, 50 and 100 steps (row vector times the matching power of T).
print("v: " + str(v1))
for label, step_matrix in (
    ("v_1: ", T),
    ("v_3: ", T_3),
    ("v_50: ", T_50),
    ("v_100: ", T_100),
):
    print(label + str(np.dot(v1, step_matrix)))

v: [[1. 0.]]
v_1: [[0.9 0.1]]
v_3: [[0.844 0.156]]
v_50: [[0.83333333 0.16666667]]
v_100: [[0.83333333 0.16666667]]


In [6]:
# A different initial distribution: equal probability for both states.
# It is kept as a plain Python list, so it prints with a comma, unlike the
# NumPy array v1.
v2 = [0.5, 0.5]

# Print v2, then the distribution reached from it after 1, 3, 50 and 100 steps.
print("v: " + str(v2))
for label, step_matrix in (
    ("v_1: ", T),
    ("v_3: ", T_3),
    ("v_50: ", T_50),
    ("v_100: ", T_100),
):
    print(label + str(np.dot(v2, step_matrix)))

v: [0.5, 0.5]
v_1: [0.7 0.3]
v_3: [0.812 0.188]
v_50: [0.83333333 0.16666667]
v_100: [0.83333333 0.16666667]


### The chain converges to equilibrium, meaning that as time progresses it forgets its starting distribution

# Markov Decision Process

- Problem: the agent has to maximise the reward, avoiding states which return negative values and choosing the ones which return positive values.

- Solution: find a policy π(s) which returns, for each state s, the action with the highest reward.

## Grid World
Our simple world is a 4x3 matrix
- starting point s0 is at (1,1)
- the charging station at (4,3)
- dangerous stairs at (4,2)
- an obstacle at (2,2)

The robot diverges from the intended path 20% of the time: if it decides to go ahead, in 10% of the cases it ends up in the state to its left and in 10% of the cases in the state to its right. If the robot hits the wall or the obstacle, it bounces back to its previous position.

### The robot has to find the best way to reach the charging station (Reward +1) and to avoid falling down the flight of stairs (Reward -1). 

 The main characteristics of this world are the following:

1. Discrete time and space
2. Fully observable
3. Infinite horizon
4. Known Transition Model


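Depending on the reward R(s) given in the non-terminal states, we can distinguish four cases:

1. R(s) ≤ −1.6284: extremely low battery
2. −0.4278 ≤ R(s) ≤ −0.085: quite low battery
3. −0.0221 ≤ R(s) ≤ 0: slightly low battery
4. R(s) > 0: fully charged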

For each one of these conditions we can try to guess which policy the agent will choose. In the extremely low battery scenario the agent receives such a high punishment that it only wants to stop the pain as soon as possible: life is so painful that falling down the flight of stairs is a good choice. In the quite low battery scenario the agent takes the shortest path to the charging station and does not care about the risk of falling down. In the slightly low battery case the robot does not take risks at all and avoids the stairs, even at the cost of banging against the wall. Finally, in the fully charged case the agent avoids both exits and remains in a steady state, receiving a positive reward at each time step.

In [7]:
# Load the array stored in "T.npy"; the path is relative to the current
# working directory. Presumably this is the grid world's transition model
# (confirm against whatever produced the file).
data = np.load("T.npy")

In [8]:
# Inspect the dimensions of the loaded array (the recorded output is (12, 12, 4)).
data.shape

(12, 12, 4)

In [9]:
# Display the 2-D slice at index 0 of the last axis.
# Presumably this is the state-to-state transition matrix for one action;
# which action index 0 denotes is not visible here (TODO confirm).
data[:,:,0]

array([[0.9, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0.1, 0.8, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.1, 0.8, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0.8, 0. , 0. , 0. , 0.2, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.8, 0. , 0. , 0. , 0.1, 0.1, 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.8, 0. , 0. , 0. , 0.1, 0.1, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0.8, 0.1, 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0.8, 0. , 0. , 0.1, 0. , 0.1],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.8, 0. , 0. , 0.1, 0.1]])

## Monte Carlo Method

Suppose we do not know the transition probability model or the reward function.
Finding the transition probabilities is not always practical, so we use the Monte Carlo (MC) method to estimate the value function instead.

In [10]:
from gridworld import GridWorld

In [12]:
# Create the grid-world environment.
# The world has 3 rows and 4 columns
# (presumably the constructor takes rows first, then columns; confirm against GridWorld).
env = GridWorld(3, 4)
# Define the state matrix (3 rows x 4 columns, indexed [row, col]).
# 0 marks an ordinary cell, 1 marks the two terminal states in the last
# column, and -1 marks the obstacle at position (1, 1).
state_matrix = np.zeros((3, 4))
state_matrix[0, 3] = 1
state_matrix[1, 3] = 1
state_matrix[1, 1] = -1

# Define the reward matrix.
# The reward is -0.04 for all states but the terminal ones, which give +1
# at (0, 3) and -1 at (1, 3).
reward_matrix = np.full((3, 4), -0.04)
reward_matrix[0, 3] = 1
reward_matrix[1, 3] = -1

# Define the transition matrix.
# For each one of the four actions there is a row of probabilities:
# 0.8 on the diagonal and 0.1 on each of the two neighbouring actions.
# Presumably row = chosen action and column = action actually executed;
# confirm against GridWorld.
transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                              [0.1, 0.8, 0.1, 0.0],
                              [0.0, 0.1, 0.8, 0.1],
                              [0.1, 0.0, 0.1, 0.8]])
# Every row is a probability distribution, so it must be non-negative and sum to 1.
assert (transition_matrix >= 0).all()
assert np.allclose(transition_matrix.sum(axis=1), 1.0)

# Define the policy matrix.
# 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT, NaN=Obstacle, -1=NoAction
# This is the optimal policy for world with reward=-0.04.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
# Because of the NaN entry the array has a float dtype, so actions read from
# it are floats (e.g. 0.0 rather than 0).
policy_matrix = np.array([[1,      1,  1,  -1],
                          [0, np.nan,  0,  -1],
                          [0,      3,  3,   3]])
# Hand the state, reward and transition matrices over to the environment.
# policy_matrix is not passed to env; it is only used by the rollout loop
# to choose actions.
env.setStateMatrix(state_matrix)
env.setRewardMatrix(reward_matrix)
env.setTransitionMatrix(transition_matrix)

In [13]:
# Reset the environment and keep the returned observation; the rollout loop
# uses observation[0] and observation[1] as the (row, col) index into
# policy_matrix.
observation = env.reset()
# Display the world by printing it as text.
# In the recorded output, "○" appears to be the robot, "#" the obstacle and
# "*" the terminal states (confirm against GridWorld.render).
env.render()

 -  -  -  * 
 -  #  -  * 
 ○  -  -  - 



In [14]:
# Roll out one episode by following policy_matrix, capped at a fixed number
# of steps so the loop always terminates.
max_steps = 1000
for _ in range(max_steps):
    # Look up the action the policy prescribes for the current (row, col) cell.
    # policy_matrix is a float array (it contains NaN), so the action is a
    # float and prints as e.g. 0.0.
    row, col = observation[0], observation[1]
    action = policy_matrix[row, col]
    # step = moves forward at t+1
    observation, reward, done = env.step(action)
    # Blank separator line followed by the step summary, one item per line.
    print(
        "",
        "ACTION: " + str(action),
        "REWARD: " + str(reward),
        "DONE: " + str(done),
        sep="\n",
    )
    env.render()
    # Stop as soon as the environment reports the episode is finished.
    if done:
        break


ACTION: 0.0
REWARD: -0.04
DONE: False
 -  -  -  * 
 ○  #  -  * 
 -  -  -  - 


ACTION: 0.0
REWARD: -0.04
DONE: False
 ○  -  -  * 
 -  #  -  * 
 -  -  -  - 


ACTION: 1.0
REWARD: -0.04
DONE: False
 ○  -  -  * 
 -  #  -  * 
 -  -  -  - 


ACTION: 1.0
REWARD: -0.04
DONE: False
 -  ○  -  * 
 -  #  -  * 
 -  -  -  - 


ACTION: 1.0
REWARD: -0.04
DONE: False
 -  -  ○  * 
 -  #  -  * 
 -  -  -  - 


ACTION: 1.0
REWARD: 1.0
DONE: True
 -  -  -  ○ 
 -  #  -  * 
 -  -  -  - 

