<a href="https://colab.research.google.com/github/rohitgunasekaran/monte-carlo-control/blob/main/MonteCarloControlExp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import warnings ; warnings.filterwarnings('ignore')

import gym
import numpy as np

import random
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)
np.set_printoptions(suppress=True)
random.seed(123); np.random.seed(123)

In [6]:
import numpy as np


if not hasattr(np, "object"):
    np.object = object
    print(" Patched np.object â†’ object (temporary fix)")

In [7]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-m8p6n_vs/gym-walk_dade3f4587dd4a6dade1b0785e3ec978
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-m8p6n_vs/gym-walk_dade3f4587dd4a6dade1b0785e3ec978
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gym-walk
  Building wheel for gym-walk (setup.py) ... [?25l[?25hdone
  Created wheel for gym-walk: filename=gym_walk-0.0.2-py3-none-any.whl size=5377 sha256=c1394f8ada072f00bd584c9228d9c8da3a07990065200208207e7c8697fec838
  Stored in directory: /tmp/pip-ephem-wheel-cache-5bxgcjgp/wheels/bf/23/e5/a94be4a90dd18f7ce958c21f192276cb01ef0daaf2bc66583b
Successfully built gym-walk
Installing collected packages: gym-walk
Successfully installed gym-walk-0.0.2


In [8]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    print(title)
    arrs = {k:v for k,v in enumerate(action_symbols)}
    for s in range(len(P)):
        a = pi[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), arrs[a].rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")

In [9]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    print(title)
    for s in range(len(P)):
        v = V[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), '{}'.format(np.round(v, prec)).rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")

In [10]:
env = gym.make('FrozenLake-v1')
P = env.env.P
init_state = env.reset()
#goal_state = 6
#LEFT, RIGHT = range(2)

In [11]:
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

Exponentially decaying schedule


In [12]:
def decay_schedule(
    init_value, min_value, decay_ratio,
    max_steps, log_start = -2, log_base=10):

  decay_steps = int(max_steps * decay_ratio)
  rem_steps = max_steps - decay_steps
  values = np.logspace(log_start, 0, decay_steps, base=log_base)
  values = (init_value - min_value) * values + min_value
  values = np.pad(values, (0, rem_steps), 'edge')

  return values

Exploratory Policy Trajectories

In [13]:
from itertools import count
import numpy as np

def generate_trajectory(
    select_action, Q, epsilon,
    env, max_steps=200):
  done, trajectory = False, []

  state = env.reset()
  for t in count():
      action = select_action(state, Q, epsilon)
      next_state, reward, done, _ = env.step(action)
      trajectory.append((state, action, reward, next_state))
      state = next_state
      if isinstance(done, np.bool_):
        done = bool(done)
      if done or t >= max_steps - 1:
          break

  return np.array(trajectory, np.object)


Monte Carlo control

In [14]:
def mc_control (env, gamma = 1.0,
                init_alpha = 0.5,min_alpha = 0.01, alpha_decay_ratio = 0.5,
                init_epsilon = 1.0, min_epsilon = 0.1, epsilon_decay_ratio = 0.9,
                n_episodes = 3000, max_steps = 200, first_visit = True):
  # Ensure env is the environment object, not a tuple
  if isinstance(env, tuple):
      env = env[0]

  nS, nA = env.observation_space.n, env.action_space.n

  Q = np.zeros((nS, nA))
  V = np.zeros(nS)
  pi = np.zeros(nS, dtype=int)

  returns = {(s, a): [] for s in range(nS) for a in range(nA)}

  alpha_schedule = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
  epsilon_schedule = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

  def select_action(state, Q, epsilon):
      if random.random() > epsilon:
          return np.argmax(Q[state])
      else:
          return random.randrange(nA)

  for i in range(n_episodes):
      epsilon = epsilon_schedule[i]
      alpha = alpha_schedule[i]

      trajectory = generate_trajectory(select_action, Q, epsilon, env, max_steps)
      states, actions, rewards, next_states = trajectory[:,0], trajectory[:,1], trajectory[:,2], trajectory[:,3]

      G = 0
      for t in reversed(range(len(trajectory))):
          s, a, r, ns = states[t], actions[t], rewards[t], next_states[t]
          G = gamma * G + r

          if first_visit and (s, a) not in [(states[i], actions[i]) for i in range(t)]:
              returns[(s, a)].append(G)
              Q[s, a] = np.mean(returns[(s, a)])
              V[s] = np.max(Q[s])
              pi[s] = np.argmax(Q[s])
          elif not first_visit:
              returns[(s, a)].append(G)
              Q[s, a] = np.mean(returns[(s, a)])
              V[s] = np.max(Q[s])
              pi[s] = np.argmax(Q[s])

  return Q, V, pi

In [17]:
optimal_Q, optimal_V, optimal_pi = mc_control(env)
print('Name:         ROHIT G              Register Number:  212222240083        ')
print_state_value_function(optimal_V, P, n_cols=4, prec=2, title='State-value function:')
print_state_value_function(optimal_Q, P, n_cols=4, prec=2, title='Action-value function:')
print_policy(optimal_pi, P)


State-value function:
| 00   0.01 | 01   0.02 | 02   0.03 | 03   0.03 |
| 04   0.02 |           | 06   0.04 |           |
| 08   0.03 | 09   0.19 | 10   0.19 |           |
|           | 13   0.29 | 14   0.53 |           |
Action-value function:
| 00 [0.   0.01 0.01 0.  ] | 01 [0.   0.02 0.   0.01] | 02 [0.02 0.03 0.03 0.  ] | 03 [0.03 0.   0.   0.  ] |
| 04 [0.   0.02 0.01 0.  ] |           | 06 [0.03 0.03 0.04 0.  ] |           |
| 08 [0.   0.01 0.03 0.02] | 09 [0.02 0.19 0.05 0.  ] | 10 [0.11 0.12 0.19 0.04] |           |
|           | 13 [0.01 0.17 0.29 0.18] | 14 [0.35 0.5  0.53 0.42] |           |
Policy:
| 00      > | 01      v | 02      > | 03      < |
| 04      v |           | 06      > |           |
| 08      > | 09      v | 10      > |           |
|           | 13      > | 14      > |           |


In [20]:

# Find the probability of success and the mean return of you your policy
print('Name:    ROHIT G                   Register Number:   212222240083       ')
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, optimal_pi, goal_state=goal_state)*100,
    mean_return(env, optimal_pi)))


Name:    ROHIT G                   Register Number:   212222240083       


NameError: name 'goal_state' is not defined