In [1]:
import numpy as np
import random

class FrozenLake:
  def __init__(self):
    #Initialize the model (4 pts)
    self.height = 4
    self.width = 4
    self.state_space = [(x,y) for x in range(self.height) for y in range(self.width)]         #Define state space
    self.action_space = [0,1,2,3]        #Define action space (0: Up, 1: Down, 2: Left, 3: Right)
    self.start_state = (0,0)           #Define start point
    self.goal_state = (3,3)            #Define goal point
    self.holes = [(0,2),(1,0),(2,1),(2,3)]               #Define holes
    self.state = self.start_state

  def reset(self):
    self.state = self.start_state
    return self.state

  def step(self, action):
    #Define reward-state transition (4 pts)
    x = self.state[0]
    y = self.state[1]

    if action == 0:
      x = max(x-1,0)               #state transition after choosing Up


    elif action == 1:
      x = min(x+1,3)             #state transition after choosing Down


    elif action == 2:
      y = max(y-1,0)             #state transition after choosing Left


    elif action == 3:
      y = min(y+1,3)            #state transition after choosing Right

    self.state = (x, y)

    if self.state in self.holes:
      return self.state, -2, True                     #(terminal state, reward, done)

    elif self.state == self.goal_state:
      return  self.state, 2, True                     #(terminal state, reward, done)

    else:
      return  self.state, 0, False                      #(non-terminal state, reward, done)


In [2]:
def equiprobable_random_policy(state):
  return np.random.choice([0, 1, 2, 3])

In [3]:
#Generate 10 episodes (2 pts) - Do not change, just run
np.random.seed(1)
env = FrozenLake()

i = 1
while i <= 10:
  episode = []
  state = env.reset()
  done = False
  while (not done):
    action = equiprobable_random_policy(state)
    next_state, reward, done = env.step(action)
    episode.append((state, action, reward))
    state = next_state
  episode.append((state, 'Terminal'))
  print('Episode', i, ':', episode)
  i += 1

Episode 1 : [((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 2 : [((0, 0), 3, 0), ((0, 1), 0, 0), ((0, 1), 0, 0), ((0, 1), 3, -2), ((0, 2), 'Terminal')]
Episode 3 : [((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 4 : [((0, 0), 3, 0), ((0, 1), 1, 0), ((1, 1), 3, 0), ((1, 2), 0, -2), ((0, 2), 'Terminal')]
Episode 5 : [((0, 0), 0, 0), ((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 6 : [((0, 0), 0, 0), ((0, 0), 3, 0), ((0, 1), 1, 0), ((1, 1), 0, 0), ((0, 1), 2, 0), ((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 7 : [((0, 0), 2, 0), ((0, 0), 0, 0), ((0, 0), 2, 0), ((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 8 : [((0, 0), 2, 0), ((0, 0), 0, 0), ((0, 0), 3, 0), ((0, 1), 0, 0), ((0, 1), 2, 0), ((0, 0), 0, 0), ((0, 0), 1, -2), ((1, 0), 'Terminal')]
Episode 9 : [((0, 0), 2, 0), ((0, 0), 2, 0), ((0, 0), 0, 0), ((0, 0), 3, 0), ((0, 1), 3, -2), ((0, 2), 'Terminal')]
Episode 10 : [((0, 0), 1, -2), ((1, 0), 'Terminal')]


In [4]:
def argmax(arr):
    return np.random.choice([idx for idx in range(len(arr)) if arr[idx] == arr.max()])

In [5]:
class MC_Control:
  def __init__(self, env, gamma = 0.9, epsilon = 0.2):
    #Initialize the model (2 pts)
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.q_table = {state: np.zeros(len(self.env.action_space)) for state in self.env.state_space}
    self.returns = {state: {action: 0 for action in self.env.action_space} for state in self.env.state_space}
    self.visits = {state: {action: 0 for action in self.env.action_space} for state in self.env.state_space}

  #Define epsilon-greedy (2 pts)
  def epsilon_greedy(self, state):
    if np.random.rand() < self.epsilon:
      return np.random.choice(self.env.action_space)
    else:
      return argmax(self.q_table[state])

  #Generate episodes (2 pts)
  def generate_episode(self):
    episode = []
    state = self.env.reset()
    done = False

    while (not done):
      action = self.epsilon_greedy(state)
      next_state, reward, done = self.env.step(action)
      episode.append((state, action, reward))
      state = next_state
    return episode


  #Update first-visit Q table (10 pts)
  def update_q(self, episode):
    G = 0

    first_visit = {}
    # Store first visit of (s, a) in first_visit dictionary using for statement
    for t, (state, action, reward) in enumerate(episode):
      if state not in first_visit:
            first_visit[state] = {}
      if action not in first_visit[state]:
            first_visit[state][action] = t

    for t in range(len(episode)-1, -1, -1):
      state, action, reward = episode[t]
      G = reward + self.gamma*G

      # Update q_table if (s, a) is the first-visit
      if first_visit[state][action] == t:
        self.returns[state][action] += G
        self.visits[state][action] += 1
        self.q_table[state][action] = self.returns[state][action]/self.visits[state][action]

  def control(self, episodes=10000):
    for i in range(1, episodes+1):
      if i % int(episodes/10) == 0:
        print('Episode ', i, '/', episodes, ': ', '[', '*'*int(i/(episodes/10)), '-'*int((episodes - i)/(episodes/10)), ']')
      episode = self.generate_episode()
      self.update_q(episode)

    policy = {state: argmax(actions) for state, actions in self.q_table.items()}
    return policy, self.q_table

In [6]:
env = FrozenLake()

MC_FrozenLake = MC_Control(env)
opt_policy, q_table = MC_FrozenLake.control()

Episode  1000 / 10000 :  [ * --------- ]
Episode  2000 / 10000 :  [ ** -------- ]
Episode  3000 / 10000 :  [ *** ------- ]
Episode  4000 / 10000 :  [ **** ------ ]
Episode  5000 / 10000 :  [ ***** ----- ]
Episode  6000 / 10000 :  [ ****** ---- ]
Episode  7000 / 10000 :  [ ******* --- ]
Episode  8000 / 10000 :  [ ******** -- ]
Episode  9000 / 10000 :  [ ********* - ]
Episode  10000 / 10000 :  [ **********  ]


In [7]:
#Optimal policy - Do not change, just run(4 pts)
policy = [[0 for _ in range(4)] for _ in range(4)]

for state, action in opt_policy.items():
  if action == 0:
    policy[state[0]][state[1]] = '^'
  elif action == 1:
    policy[state[0]][state[1]] = 'v'
  elif action == 2:
    policy[state[0]][state[1]] = '<'
  elif action == 3:
    policy[state[0]][state[1]] = '>'

policy[0][2], policy[1][0], policy[2][1], policy[2][3] = 'H', 'H', 'H', 'H'
policy[3][3] = 'G'
policy

[['^', '<', 'H', '>'],
 ['H', '^', '>', '<'],
 ['^', 'H', '^', 'H'],
 ['<', '>', 'v', 'G']]