In [20]:
import numpy as np

class FrozenLake_Slippery:
  def __init__(self):
    #Initialize the model (2 pts)
    self.height = 4
    self.width = 8
    self.state_space = [(x,y) for x in range(self.height) for y in range(self.width)]         #Define state space
    self.action_space = [0,1,2,3]        #Define action space (0: Up, 1: Down, 2: Left, 3: Right)
    self.start_state = (0,0)           #Define start point
    self.goal_state = (3,7)            #Define goal point
    self.holes = [(0,4),(1,5),(2,1),(2,7),(3,2)]               #Define holes
    self.slippery = 0.15
    self.state = self.start_state

  def reset(self):
    self.state = self.start_state
    return self.state

  def step(self, action):
    #Define reward-state transition (6 pts)
    x = self.state[0]
    y = self.state[1]

    #true_action: actual movement
    if np.random.rand() > self.slippery:
      true_action = action
    else:
      actions = [a for a in [0, 1, 2, 3] if a != action]
      true_action = np.random.choice(actions)


    if true_action == 0:
      x = max(x-1,0)     #state transition after choosing Up


    elif true_action == 1:
      x = min(x+1,3)       #state transition after choosing Down


    elif true_action == 2:
      y = max(y-1,0)      #state transition after choosing Left


    elif true_action == 3:
      y = min(y+1,7)        #state transition after choosing Right



    self.state = (x, y)

    if self.state in self.holes:
      return self.state, -2, True                      #(terminal state, reward, done)

    elif self.state == self.goal_state:
      return self.state, 2, True                     #(terminal state, reward, done)

    else:
      return self.state, 0, False                     #(non-terminal state, reward, done)

In [21]:
def equiprobable_random_policy(state):
  return np.random.choice([0, 1, 2, 3])

In [22]:
#Generate 10 episodes (2 pts) - Do not change, just run
np.random.seed(1)
env = FrozenLake_Slippery()

i = 1
while i <= 10:
  episode = []
  state = env.reset()
  done = False
  while (not done):
    action = equiprobable_random_policy(state)
    next_state, reward, done = env.step(action)
    episode.append((state, action, reward))
    state = next_state
  episode.append((state, 'Terminal'))
  print('Episode', i, ':', episode)
  i += 1

Episode 1 : [((0, 0), 1, 0), ((1, 0), 0, 0), ((1, 0), 3, 0), ((1, 1), 1, -2), ((2, 1), 'Terminal')]
Episode 2 : [((0, 0), 1, 0), ((1, 0), 1, 0), ((2, 0), 2, 0), ((2, 0), 0, 0), ((1, 0), 2, 0), ((1, 0), 2, 0), ((1, 0), 3, 0), ((1, 1), 1, 0), ((0, 1), 2, 0), ((0, 0), 1, 0), ((1, 0), 1, 0), ((2, 0), 1, 0), ((3, 0), 1, 0), ((3, 0), 3, 0), ((3, 1), 1, 0), ((3, 1), 3, -2), ((3, 2), 'Terminal')]
Episode 3 : [((0, 0), 1, 0), ((1, 0), 3, 0), ((1, 1), 1, 0), ((1, 2), 0, 0), ((0, 2), 3, 0), ((0, 3), 3, -2), ((0, 4), 'Terminal')]
Episode 4 : [((0, 0), 1, 0), ((1, 0), 3, 0), ((1, 1), 1, -2), ((2, 1), 'Terminal')]
Episode 5 : [((0, 0), 1, 0), ((0, 1), 3, 0), ((1, 1), 1, -2), ((2, 1), 'Terminal')]
Episode 6 : [((0, 0), 0, 0), ((0, 0), 2, 0), ((0, 0), 3, 0), ((0, 1), 3, 0), ((0, 0), 2, 0), ((0, 0), 0, 0), ((0, 0), 1, 0), ((1, 0), 3, 0), ((1, 1), 1, -2), ((2, 1), 'Terminal')]
Episode 7 : [((0, 0), 0, 0), ((0, 1), 0, 0), ((0, 0), 0, 0), ((0, 0), 3, 0), ((0, 1), 0, 0), ((0, 1), 1, 0), ((1, 1), 0, 0), ((0

In [23]:
#Choose SARSA or QLearning (If you don't use SARSA, delete this code)
class QLearning:
  def __init__(self, env, gamma = 0.9, epsilon = 0.1, alpha = 0.2):  #You should determine epsilon and alpha by yourself
    #Initialize (5 pts)
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.alpha = alpha
    self.q_table = {state: np.zeros(len(self.env.action_space)) for state in self.env.state_space}

  def epsilon_greedy(self, state):
      if np.random.rand() < self.epsilon:
        return np.random.choice(self.env.action_space)
      else:
        return np.argmax(self.q_table[state])

  def control(self, episodes = 200):  #You should determine the number of episodes by yourself
    for i in range(1, episodes+1):
      if i % int(episodes/10) == 0:
        print('Episode ', i, '/', episodes, ': ', '[', '*'*int(i/(episodes/10)), '-'*int((episodes - i)/(episodes/10)), ']')

      #Initialize (do not change this code)
      state = self.env.reset()
      done = False

      while (not done):
        action = self.epsilon_greedy(state)
        next_state, reward, done = self.env.step(action)
        self.q_table[state][action] += self.alpha*(reward + self.gamma*max(self.q_table[next_state]) - self.q_table[state][action])
        state = next_state

    policy = {state: np.argmax(actions) for state, actions in self.q_table.items()}
    return policy, self.q_table

In [24]:
class double_Qlearning:
  def __init__(self, env, gamma = 0.9, epsilon = 0.1, alpha = 0.2):  #You should determine epsilon and alpha by yourself
    #Initialize (5 pts)
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.alpha = alpha
    self.q_table_action = {state: np.zeros(len(self.env.action_space)) for state in self.env.state_space}
    self.q_table_value = {state: np.zeros(len(self.env.action_space)) for state in self.env.state_space}

  def epsilon_greedy(self, state):
      if np.random.rand() < self.epsilon:
        return np.random.choice(self.env.action_space)
      else:
        return np.argmax(self.q_table_action[state])

  def control(self, episodes = 200):  #You should determine the number of episodes by yourself
    for i in range(1, episodes+1):
      if i % int(episodes/10) == 0:
        print('DQL Episode ', i, '/', episodes, ': ', '[', '*'*int(i/(episodes/10)), '-'*int((episodes - i)/(episodes/10)), ']')

      #Initialize (do not change this code)
      state = self.env.reset()
      done = False

      while (not done):
        action = self.epsilon_greedy(state)
        next_state, reward, done = self.env.step(action)
        A = np.argmax(self.q_table_action[next_state])
        self.q_table_action[state][action] += self.alpha*(reward + self.gamma*(self.q_table_value[next_state][A]) - self.q_table_action[state][action])
        state = next_state

    policy = {state: np.argmax(actions) for state, actions in self.q_table_action.items()}
    return policy, self.q_table_action

In [25]:
env = FrozenLake_Slippery()

#Determine the hyperparameters by yourself
QL_FrozenLake = QLearning(env, gamma = 0.9, epsilon = 0.1, alpha =0.2)
DQL_FrozenLake = double_Qlearning(env, gamma = 0.9, epsilon = 0.1, alpha =0.2)
opt_policy, q_table = QL_FrozenLake.control(episodes = 200)
DQL_opt_policy, q_table_action = DQL_FrozenLake.control(episodes = 200)

Episode  20 / 200 :  [ * --------- ]
Episode  40 / 200 :  [ ** -------- ]
Episode  60 / 200 :  [ *** ------- ]
Episode  80 / 200 :  [ **** ------ ]
Episode  100 / 200 :  [ ***** ----- ]
Episode  120 / 200 :  [ ****** ---- ]
Episode  140 / 200 :  [ ******* --- ]
Episode  160 / 200 :  [ ******** -- ]
Episode  180 / 200 :  [ ********* - ]
Episode  200 / 200 :  [ **********  ]
DQL Episode  20 / 200 :  [ * --------- ]
DQL Episode  40 / 200 :  [ ** -------- ]
DQL Episode  60 / 200 :  [ *** ------- ]
DQL Episode  80 / 200 :  [ **** ------ ]
DQL Episode  100 / 200 :  [ ***** ----- ]
DQL Episode  120 / 200 :  [ ****** ---- ]
DQL Episode  140 / 200 :  [ ******* --- ]
DQL Episode  160 / 200 :  [ ******** -- ]
DQL Episode  180 / 200 :  [ ********* - ]
DQL Episode  200 / 200 :  [ **********  ]


In [26]:
#Printing the optimal policy (10 pts)
policy = [[0 for _ in range(8)] for _ in range(4)]

for state, action in opt_policy.items():
  if action == 0:
    policy[state[0]][state[1]] = '^'
  elif action == 1:
    policy[state[0]][state[1]] = 'v'
  elif action == 2:
    policy[state[0]][state[1]] = '<'
  elif action == 3:
    policy[state[0]][state[1]] = '>'

policy[0][4], policy[1][5], policy[2][1], policy[2][7], policy[3][2] = 'H', 'H', 'H', 'H', 'H'
policy[3][7] = 'G'
policy

[['>', '>', '>', 'v', 'H', '>', '>', '^'],
 ['^', '<', '^', 'v', 'v', 'H', '>', '^'],
 ['^', 'H', '^', '>', 'v', '>', 'v', 'H'],
 ['<', '<', 'H', '>', '>', '>', '>', 'G']]

In [27]:
DQL_policy = [[0 for _ in range(8)] for _ in range(4)]

for state, action in DQL_opt_policy.items():
  if action == 0:
    DQL_policy[state[0]][state[1]] = '^'
  elif action == 1:
    DQL_policy[state[0]][state[1]] = 'v'
  elif action == 2:
    DQL_policy[state[0]][state[1]] = '<'
  elif action == 3:
    DQL_policy[state[0]][state[1]] = '>'

DQL_policy[0][4], DQL_policy[1][5], DQL_policy[2][1], DQL_policy[2][7], DQL_policy[3][2] = 'H', 'H', 'H', 'H', 'H'
DQL_policy[3][7] = 'G'
DQL_policy

[['^', '^', '^', '^', 'H', '^', '^', '^'],
 ['^', '>', '^', '^', 'v', 'H', '^', '^'],
 ['v', 'H', '^', '^', '^', '<', '^', 'H'],
 ['^', 'v', 'H', '^', '^', '^', '^', 'G']]