In [19]:

class CliffWalking:

    def __init__(self):
        self.map = []
        self.actions = range(4)
        self.start = (3,0)
        self.current_state = self.start
        self.shape = (4, 12)
        #map generation
        self.s = self.to_s(3,0)
        for i in range(4):
            self.map.append([])
            for j in range(12):
                if i != 3:
                    self.map[i].append('G')
                else:
                    if j == 0:
                        self.map[i].append('S')
                    elif j == 11:
                        self.map[i].append('E')
                    else:
                        self.map[i].append('H')
    def to_s(self,row,col):
            return row*12+col

    def step(self, action):
        row,col = self.current_state
        if action == 0:  # Up
            row = max(row - 1, 0)
        elif action == 1:  # Right
            col = min(col + 1, self.shape[1] - 1)
        elif action == 2:  # Down
            row = min(row + 1, self.shape[0] - 1)
        elif action == 3:  # Left
            col = max(col - 1, 0)
        new_row = row
        new_col = col
        done = False
        new_state = (new_row, new_col)
        self.current_state = new_state

        if self.map[new_row][new_col] == 'G':
            reward = -1
        elif self.map[new_row][new_col] == 'S':
            reward = -1
        elif self.map[new_row][new_col] == 'E':
            reward =10
            done = True
        elif self.map[new_row][new_col] == 'H':
            reward = -100
            done = True
            self.current_state = self.start
        else:
            reward = 0
        return self.to_s(new_state[0],new_state[1]),reward,done

    def reset(self):
        self.current_state = self.start
        return self.to_s(self.current_state[0],self.current_state[1])

In [20]:
#Testing env
env = CliffWalking()
actions = [0,0,1,1,1,1,1,1,1,1,1,1,1,2,2]
for action in actions:
    state,reward,done = env.step(action)
    print(state,reward,done)
print("          ")
env.reset()
actions = [0,0,1,1,1,1,1,1,1,1,1,1,2,2]
for action in actions:
    state,reward,done = env.step(action)
    print(state,reward,done)

24 -1 False
12 -1 False
13 -1 False
14 -1 False
15 -1 False
16 -1 False
17 -1 False
18 -1 False
19 -1 False
20 -1 False
21 -1 False
22 -1 False
23 -1 False
35 -1 False
47 10 True
          
24 -1 False
12 -1 False
13 -1 False
14 -1 False
15 -1 False
16 -1 False
17 -1 False
18 -1 False
19 -1 False
20 -1 False
21 -1 False
22 -1 False
34 -1 False
46 -100 True


In [21]:
import numpy as np
#Q-learning
Q = np.zeros((48,len(env.actions)))
state = env.reset()
episode = 100000
gamma = 0.99
alpha = 0.01
epsilon = 0.4

for episode in range(episode):
    state = env.reset()
    done = False

    while not done:
        if np.random.rand() < epsilon:
            action = np.random.randint(4)
        else:
            action = np.argmax(Q[state])
        next_state, reward, done = env.step(action)

        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        state = next_state

In [22]:
optimal_policy_1 = np.array([np.argmax(q) for q in Q])
print(optimal_policy_1)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 0
 0 0 0 0 0 0 0 0 0 0 0]


In [23]:
def greedy_test(optimal_policy):
    state = env.reset()
    max_step = 100
    tests = 100
    sucsess = 0
    for j in range(tests):
        done = False
        while not done:
            state,reward,done = env.step(optimal_policy[state])
            print(state//12,state%12,reward,done)
            if done:
                if reward == 10:
                    sucsess += 1
                    state = env.reset()
    print(sucsess/tests*100)
greedy_test(optimal_policy_1)

2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
2 1 -1 False
2 2 -1 False
2 3 -1 False
2 4 -1 False
2 5 -1 False
2 6 -1 False
2 7 -1 False
2 8 -1 False
2 9 -1 False
2 10 -1 False
2

## Q learning conclusion

If we look closely We can see that path followed by q learning is a risky but short path. It goes up then goes right across the map along the cliff to reach goal.

This is because Q learing explores a lot in the beging but even during exploration it takes the path with maximum reward so it follows the path along the clif.

Q learning is off-policy it gives the value estimantes for the optimal policy

Here u can see the sucsess rate is 100% but if we use the policy that we trained on we can see the results here

In [24]:
def epsilon_greedy_test(optimal_policy):
    state = env.reset()
    max_step = 100
    tests = 100
    sucsess = 0
    epsilon = 0.4
    for j in range(tests):
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = np.random.randint(4)
            else:
                action = optimal_policy[state]
            state,reward,done = env.step(action)
            if done:
                if reward == 10:
                    sucsess += 1
    print(sucsess/tests*100)
epsilon_greedy_test(optimal_policy_1)

66.0


Running this code multiple times shows that the success rate is different is somewhere near than 50% - 70% on average this is since the Q learing produced the optimal actions for the greedy policy and not the policy that it trained on

In [25]:
# Sarsa
import numpy as np
Q = np.zeros((48,len(env.actions)))
state = env.reset()
episode = 100000
gamma = 0.99
alpha = 0.01
epsilon = 0.2

for episode in range(episode):
    state = env.reset()
    done = False
    if np.random.rand() < epsilon:
            action = np.random.randint(4)
    else:
            action = np.argmax(Q[state])
    while not done:
        next_state, reward, done = env.step(action)

        if np.random.rand() < epsilon:
            action_nxt = np.random.randint(4)
        else:
            action_nxt = np.argmax(Q[next_state])

        Q[state, action] += alpha * (reward + gamma * (Q[next_state][action_nxt]) - Q[state, action])
        #epsilon = max(epsilon*0.99, 0.01)
        state = next_state
        action = action_nxt


In [26]:
optimal_policy_2 = np.array([np.argmax(q) for q in Q])

In [27]:
greedy_test(optimal_policy_2)

2 0 -1 False
1 0 -1 False
0 0 -1 False
0 1 -1 False
0 2 -1 False
0 3 -1 False
0 4 -1 False
0 5 -1 False
0 6 -1 False
0 7 -1 False
0 8 -1 False
0 9 -1 False
0 10 -1 False
0 11 -1 False
1 11 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
1 0 -1 False
0 0 -1 False
0 1 -1 False
0 2 -1 False
0 3 -1 False
0 4 -1 False
0 5 -1 False
0 6 -1 False
0 7 -1 False
0 8 -1 False
0 9 -1 False
0 10 -1 False
0 11 -1 False
1 11 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
1 0 -1 False
0 0 -1 False
0 1 -1 False
0 2 -1 False
0 3 -1 False
0 4 -1 False
0 5 -1 False
0 6 -1 False
0 7 -1 False
0 8 -1 False
0 9 -1 False
0 10 -1 False
0 11 -1 False
1 11 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
1 0 -1 False
0 0 -1 False
0 1 -1 False
0 2 -1 False
0 3 -1 False
0 4 -1 False
0 5 -1 False
0 6 -1 False
0 7 -1 False
0 8 -1 False
0 9 -1 False
0 10 -1 False
0 11 -1 False
1 11 -1 False
2 11 -1 False
3 11 10 True
2 0 -1 False
1 0 -1 False
0 0 -1 False
0 1 -1 False
0 2 -1 False
0 3 -1 False
0 4 -1 False
0 5 -1 Fa

In [28]:
epsilon_greedy_test(optimal_policy_2)

88.0


As you can see here the success rate of SARSA in thr environment which it was trained on is close to 90% since it follows the safer path