In [1]:
import numpy as np
import gymnasium as gym
import pickle as pkl

In [2]:
cliffEnv = gym.make('CliffWalking-v0', render_mode="ansi")
print("Observation Space:", cliffEnv.observation_space)
print("Action Space:", cliffEnv.action_space)

Observation Space: Discrete(48)
Action Space: Discrete(4)


In [3]:
print(cliffEnv.reset())
print(cliffEnv.step(0))

(36, {'prob': 1})
(24, -1, False, False, {'prob': 1.0})


### Random Agent

In [4]:
state, info = cliffEnv.reset()
done = False

for i in range(10):
    action = cliffEnv.action_space.sample()
    state, reward, terminated, truncated, info = cliffEnv.step(action)
    print(cliffEnv.render())
    if terminated or truncated:
        done = True
        print("State:", state)
        print("Reward:", reward)

cliffEnv.close()

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T


o  o  

### SARSA

In [5]:
cliffEnv = gym.make('CliffWalking-v0')

EPSILON = 0.1
GAMMA = 0.9
ALPHA = 0.1
NUM_EPISODES = 500


def greedy_policy(state, epsilon):
    if np.random.random() <= epsilon:
        return cliffEnv.action_space.sample()
    else:
        return np.argmax(Q_TABLE[state])
    

Q_TABLE = np.zeros([cliffEnv.observation_space.n, cliffEnv.action_space.n])

for episode in range(NUM_EPISODES):
    state, info = cliffEnv.reset()
    action = greedy_policy(state, EPSILON)
    done = False

    total_reward = 0
    episode_len = 0

    while not done:
        next_state, reward, terminated, truncated, info = cliffEnv.step(action)
        next_action = greedy_policy(next_state, EPSILON)

        Q_TABLE[state][action] = Q_TABLE[state][action] + ALPHA * (reward + GAMMA * Q_TABLE[next_state][next_action] - Q_TABLE[state][action])

        state = next_state
        action = next_action

        total_reward += reward
        episode_len += 1

        done = terminated or truncated

    print(f"Episode {episode}: Total Reward: {total_reward}, Episode Length: {episode_len}")

pkl.dump(Q_TABLE, open("models/SARSA_q_table.pkl", "wb"))
print("Training Completed!")

cliffEnv.close()

Episode 0: Total Reward: -111, Episode Length: 111
Episode 1: Total Reward: -2186, Episode Length: 899
Episode 2: Total Reward: -618, Episode Length: 222
Episode 3: Total Reward: -143, Episode Length: 143
Episode 4: Total Reward: -208, Episode Length: 208
Episode 5: Total Reward: -315, Episode Length: 216
Episode 6: Total Reward: -85, Episode Length: 85
Episode 7: Total Reward: -233, Episode Length: 134
Episode 8: Total Reward: -69, Episode Length: 69
Episode 9: Total Reward: -667, Episode Length: 469
Episode 10: Total Reward: -137, Episode Length: 137
Episode 11: Total Reward: -200, Episode Length: 101
Episode 12: Total Reward: -208, Episode Length: 109
Episode 13: Total Reward: -447, Episode Length: 249
Episode 14: Total Reward: -54, Episode Length: 54
Episode 15: Total Reward: -337, Episode Length: 238
Episode 16: Total Reward: -23, Episode Length: 23
Episode 17: Total Reward: -403, Episode Length: 205
Episode 18: Total Reward: -65, Episode Length: 65
Episode 19: Total Reward: -123,

In [6]:
cliffEnv = gym.make('CliffWalking-v0', render_mode="human")

Q_TABLE = pkl.load(open("models/SARSA_q_table.pkl", "rb"))

NUM_EPISODES = 10

for episode in range(NUM_EPISODES):
    reward = 0
    episode_len = 0
    
    state, info = cliffEnv.reset()
    done = False

    while not done:
        action = np.argmax(Q_TABLE[state])
        state, reward, terminated, truncated, info = cliffEnv.step(action)

        total_reward += reward
        episode_len += 1

        done = terminated or truncated

    print(f"Episode {episode}: Total Reward: {total_reward}, Episode Length: {episode_len}")
    
cliffEnv.close()

Episode 0: Total Reward: -36, Episode Length: 17
Episode 1: Total Reward: -53, Episode Length: 17
Episode 2: Total Reward: -70, Episode Length: 17
Episode 3: Total Reward: -87, Episode Length: 17
Episode 4: Total Reward: -104, Episode Length: 17
Episode 5: Total Reward: -121, Episode Length: 17
Episode 6: Total Reward: -138, Episode Length: 17
Episode 7: Total Reward: -155, Episode Length: 17
Episode 8: Total Reward: -172, Episode Length: 17
Episode 9: Total Reward: -189, Episode Length: 17


### Q-Learning

In [7]:
cliffEnv = gym.make('CliffWalking-v0')

EPSILON = 0.1
GAMMA = 0.9
ALPHA = 0.1
NUM_EPISODES = 500


def greedy_policy(state, epsilon=0.0):
    if np.random.random() <= epsilon:
        return cliffEnv.action_space.sample()
    else:
        return np.argmax(Q_TABLE[state])
    

Q_TABLE = np.zeros([cliffEnv.observation_space.n, cliffEnv.action_space.n])

for episode in range(NUM_EPISODES):
    state, info = cliffEnv.reset()
    action = greedy_policy(state, EPSILON)
    done = False

    total_reward = 0
    episode_len = 0

    while not done:
        action = greedy_policy(state, EPSILON)
        next_state, reward, terminated, truncated, info = cliffEnv.step(action)
        next_action = greedy_policy(next_state)

        Q_TABLE[state][action] = Q_TABLE[state][action] + ALPHA * (reward + GAMMA * Q_TABLE[next_state][next_action] - Q_TABLE[state][action])

        state = next_state

        total_reward += reward
        episode_len += 1

        done = terminated or truncated

    print(f"Episode {episode}: Total Reward: {total_reward}, Episode Length: {episode_len}")

pkl.dump(Q_TABLE, open("models/Qlearning_q_table.pkl", "wb"))
print("Training Completed!")

cliffEnv.close()

Episode 0: Total Reward: -587, Episode Length: 191
Episode 1: Total Reward: -2283, Episode Length: 897
Episode 2: Total Reward: -250, Episode Length: 250
Episode 3: Total Reward: -261, Episode Length: 162
Episode 4: Total Reward: -276, Episode Length: 276
Episode 5: Total Reward: -37, Episode Length: 37
Episode 6: Total Reward: -833, Episode Length: 338
Episode 7: Total Reward: -220, Episode Length: 121
Episode 8: Total Reward: -213, Episode Length: 114
Episode 9: Total Reward: -124, Episode Length: 124
Episode 10: Total Reward: -441, Episode Length: 243
Episode 11: Total Reward: -119, Episode Length: 119
Episode 12: Total Reward: -42, Episode Length: 42
Episode 13: Total Reward: -461, Episode Length: 164
Episode 14: Total Reward: -112, Episode Length: 112
Episode 15: Total Reward: -336, Episode Length: 138
Episode 16: Total Reward: -127, Episode Length: 127
Episode 17: Total Reward: -55, Episode Length: 55
Episode 18: Total Reward: -144, Episode Length: 144
Episode 19: Total Reward: -

In [2]:
cliffEnv = gym.make('CliffWalking-v0', render_mode="human")

Q_TABLE = pkl.load(open("models/Qlearning_q_table.pkl", "rb"))

NUM_EPISODES = 10

for episode in range(NUM_EPISODES):
    total_reward = 0
    episode_len = 0
    
    state, info = cliffEnv.reset()
    done = False

    while not done:
        action = np.argmax(Q_TABLE[state])
        state, reward, terminated, truncated, info = cliffEnv.step(action)

        total_reward += reward
        episode_len += 1

        done = terminated or truncated

    print(f"Episode {episode}: Total Reward: {total_reward}, Episode Length: {episode_len}")
    
cliffEnv.close()

Episode 0: Total Reward: -13, Episode Length: 13
Episode 1: Total Reward: -13, Episode Length: 13
Episode 2: Total Reward: -13, Episode Length: 13
Episode 3: Total Reward: -13, Episode Length: 13
Episode 4: Total Reward: -13, Episode Length: 13
Episode 5: Total Reward: -13, Episode Length: 13
Episode 6: Total Reward: -13, Episode Length: 13
Episode 7: Total Reward: -13, Episode Length: 13
Episode 8: Total Reward: -13, Episode Length: 13
Episode 9: Total Reward: -13, Episode Length: 13
