In [1]:
import numpy as np
import gymnasium as gym 
import random

In [2]:
env_headless = gym.make('FrozenLake-v1',is_slippery=False)
env_graphic = gym.make("FrozenLake-v1", render_mode="human", is_slippery=False)
env = env_headless

In [3]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n
env.action_space, env.observation_space

(Discrete(4), Discrete(16))

In [4]:
qtable = np.zeros((state_space_size, action_space_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
total_episodes = 5000
learning_rate = 0.2
max_steps = 100
gamma = 0.9
epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.001
total_runs = 0

## Q-Learning

In [6]:
for blah in range(1):
    rewards = []
    for episode in range(total_episodes):
        if episode % 500 == 0:
            print("Progress at ", episode)
            env = env_graphic
        else:
            env = env_headless
        state = env.reset()[0]
        step = 0
        done = False
        total_rewards = 0
        total_runs = total_runs+1
        for step in range(max_steps):
            if random.uniform(0,1) > epsilon:
                action = np.argmax(qtable[state,:])
            else:
                action = env.action_space.sample()
    
            new_state, reward, done, info, extra = env.step(action)
    
    #        print(state, new_state, reward, done)
            
            max_new_state = np.max(qtable[new_state, :])
            
            qtable[state,action] = (1 - learning_rate) * qtable[state,action] + learning_rate * (reward + gamma * max_new_state - qtable[state, action])
            total_rewards += reward
            state = new_state
        
            if done:
                rewards.append(total_rewards)
                break
    
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
            
    print("Total runs: ", total_runs, "Score:", str(sum(rewards)/total_episodes))
#print(qtable)
#total_runs

Progress at  0
Progress at  500
Progress at  1000
Progress at  1500
Progress at  2000
Progress at  2500
Progress at  3000
Progress at  3500
Progress at  4000
Progress at  4500
Total runs:  5000 Score: 0.7594


In [8]:
env = env_graphic
env.reset()

for episode in range(1):
    state = env.reset()[0]
    step = 0
    done = False

    print("Episode:", episode+1)
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info, extra = env.step(action)
        if done:
            print("num of steps", step)
            break;
        state = new_state


Episode: 1


## SARSA 

In [6]:
qtable = np.zeros((state_space_size, action_space_size))

for blah in range(1):
    rewards = []
    for episode in range(total_episodes):
        if episode % 200 == 0:
            print("Progress at ", episode)
            env = env_graphic
        else:
            env = env_headless
        state = env.reset()[0]
        step = 0
        done = False
        total_rewards = 0
        total_runs = total_runs+1
        for step in range(max_steps):
            if random.uniform(0,1) > epsilon:
                action = np.argmax(qtable[state,:])
            else:
                action = env.action_space.sample()

            
            if random.uniform(0,1) > epsilon:
                new_action = np.argmax(qtable[state,:])
            else:
                new_action = env.action_space.sample()

            new_state, reward, done, info, extra = env.step(action)

            if random.uniform(0,1) > epsilon:
                new_action = np.argmax(qtable[new_state,new_action])
            else:
                new_action = env.action_space.sample()

    #        print(state, new_state, reward, done)
            
            sarsa_new_state = np.max(qtable[new_state, :])
            
            qtable[state,action] = (1 - learning_rate) * qtable[state,action] + learning_rate * (reward + gamma * sarsa_new_state - qtable[state, action])
            total_rewards += reward
            state = new_state
        
            if done:
                rewards.append(total_rewards)
                break
    
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
            
    print("Total runs: ", total_runs, "Score:", str(sum(rewards)/total_episodes))
print(qtable)

Progress at  0
Progress at  200
Progress at  400
Progress at  600
Progress at  800
Progress at  1000
Progress at  1200
Progress at  1400
Progress at  1600
Progress at  1800
Progress at  2000
Progress at  2200
Progress at  2400
Progress at  2600
Progress at  2800
Progress at  3000
Progress at  3200
Progress at  3400
Progress at  3600
Progress at  3800
Progress at  4000
Progress at  4200
Progress at  4400
Progress at  4600
Progress at  4800
Total runs:  5000 Score: 0.7624
[[0.00415188 0.00922641 0.00922641 0.00415188]
 [0.00415188 0.         0.02050313 0.00922641]
 [0.00922641 0.0455625  0.00922605 0.02050312]
 [0.02050301 0.         0.00904469 0.00915977]
 [0.00922641 0.02050313 0.         0.00415188]
 [0.         0.         0.         0.        ]
 [0.         0.10125    0.         0.02050312]
 [0.         0.         0.         0.        ]
 [0.02050313 0.         0.0455625  0.00922641]
 [0.02050313 0.10125    0.10125    0.        ]
 [0.0455625  0.225      0.         0.0455625 ]
 [0.    

In [7]:
env_headless.close()
env_graphic.close()