In [1]:
import gymnasium as gym
import gridworld
import numpy as np

def map_to_state(grid_position, grid)-> int:
    """Flatten a (row, column) grid cell into a single row-major state index.

    Parameters
    ----------
    grid_position : sequence
        Cell coordinates; element 0 is the row, element 1 is the column.
    grid : sequence
        Grid dimensions; only element 1 (the number of columns) is used.

    Returns
    -------
    int
        ``row * number_of_columns + column``.
    """
    row_width = grid[1]
    row_index = grid_position[0]
    col_index = grid_position[1]
    # Row-major layout: every full row above contributes `row_width` states.
    return row_index * row_width + col_index

def mu_policy(Qs, epsilon) -> int:
    """Epsilon-greedy behaviour policy over the action values of one state.

    Parameters
    ----------
    Qs : sequence
        Action values of the current state, one entry per action.
    epsilon : float
        Probability of picking a uniformly random action instead of the
        greedy one.

    Returns
    -------
    int
        Index of the selected action.
    """
    n_actions = len(Qs)

    # A single uniform draw decides between exploring and exploiting.
    explore = np.random.rand() < epsilon
    if not explore:
        # Exploit: defer to the greedy policy.
        return pi_policy(Qs)

    # Explore: any action index in [0, n_actions) with equal probability.
    return np.random.randint(0, n_actions)

# The pi policy is purely GREEDY: it always picks the action with the highest Q(s,a) value
def pi_policy(Qs):
    """Greedy policy: return the index of the largest action value.

    Parameters
    ----------
    Qs : array-like
        Action values of a single state, one entry per action. NumPy arrays
        as well as plain Python lists/tuples are accepted.

    Returns
    -------
    integer
        Index of a maximal entry of ``Qs`` (a NumPy integer). When several
        entries share the maximum, one of them is drawn uniformly at random
        with ``np.random.choice``; with a unique maximum no random number is
        consumed.

    Raises
    ------
    ValueError
        If ``Qs`` is empty (raised by ``np.max``).
    """
    # Coerce to an array so the element-wise comparison below also works for
    # lists/tuples, where `Qs == max` would collapse to a single bool and no
    # index would ever be found.
    q_values = np.asarray(Qs)
    best_value = np.max(q_values)
    best_actions = np.where(q_values == best_value)[0]

    if len(best_actions) > 1:
        # Random tie-breaking keeps the agent from always favouring the
        # lowest-numbered action among equally valued ones.
        action = np.random.choice(best_actions)
    else:
        action = best_actions[0]
    return action

In [2]:
# Monte Carlo control (every-visit, incremental mean) with an epsilon-greedy
# behaviour policy whose epsilon decays exponentially over the episodes.

max_epsilon = 1.0                  # Exploration probability at start
min_epsilon = 0.05                 # Minimum exploration probability (start exploiting)
decay_rate = 0.00005               # Exponential decay rate for exploration prob

gamma = 0.99                       # Discount factor used for the returns
num_learning_episodes = 100000
num_show_episodes = 20
num_actions = 4                    # Size of every Q[s] / N[s] vector
goal = 0                           # Training episodes counted as "goal reached"

# Seed NumPy's global generator so the exploration draws (and therefore the
# learned Q-table) are repeatable. Any randomness inside the environment
# itself is not seeded here.
seed = 42
np.random.seed(seed)

# Grid dimensions
max_rows = 4
max_cols = 12
grid = [max_rows, max_cols]

# State ids of every grid cell, in row-major order
states = np.array(
    [map_to_state([i, j], grid) for i in range(max_rows) for j in range(max_cols)],
    dtype=int,
)

# Action-value estimates and visit counters, one vector per state
Q = {s: np.zeros(num_actions) for s in states}
N = {s: np.zeros(num_actions) for s in states}


def _discounted_returns(rewards, gamma):
    """Return G_t = sum_k gamma**k * rewards[t + k] for every step t."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Walk the episode backwards so each return reuses the next one:
    # G_t = r_t + gamma * G_{t+1}. One pass instead of one sum per step.
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


env = gym.make("Cliff-v0")
try:
    for ep in range(1, num_learning_episodes + 1):

        epsilon = max(min_epsilon, max_epsilon * np.exp(-decay_rate * ep))
        curr_cell, info = env.reset()
        curr_state = map_to_state(curr_cell, grid)
        done = False
        steps = 0

        # Episode trajectory; plain lists avoid re-allocating an array on
        # every step the way np.append does.
        visited_states = []
        taken_actions = []
        seen_rewards = []

        while not done:

            action = mu_policy(Q[curr_state], epsilon)
            next_cell, reward, terminated, truncated, info = env.step(action)
            # A terminal step with reward -1 is counted as reaching the goal.
            # NOTE(review): presumably other terminal steps (e.g. the cliff)
            # carry a different reward -- confirm against the environment.
            if terminated and reward == -1:
                goal += 1

            visited_states.append(curr_state)
            taken_actions.append(action)
            seen_rewards.append(reward)

            curr_state = map_to_state(next_cell, grid)
            steps += 1

            done = terminated or truncated

        state_seq = np.array(visited_states, dtype=int)
        action_seq = np.array(taken_actions, dtype=int)
        reward_seq = np.array(seen_rewards)

        if ep % 1000 == 0:
            print(ep)
            print(epsilon)
            print("Goal reached %d times" % goal)

        # Episodes whose last step was truncated are not used for learning.
        if not truncated:
            returns = _discounted_returns(reward_seq, gamma)
            for s, a, Gt in zip(state_seq, action_seq, returns):
                N[s][a] += 1
                # Incremental mean of all returns observed for (s, a).
                Q[s][a] = Q[s][a] + (Gt - Q[s][a]) / N[s][a]
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()



1000
0.951229424500714
Goal reached 1 times
2000
0.9048374180359595
Goal reached 8 times
3000
0.8607079764250578
Goal reached 25 times
4000
0.8187307530779818
Goal reached 82 times
5000
0.7788007830714049
Goal reached 213 times
6000
0.7408182206817179
Goal reached 388 times
7000
0.7046880897187134
Goal reached 651 times
8000
0.6703200460356393
Goal reached 960 times
9000
0.6376281516217733
Goal reached 1339 times
10000
0.6065306597126334
Goal reached 1781 times
11000
0.5769498103804866
Goal reached 2251 times
12000
0.5488116360940265
Goal reached 2782 times
13000
0.522045776761016
Goal reached 3365 times
14000
0.49658530379140947
Goal reached 3983 times
15000
0.4723665527410147
Goal reached 4649 times
16000
0.44932896411722156
Goal reached 5350 times
17000
0.42741493194872665
Goal reached 6085 times
18000
0.4065696597405991
Goal reached 6851 times
19000
0.3867410234545012
Goal reached 7630 times
20000
0.36787944117144233
Goal reached 8427 times
21000
0.3499377491111553
Goal reached 921

In [3]:
# Roll out the learned greedy policy with on-screen rendering.
# Relies on `Q` and `grid` produced by the training cell.
env = gym.make("Cliff-v0", render_mode='human')
try:
    # NOTE(review): the training cell defines num_show_episodes = 20 but this
    # loop shows 5 episodes -- confirm which value is intended.
    for ep in range(5):

        curr_cell, info = env.reset()
        curr_state = map_to_state(curr_cell, grid)
        done = False
        steps = 0

        while not done:

            # Pure exploitation: always follow the greedy policy.
            action = pi_policy(Q[curr_state])
            next_cell, reward, terminated, truncated, info = env.step(action)

            curr_state = map_to_state(next_cell, grid)
            # Same goal test as the training cell (terminal step with reward
            # -1); the previous `reward == 0` check disagreed with it.
            # NOTE(review): confirm against the environment's reward scheme.
            if terminated and reward == -1:
                print("GOAL REACHED!")

            steps += 1

            done = terminated or truncated
finally:
    # Always close the render window, even if an episode raises.
    env.close()