In [1]:
import gymnasium as gym
import gridworld
import numpy as np

def map_to_state(grid_position, grid) -> int:
    """Flatten a 2-D grid position into a single state index (row-major).

    Args:
        grid_position: indexable whose first two entries are (row, column).
        grid: indexable whose second entry is the number of grid columns.

    Returns:
        ``row * number_of_columns + column``.
    """
    n_cols = grid[1]
    row = grid_position[0]
    col = grid_position[1]
    return row * n_cols + col

def mu_policy(Qs, epsilon) -> int:
    """Epsilon-greedy action selection.

    Args:
        Qs: action values of the current state, one entry per action.
        epsilon: probability of picking a uniformly random action.

    Returns:
        Index of the selected action.
    """
    n_actions = len(Qs)

    # One uniform draw decides between exploring and exploiting.
    explore = np.random.rand() < epsilon

    if not explore:
        # Greedy action
        return pi_policy(Qs)

    # Random action
    return np.random.randint(0, n_actions)

# The pi policy is purely GREEDY: it always picks the action with the highest Q(s, a)
def pi_policy(Qs):
    """Return the index of a maximal entry of ``Qs``.

    When several entries share the maximum, one of them is chosen uniformly
    at random; a unique maximum is returned without drawing a random number.
    """
    best_value = np.max(Qs)
    best_actions = np.where(Qs == best_value)[0]

    if len(best_actions) <= 1:
        return best_actions[0]

    # Break ties at random
    return np.random.choice(best_actions)

In [5]:
env = gym.make("Cliff-v0")

# --- Hyperparameters ---
max_epsilon = 1.0       # Exploration probability at start
min_epsilon = 0.0       # Minimum exploration probability (start exploiting)
decay_rate = 0.005      # Exponential decay rate for exploration prob
num_episodes = 1000     # Number of training episodes
gamma = 0.99            # Discount factor applied to future rewards

# Counter of episodes that ended with `terminated` and a reward of -1
goal = 0

# Grid dimensions
# NOTE(review): assumed to match the layout of the "Cliff-v0" grid - confirm.
max_rows = 4
max_cols = 12
grid = [max_rows, max_cols]

# Number of actions tracked per state.
# NOTE(review): presumably equals the environment's action count - confirm.
n_actions = 4

# Turn every grid position into its state index. The array is built in one
# go instead of being grown with np.append, which copies it on every call.
states = np.array(
    [map_to_state([i, j], grid) for i in range(max_rows) for j in range(max_cols)],
    dtype=int,
)

# Action-value table Q[s][a] and visit counter N[s][a] for every state
Q = {s: np.zeros(n_actions) for s in states}
N = {s: np.zeros(n_actions) for s in states}
    
# Monte Carlo control: roll out one episode with the epsilon-greedy policy,
# then update Q with the discounted return observed after every visited
# (state, action) pair. The environment is closed even if a step raises.
try:
    for ep in range(1, num_episodes + 1):

        # Exponentially decayed exploration rate, never below min_epsilon.
        epsilon = max(min_epsilon, max_epsilon * np.exp(-decay_rate * ep))
        curr_cell, info = env.reset()
        curr_state = map_to_state(curr_cell, grid)
        done = False
        steps = 0

        # Episode trajectory. Plain lists are used because growing a NumPy
        # array with np.append copies the whole array at every step.
        state_seq = []
        action_seq = []
        reward_seq = []

        while not done:

            action = mu_policy(Q[curr_state], epsilon)
            next_cell, reward, terminated, truncated, info = env.step(action)
            if terminated and reward == -1:
                goal += 1

            state_seq.append(curr_state)
            action_seq.append(action)
            reward_seq.append(reward)

            curr_state = map_to_state(next_cell, grid)
            steps += 1

            done = terminated or truncated

        if ep % 100 == 0:
            print(ep)
            print(epsilon)
            print("Goal reached %d times" % goal)

        # Episodes cut short by truncation do not update Q.
        if not truncated:
            # Discounted return from every time step, computed in a single
            # backward pass: G_t = r_t + gamma * G_{t+1}. This replaces
            # re-summing the whole reward tail for each t (quadratic in the
            # episode length); values agree up to floating-point rounding.
            returns = np.zeros(len(reward_seq))
            Gt = 0.0
            for i in reversed(range(len(reward_seq))):
                Gt = reward_seq[i] + gamma * Gt
                returns[i] = Gt

            # Every-visit update: incremental mean of the observed returns.
            for i, (s, a) in enumerate(zip(state_seq, action_seq)):
                N[s][a] += 1
                Gt = returns[i]
                Q[s][a] = Q[s][a] + (Gt - Q[s][a]) / N[s][a]
finally:
    env.close()



100
0.6065306597126334
Goal reached 1 times
200
0.36787944117144233
Goal reached 33 times
300
0.22313016014842982
Goal reached 102 times
400
0.1353352832366127
Goal reached 184 times
500
0.0820849986238988
Goal reached 272 times
600
0.049787068367863944
Goal reached 367 times
700
0.0301973834223185
Goal reached 464 times
800
0.01831563888873418
Goal reached 562 times
900
0.011108996538242306
Goal reached 662 times
1000
0.006737946999085467
Goal reached 762 times


In [6]:
# Replay the learned Q table greedily for a few rendered episodes.
env = gym.make("Cliff-v0", render_mode='human')
try:
    for ep in range(5):

        curr_cell, info = env.reset()
        curr_state = map_to_state(curr_cell, grid)
        done = False
        steps = 0

        while not done:

            # epsilon = 0 disables exploration, so the action is always greedy.
            action = mu_policy(Q[curr_state], 0)
            next_cell, reward, terminated, truncated, info = env.step(action)

            curr_state = map_to_state(next_cell, grid)
            # Same goal test as the training loop (termination with a -1
            # reward); the earlier `reward == 0` check disagreed with it.
            # NOTE(review): confirm against the Cliff-v0 reward definition.
            if terminated and reward == -1:
                print("GOAL REACHED!")

            steps += 1

            done = terminated or truncated
finally:
    # Release the render window even when an episode raises.
    env.close()