### Run in Colab
<a href="https://colab.research.google.com/github/racousin/data_science_practice/blob/master/website/public/modules/data-science-practice/module9/exercise/module9_exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install swig==4.2.1
!pip install gymnasium==1.2.0

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

# module9_exercise2 : ML - Arena <a href="https://ml-arena.com/viewcompetition/5" target="_blank"> FrozenLake Competition</a>


  https://gymnasium.farama.org/environments/toy_text/frozen_lake/

  The observation is the agent's position, encoded as current_row * ncols + current_col.
  The reward is 1 when the agent reaches the goal and 0 otherwise.

  action :  0: Move left
            1: Move down
            2: Move right
            3: Move up
            If the agent is on an edge (e.g. the left edge) and selects a move off the grid, it does not move, unless the random sliding pushes it in another direction.

### Objective
Get at least one agent running on the ML-Arena <a href="https://ml-arena.com/viewcompetition/5" target="_blank"> FrozenLake Competition</a> with a mean reward higher than 0.35 (i.e. 35%).


You should submit an agent file named `agent.py` with a class `Agent` that includes at least the following attributes:

In [47]:
def learn_Q(
    env,
    episodes=100000,
    learning_rate=0.1,
    gamma=0.99,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.01,
    verbose=True,
    Q=None
):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        env: Gymnasium-style environment whose ``observation_space`` and
            ``action_space`` both expose ``.n`` (discrete spaces).
        episodes: Number of training episodes to run.
        learning_rate: Step size applied to each temporal-difference update.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each episode.
        epsilon_min: Lower bound for ``epsilon``.
        verbose: When true, print the running average reward every 1000 episodes.
        Q: Optional Q-table to warm-start from. It is copied, so the caller's
            array is never modified.

    Returns:
        A tuple ``(Q, mean_reward)`` where ``Q`` is the learned table and
        ``mean_reward`` is the mean episode reward over the last (up to) 1000
        episodes.

    Note:
        ``env.close()`` is called before returning.
    """
    if Q is None:
        Q = np.zeros((env.observation_space.n, env.action_space.n))
    else:
        # Copy as float so the in-place updates below neither touch the caller's
        # table nor get truncated by an integer dtype.
        Q = np.array(Q, dtype=float)

    rewards = []

    for episode in range(episodes):
        episode_reward = 0

        observation, _ = env.reset()
        terminated = truncated = False

        while not (terminated or truncated):
            # Epsilon-greedy action selection.
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[observation])

            next_observation, action_reward, terminated, truncated, _ = env.step(action)

            # Only a true terminal state has zero future value. When the episode
            # is merely truncated (time limit), the next state is still a regular
            # state, so the target must keep bootstrapping from it.
            if terminated:
                target = action_reward
            else:
                target = action_reward + gamma * np.max(Q[next_observation])
            Q[observation, action] += learning_rate * (target - Q[observation, action])

            observation = next_observation
            episode_reward += action_reward

        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        rewards.append(episode_reward)

        if verbose and (episode + 1) % 1000 == 0:
            avg_reward = np.mean(rewards[-1000:])
            print(f"episode={episode + 1} reward={avg_reward:.3f} (epsilon={epsilon:.3f})")

    env.close()
    return Q, np.mean(rewards[-1000:])

In [37]:
import itertools

def optimize_learn_Q(
    env,
    episodes=10000,
    *,
    learning_rate,
    gamma,
    epsilon,
    epsilon_decay,
    epsilon_min,
    verbose=True
):
    """Grid-search Q-learning hyperparameters with ``learn_Q``.

    Every combination of the supplied candidate values is trained from scratch
    for ``episodes`` episodes, and the combination with the highest reward
    reported by ``learn_Q`` is kept.

    Args:
        env: Environment forwarded to ``learn_Q`` for every candidate.
        episodes: Number of training episodes per candidate.
        learning_rate: Iterable of candidate learning rates.
        gamma: Iterable of candidate discount factors.
        epsilon: Iterable of candidate initial exploration rates.
        epsilon_decay: Iterable of candidate per-episode epsilon decay factors.
        epsilon_min: Iterable of candidate epsilon lower bounds.
        verbose: When true, print the reward and parameters of every candidate.

    Returns:
        A tuple ``(best_Q, best_reward, best_params)``. If the grid is empty the
        result is ``(None, 0, None)``.
    """
    best_params = None
    best_reward = 0
    best_Q = None

    grid = itertools.product(learning_rate, gamma, epsilon, epsilon_decay, epsilon_min)
    for candidate_learning_rate, candidate_gamma, candidate_epsilon, candidate_epsilon_decay, candidate_epsilon_min in grid:
        params = dict(
            learning_rate=candidate_learning_rate,
            gamma=candidate_gamma,
            epsilon=candidate_epsilon,
            epsilon_decay=candidate_epsilon_decay,
            epsilon_min=candidate_epsilon_min
        )

        Q, reward = learn_Q(
            env,
            episodes=episodes,
            verbose=False,
            **params
        )

        # Always accept the first candidate: with a sparse reward every run can
        # score exactly 0, and a strict "> 0" test alone would then return no
        # Q-table and no parameters at all.
        if best_params is None or reward > best_reward:
            best_reward = reward
            best_Q = Q
            best_params = params

        if verbose:
            print(f"reward={reward:.4f} {repr(params)}")

    return best_Q, best_reward, best_params


In [38]:
def eval_Q(
    env,
    Q,
    n=100,
    lives=10
):
    """Score the greedy policy derived from a Q-table.

    A run consists of ``lives`` consecutive attempts; the score of a run is the
    sum of all rewards collected across its attempts. ``n`` runs are played.

    Args:
        env: Gymnasium-style environment to play in. It is closed on return.
        Q: Q-table indexed as ``Q[observation]`` giving one value per action.
        n: Number of runs to play.
        lives: Number of attempts per run.

    Returns:
        A tuple ``(mean, std)`` of the per-run scores.
    """
    run_scores = []

    for _run in range(n):
        score = 0

        for _attempt in range(lives):
            state, _ = env.reset()
            finished = False

            while not finished:
                # Always exploit: pick the highest-valued action for this state.
                greedy_action = np.argmax(Q[state])
                state, step_reward, terminated, truncated, _ = env.step(greedy_action)
                score += step_reward
                finished = terminated or truncated

        run_scores.append(score)

    env.close()

    return np.mean(run_scores), np.std(run_scores)



In [45]:
# Training environment: FrozenLake on the 8x8 map.
env = gym.make("FrozenLake-v1", map_name="8x8")

# Earlier long single run, kept for reference (disabled).
# Note: learn_Q returns a (Q, reward) tuple, so this line would need unpacking.
# Q = learn_Q(env, episodes=200000, epsilon_decay=0.999975)

# Coarse grid search over learning rate, discount factor and epsilon decay (disabled).
# Q, reward, params = optimize_learn_Q(
#     env,
#     learning_rate=[0.05, 0.1, 0.15, 0.2],
#     gamma=[0.9, 0.95, 0.99],
#     epsilon=[1.0],
#     epsilon_decay=[0.9999, 0.99995, 0.999975],
#     epsilon_min=[0.01],
# )

# Finer search over the learning rate only, with gamma=0.97 and
# epsilon_decay=0.9999 held fixed (disabled).
# Q, reward, params = optimize_learn_Q(
#     env,
#     learning_rate=[0.01, 0.025, 0.05, 0.075],
#     gamma=[0.97],
#     epsilon=[1.0],
#     epsilon_decay=[0.9999],
#     epsilon_min=[0.01],
# )

# Training run actually used: 50,000 episodes from an empty Q-table.
Q, reward = learn_Q(env, episodes=50000, epsilon_decay=0.9999, learning_rate=0.075, gamma=0.97)


episode=1000 reward=0.003 (epsilon=0.905)
episode=2000 reward=0.003 (epsilon=0.819)
episode=3000 reward=0.009 (epsilon=0.741)
episode=4000 reward=0.011 (epsilon=0.670)
episode=5000 reward=0.016 (epsilon=0.607)
episode=6000 reward=0.017 (epsilon=0.549)
episode=7000 reward=0.038 (epsilon=0.497)
episode=8000 reward=0.035 (epsilon=0.449)
episode=9000 reward=0.069 (epsilon=0.407)
episode=10000 reward=0.065 (epsilon=0.368)
episode=11000 reward=0.090 (epsilon=0.333)
episode=12000 reward=0.114 (epsilon=0.301)
episode=13000 reward=0.130 (epsilon=0.273)
episode=14000 reward=0.118 (epsilon=0.247)
episode=15000 reward=0.168 (epsilon=0.223)
episode=16000 reward=0.197 (epsilon=0.202)
episode=17000 reward=0.206 (epsilon=0.183)
episode=18000 reward=0.189 (epsilon=0.165)
episode=19000 reward=0.235 (epsilon=0.150)
episode=20000 reward=0.228 (epsilon=0.135)
episode=21000 reward=0.277 (epsilon=0.122)
episode=22000 reward=0.312 (epsilon=0.111)
episode=23000 reward=0.294 (epsilon=0.100)
episode=24000 reward

In [46]:
# Greedy evaluation with the eval_Q defaults: mean and std of the total reward
# over 100 runs of 10 attempts each.
eval_Q(env, Q)

(np.float64(3.93), np.float64(1.5378881623837282))

In [49]:
# Keep a handle on the table from the previous run, then continue training from
# it with a lower starting epsilon (0.25). learn_Q works on a copy of the table
# passed in, so best_Q itself is left unchanged.
# Note: re-running this cell rebinds best_Q to the latest Q, so each re-run
# continues training from the previous result.
best_Q = Q

Q, reward = learn_Q(env, episodes=50000, epsilon=0.25, epsilon_decay=0.9999, learning_rate=0.075, gamma=0.97, Q=best_Q)

episode=1000 reward=0.166 (epsilon=0.226)
episode=2000 reward=0.141 (epsilon=0.205)
episode=3000 reward=0.174 (epsilon=0.185)
episode=4000 reward=0.211 (epsilon=0.168)
episode=5000 reward=0.275 (epsilon=0.152)
episode=6000 reward=0.235 (epsilon=0.137)
episode=7000 reward=0.288 (epsilon=0.124)
episode=8000 reward=0.243 (epsilon=0.112)
episode=9000 reward=0.281 (epsilon=0.102)
episode=10000 reward=0.296 (epsilon=0.092)
episode=11000 reward=0.322 (epsilon=0.083)
episode=12000 reward=0.295 (epsilon=0.075)
episode=13000 reward=0.395 (epsilon=0.068)
episode=14000 reward=0.343 (epsilon=0.062)
episode=15000 reward=0.378 (epsilon=0.056)
episode=16000 reward=0.396 (epsilon=0.050)
episode=17000 reward=0.422 (epsilon=0.046)
episode=18000 reward=0.391 (epsilon=0.041)
episode=19000 reward=0.452 (epsilon=0.037)
episode=20000 reward=0.407 (epsilon=0.034)
episode=21000 reward=0.365 (epsilon=0.031)
episode=22000 reward=0.422 (epsilon=0.028)
episode=23000 reward=0.482 (epsilon=0.025)
episode=24000 reward

In [55]:
# Re-evaluate the greedy policy after the continued training run.
eval_Q(env, Q)

(np.float64(5.68), np.float64(1.5612815249018992))

In [51]:
# Persist the Q-table; the Agent class reads this file back with np.load("Q.npy").
np.save("Q.npy", Q)

In [56]:
import numpy as np

class Agent:
    """Greedy agent backed by a Q-table stored on disk.

    The table is read from ``Q.npy`` (relative to the current working
    directory) when the agent is created.
    """

    def __init__(
        self,
        env,
    ):
        """Store the environment and load the Q-table from ``Q.npy``."""
        self.env = env
        self.Q = np.load("Q.npy")

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info = None):
        """Return the index of the highest-valued action for ``observation``.

        ``reward``, ``terminated``, ``truncated`` and ``info`` are accepted but
        not used when choosing the action.
        """
        action_values = self.Q[observation]
        return np.argmax(action_values)



### Description

The game starts with the player at location [0,0] of the frozen lake grid world, with the goal located at the far extent of the world, [7,7].

Holes in the ice are distributed in set locations.

The player makes moves until they reach the goal or fall in a hole.

Each run will consist of 10 attempts to cross the ice. The reward will be the total amount accumulated during those trips. For example, if your agent reaches the goal 3 times out of 10, its reward will be 3.

The environment is based on :

In [6]:
# FrozenLake-v1 on the 8x8 map (see the Gymnasium FrozenLake page linked at the top of the notebook).
env = gym.make('FrozenLake-v1', map_name="8x8")

### Before submitting
Test that your agent has the right attributes

In [59]:
# Smoke test: play a single attempt with the Agent to check that its interface works.
env = gym.make('FrozenLake-v1', map_name="8x8")

# Agent.__init__ reads Q.npy from the current working directory.
agent = Agent(env)

observation, _ = env.reset()
# No step has been taken yet, so the first choose_action call receives
# reward=None and info=None.
reward, terminated, truncated, info = None, False, False, None
rewards = []
while not (terminated or truncated):
    action = agent.choose_action(observation, reward=reward, terminated=terminated, truncated=truncated, info=info)
    # Trace of the chosen actions (0: left, 1: down, 2: right, 3: up).
    print(action)
    observation, reward, terminated, truncated, info = env.step(action)
    rewards.append(reward)
print(f'Cumulative Reward: {sum(rewards)}')

1
2
2
2
3
2
2
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
2
2
2
2
2
2
2
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
2
1
1
1
2
1
1
2
1
2
2
2
2
2
2
2
Cumulative Reward: 1.0
