In [None]:
import gymnasium as gym
import time
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from dqn_agent import Agent

In [None]:
# Build FrozenLake with slipping disabled so that actions are deterministic.
env = gym.make('FrozenLake-v1', is_slippery=False)

# Report the size (`.n`) of the observation and action spaces.
for label, space in (('State shape: ', env.observation_space),
                     ('Number of actions: ', env.action_space)):
    print(label, space.n)

In [None]:
# Size the agent from the environment's observation and action spaces.
# int() turns the space sizes into plain Python ints before they reach Agent.
state_size, action_size = env.observation_space.n, env.action_space.n
agent = Agent(int(state_size), int(action_size), seed=0)

In [None]:
def preprocess_state(state, state_size):
    """One-hot encode a discrete state index.

    Params
    ======
        state (int): index of the state to encode
        state_size (int): total number of states (length of the encoding)

    Returns
    =======
        Row `state` of the state_size x state_size identity matrix,
        as a float32 numpy array.
    """
    one_hot_rows = np.eye(state_size)
    encoded = one_hot_rows[state]
    return encoded.astype(np.float32)



In [None]:
def dqn(n_episodes=5000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Trains the module-level `agent` on the module-level `env`. Every episode
    is played epsilon-greedily until the environment reports termination or
    truncation, or until `max_t` steps have been taken. Training stops early,
    saving the local Q-network weights to 'checkpoint_frozen_lake.pth', once
    the mean score over a *full* window of the last 100 episodes reaches 1.0.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        scores (list): total reward of every episode played, in order
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        state, _ = env.reset()
        state = preprocess_state(state, agent.state_size)
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            # step() returns (observation, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = preprocess_state(next_state, agent.state_size)
            # The agent's `done` flag is the termination flag only; truncation
            # merely ends the episode loop below.
            agent.step(state, action, reward, next_state, terminated)
            state = next_state
            score += reward
            if terminated or truncated:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        mean_score = np.mean(scores_window)
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))
        # Only judge "solved" on a full window; otherwise a single lucky early
        # episode would end training and report a negative episode count.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and mean_score >= 1.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-scores_window.maxlen, mean_score))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_frozen_lake.pth')
            break
    return scores

# Train the agent and keep the per-episode scores.
scores = dqn()

# Plot each episode's score against its index.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Create a deterministic 4x4 FrozenLake environment with 'human' rendering.
# NOTE: this rebinds the global `env`, which dqn() also reads, so calling
# dqn() after this cell would train on the rendering environment.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=False,
    render_mode='human',
)

In [None]:
# Restore the weights written by dqn() when training met its "solved" criterion.
agent.qnetwork_local.load_state_dict(torch.load('checkpoint_frozen_lake.pth'))

try:
    # Replay three episodes, each capped at 400 steps.
    for i in range(3):
        state, info = env.reset()
        state = preprocess_state(state, agent.state_size)
        for j in range(400):
            action = agent.act(state)
            env.render()
            time.sleep(0.3)  # slow playback down so each move can be followed
            state, reward, terminated, truncated, info = env.step(action)
            state = preprocess_state(state, agent.state_size)
            if terminated or truncated:
                break
finally:
    # Always release the environment, even if playback raises or is interrupted.
    env.close()