In [1]:
import gymnasium as gym
import torch
import torch.optim as optim
import numpy as np
import time
import matplotlib.pyplot as plt

import my_package
from my_package import DQN, ReplayBuffer, select_action, optimize_model, eps_decay, soft_update

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gamma = 0.99
alpha = 0.0001

eps_max = 1.0
eps_min = 0.1
exploration_fraction = 1.0

hidden_layer_dim = 128
target_soft_update = True
tau = 0.001

buffer_size = 100000
batch_size = 256
max_episodes = 10
data_window = 1

In [9]:
env = gym.make('ShipQuest-v0', prox_sensor=True)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

""" Init neural networks """
policy_net = DQN(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    hidden_dim=hidden_layer_dim,
)

target_net = DQN(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    hidden_dim=hidden_layer_dim,
)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=alpha)

""" Init buffer """
buffer = ReplayBuffer(capacity=buffer_size)

""" Data collect variables """
total_rewards = np.zeros(max_episodes)
eps_history = np.zeros(max_episodes)
len_episodes = np.zeros(max_episodes)
loss_history = []
reward_collected_per_ep = np.zeros(max_episodes)
start_time = time.time()
with torch.autograd.profiler.profile(use_device='cuda') as prof:

    for episode in range(max_episodes):
        state, info = env.reset()
        done = False
        epsilon = eps_decay(episode, max_episodes, eps_min, eps_max, exploration_fraction, 'linear')
        eps_history[episode] = epsilon
        
        while not done:
            action = select_action(state, policy_net, epsilon, action_dim)
            next_state, reward, terminated, truncated, info = env.step(action)
            buffer.push(state, action, reward, next_state, terminated)
            done = terminated or truncated
    
            # Ottimizzo policy net e soft update target net
            if len(buffer) > batch_size:
                loss = optimize_model(policy_net, target_net, buffer, optimizer, batch_size, gamma, debug=True)
                if target_soft_update:
                    soft_update(policy_net, target_net, tau)
                # loss_history.append(loss.cpu().detach().numpy())
                loss_history.append(loss.item())
    
            total_rewards[episode] += reward
            len_episodes[episode] += 1
            state = next_state
    
    
        if episode % data_window == 0 and episode != 0:
            ma_reward = np.mean(total_rewards[episode-data_window:episode])
            ma_loss = np.mean(loss_history[-data_window:])
            print(f"Ep {episode}/{max_episodes}, MA Reward: {ma_reward:.2f}, MA loss: {ma_loss:.4f}, Eps: {epsilon:.2f}")
    
    env.close()
print(prof.key_averages().table(sort_by="cuda_time_total"))

# total_time_seconds = time.time() - start_time
# hours = int(total_time_seconds // 3600)
# minutes = int((total_time_seconds % 3600) // 60)
# seconds = int(total_time_seconds % 60)
# print(f"Training finito in: {hours} ore {minutes} minuti e {seconds} secondi")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Ep 1/10, MA Reward: -9.73, MA loss: nan, Eps: 0.90
Ep 2/10, MA Reward: -9.52, MA loss: 1.1598, Eps: 0.80
Ep 3/10, MA Reward: -10.15, MA loss: 0.0267, Eps: 0.70
Ep 4/10, MA Reward: -11.62, MA loss: 1.0475, Eps: 0.60
Ep 5/10, MA Reward: -11.15, MA loss: 0.0446, Eps: 0.50
Ep 6/10, MA Reward: -10.65, MA loss: 0.3670, Eps: 0.40
Ep 7/10, MA Reward: -11.67, MA loss: 1.2424, Eps: 0.30
Ep 8/10, MA Reward: -12.77, MA loss: 0.8690, Eps: 0.20
Ep 9/10, MA Reward: -10.37, MA loss: 0.0107, Eps: 0.10
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  

In [None]:
""" Plot rewards """
window_size = 50
ma_reward = np.convolve(total_rewards, np.ones(window_size) / window_size, mode='valid')
plt.figure(figsize=(10, 6))
plt.scatter(np.arange(len(total_rewards)), total_rewards)
plt.plot(np.arange(window_size - 1, max_episodes), ma_reward, color='red', label=f'Moving Average (Window={window_size})', linewidth=2)
plt.title('Total Reward and Moving Average Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.legend()
plt.grid(True)
plt.show()

""" Plot loss per step """
window_size = 1000
ma_loss = np.convolve(loss_history, np.ones(window_size) / window_size, mode='valid')
plt.figure(figsize=(10, 6))
plt.plot(np.arange(len(loss_history)), loss_history, label='Loss per Step')
plt.plot(np.arange(window_size - 1, len(loss_history)), ma_loss, color='red', label=f'Moving Average (Window={window_size})', linewidth=2)
plt.title('Loss per Step')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()

""" Plot episode duration """
window_size = 50
ma_steps = np.convolve(len_episodes, np.ones(window_size) / window_size, mode='valid')
plt.figure(figsize=(10, 6))
plt.scatter(np.arange(len(len_episodes)), len_episodes)
plt.plot(np.arange(window_size - 1, max_episodes), ma_steps, color='red', label=f'Moving Average (Window={window_size})', linewidth=2)
plt.title('Steps per Episode and Moving Average Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Total steps')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
env = gym.make('ShipQuest-v0', render_mode='human', prox_sensor=True)

for ep in range(5):
    state, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = select_action(state, policy_net, 0, action_dim)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = next_state
        total_reward += reward

    print('total reward: ' + str(total_reward))
env.close()

In [None]:
checkpoint = {
    'gamma' : gamma,
    'alpha' : alpha,
    'eps_max' : eps_max,
    'eps_min' : eps_min,
    'exploration_fraction' : exploration_fraction,
    'state_dim': state_dim,
    'action_dim': action_dim,
    'hidden_layer_dim' : hidden_layer_dim,
    'target_soft_update' : target_soft_update,
    'tau' : tau,
    'buffer_size' : buffer_size,
    'batch_size' : batch_size,
    'max_episodes' : max_episodes,
    'model_state_dict': policy_net.state_dict(),
    'total_rewards': total_rewards,
    'loss_history' : loss_history,
    'len_episodes' : len_episodes,
}

torch.save(checkpoint, 'model_with_data_10000_ep.pth')