## COMP 579 Final Project
- Ronald
- Sienna

### cpu / gpu count

In [None]:
# CPU cores
import multiprocessing
num_cores = multiprocessing.cpu_count()
print("Number of CPU cores: ", num_cores)

# CUDA devices
import torch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    properties = torch.cuda.get_device_properties(device)
    # multi_processor_count is the number of streaming multiprocessors (SMs)
    # on the current device, not the number of CUDA cores; the variable name
    # is kept so later cells that read `cores` still work.
    cores = properties.multi_processor_count
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
    print(f"Number of CUDA streaming multiprocessors (SMs): {cores}")
else:
    print("CUDA is not available.")

### install packages

In [None]:
# !pip install pettingzoo
# !pip install shimmy[openspiel]

# !pip install torchrl

### project directory

In [None]:
# Setting up project working directory
import os
# Changing project directory if working on remote machine
# PROJECT_DIRECTORY = "/home/ronald/579/final-project"
# os.chdir(PROJECT_DIRECTORY)
print(f"current working directory: {os.getcwd()}")
# ensure that the project directory is in the search path (for modules)
import sys
# Guard the append so re-running this cell does not pile up duplicate entries.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
print(f"added search path: {os.getcwd()}")

# Setting default directory for saving/loading data
DIRECTORY_NAME = "comp-579-final-project-data"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(DIRECTORY_NAME, exist_ok=True)
DIRECTORY_PATH = os.path.abspath(DIRECTORY_NAME)
print(f"save/load data directory: {DIRECTORY_PATH}")

# This should be internal to run.py once run.load(data_id) is implemented
# Helper functions to save/load training data
import pickle
def save(object, filename):
    """Serialize ``object`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Parameters:
        object: any picklable Python object (the name shadows the builtin
            but is kept because callers may pass it by keyword)
        filename: path of the file to write
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(object, sink)
def load(filename):
    """Read ``filename`` and return the object pickled inside it.

    Parameters:
        filename: path of a file previously written with pickle

    Returns:
        the unpickled Python object
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode='rb') as source:
        return pickle.load(source)

### import classes and functions

In [None]:
from pettingzoo.classic.hanabi import hanabi
import torch.optim as optim

from network_models import DQN
from memory_models import ReplayMemory, PrioritizedReplayMemory
from agents import HumanAgent, RandomAgent, DQNAgent, DDQNAgent
import training_online as online
import training_offline as offline
import testing as test

# ignoring deprecation warnings
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

### plotting functions

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# set up matplotlib
# is_ipython = 'inline' in matplotlib.get_backend()
# if is_ipython:
#     from IPython import display
# plt.ion()

def plot_trial(trial_output):
    """Plot the score curve of a single trial and show the figure.

    Parameters:
        trial_output: sequence of scores, plotted against its index
            (labelled "Episode" on the x-axis)
    """
    # Imported here so the function does not depend on ``matplotlib.ticker``
    # having been loaded as a side effect of another import.
    from matplotlib.ticker import MultipleLocator

    # Activate the style BEFORE creating the figure: rcParams are read when
    # the figure/axes are created, so applying the style afterwards left the
    # first figure only partially styled.
    plt.style.use('seaborn-v0_8')
    _, ax = plt.subplots(figsize=(15, 6))

    ax.plot(trial_output)

    ax.set_xlabel("Episode", fontsize=14)
    ax.set_ylabel("Score", fontsize=14)
    ax.set_title("Score over 1 Trial", fontsize=16)
    # Set the major tick locators for y-axis to display only integers
    ax.yaxis.set_major_locator(MultipleLocator(1))

    plt.show()

def plot_trials(trials_output, curve_label='', data_id=None):
    """Plot the mean score across trials with a +/- one-std shaded band.

    Parameters:
        trials_output: array-like whose first axis indexes trials; mean and
            standard deviation are taken across that axis
        curve_label: string used as the figure title and as the legend label
            of the mean curve
        data_id: if not None, the figure is also saved as
            ``<DIRECTORY_PATH>/<data_id>.png`` before being shown
    """
    trials_output = np.array(trials_output)
    num_trials = len(trials_output)

    # Compute statistics across trials
    mean_rewards = np.mean(trials_output, axis=0)
    std_rewards = np.std(trials_output, axis=0)

    # Activate the style before the figure exists so the figure itself is
    # created with the style's rcParams.
    plt.style.use('seaborn-v0_8')
    plt.figure(figsize=(10, 6))

    # Plot mean rewards with shading for standard deviation
    plt.plot(mean_rewards, label=curve_label)
    plt.fill_between(
        range(len(mean_rewards)),
        mean_rewards - std_rewards,
        mean_rewards + std_rewards,
        color='lightblue',
        alpha=0.3,
    )
    plt.xlabel('Episode')
    plt.ylabel(f'Average Score over {num_trials} Trials')
    plt.title(curve_label)
    # An empty label yields no legend entries (and a matplotlib warning), so
    # only draw the legend when there is something to show.
    if curve_label:
        plt.legend()
    #plt.grid(True)
    if data_id is not None:
        plt.savefig(os.path.join(DIRECTORY_PATH, f'{data_id}.png'))
    plt.show()

def plot_trialss(trials_outputs, curve_labels):
    """
    Plot one mean curve (with a +/- one-std shaded band) per entry of
    trials_outputs on a shared figure, then show it.

    Preconditions:
        - len(trials_outputs) == len(curve_labels)
    Parameters:
        trials_outputs: iterable of array-likes; for each one the first axis
            indexes trials and statistics are taken across that axis
        curve_labels: iterable of strings, one legend label per curve
    """
    # Activate the style before the figure exists so the figure itself is
    # created with the style's rcParams.
    plt.style.use('seaborn-v0_8')
    plt.figure(figsize=(10, 6))

    for trials_output, curve_label in zip(trials_outputs, curve_labels):
        trials_output = np.array(trials_output)

        # Compute statistics across trials
        mean_rewards = np.mean(trials_output, axis=0)
        std_rewards = np.std(trials_output, axis=0)

        # Plot mean rewards with shading for standard deviation
        plt.plot(mean_rewards, label=curve_label)
        plt.fill_between(
            range(len(mean_rewards)),
            mean_rewards - std_rewards,
            mean_rewards + std_rewards,
            alpha=0.3,
        )

    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.title('Performance of Different Agents')
    plt.legend()
    #plt.grid(True)
    plt.show()

def plot_agents(trials_output_dict, curve_label_dict):
    """
    Plot every entry of trials_output_dict on one figure via plot_trialss.

    Outputs and labels are paired by dictionary KEY rather than by the two
    dictionaries' independent iteration orders, so a label can never be
    attached to the wrong curve when the dicts were filled in different
    orders. A key without a label falls back to str(key).

    Preconditions:
        - len(trials_output_dict) == len(curve_label_dict)
    Parameters:
        trials_output_dict: dictionary of lists of floats
        curve_label_dict: dictionary of strings
    """
    keys = list(trials_output_dict)
    trials_outputs = [trials_output_dict[key] for key in keys]
    curve_labels = [curve_label_dict.get(key, str(key)) for key in keys]
    plot_trialss(trials_outputs, curve_labels)

## DQN Training

In [None]:
# to record execution time
import time

# Containers for training results and their plot labels; later cells fill
# them in and the plotting helpers read them.
trials_output = dict()
curve_label = dict()

In [None]:

env_constructor = hanabi.env
env = env_constructor(render_mode="human", players=2)
env.reset()

# Problem dimensions read from the environment
N_ACTIONS = env.action_space('player_0').n # assuming same number of actions for all players
N_OBSERVATIONS = env.observation_vector_dim[0]

# Agent settings
HIDDEN_LAYERS = [512, 512] # HIDDEN_LAYERS is the list of number of neurons for each hidden layer
policy_net = DQN(N_OBSERVATIONS, N_ACTIONS, HIDDEN_LAYERS)
target_net = DQN(N_OBSERVATIONS, N_ACTIONS, HIDDEN_LAYERS)
LR = 1e-4 # LR is the learning rate of the ``AdamW`` optimizer
optimizer = optim.AdamW(policy_net.parameters(), lr=LR)
MEMORY_SIZE = 10000 # MEMORY_SIZE is the size of replay buffer
memory = ReplayMemory(MEMORY_SIZE)
BATCH_SIZE = 256 # BATCH_SIZE is the number of transitions sampled from the replay buffer
# The remaining hyperparameters are passed straight to DQNAgent; their exact
# meaning is defined in agents.py (names suggest discount factor, target
# update rate and epsilon schedule -- confirm there).
GAMMA = 1
TAU = 0.005
EPS_START = 1
EPS_END = 0
EPS_DECAY = 10000 # decay "temperature"

# Both agents are built from the SAME networks, optimizer and replay memory.
agent1 = DQNAgent(policy_net, target_net, optimizer, memory, BATCH_SIZE, GAMMA, TAU, EPS_START, EPS_END, EPS_DECAY)
agent2 = DQNAgent(policy_net, target_net, optimizer, memory, BATCH_SIZE, GAMMA, TAU, EPS_START, EPS_END, EPS_DECAY)
agents = [agent1, agent2]

# trial setting
N_EPISODES = 5000
N_TRIALS = 1
# manually select episodes to save returns
RETURNS_OUTPUT_EPISODES = [1, 9999, 19999, 29999, 39999, 49999, 59999, 69999, 79999, 89999, N_EPISODES - 1]
# agents are saved at the same episodes; copied so the two lists stay independent objects
AGENTS_OUTPUT_EPISODES = list(RETURNS_OUTPUT_EPISODES)
DATA_ID = f"{N_TRIALS}-trials-{N_EPISODES // 1000}k__DQN" # id used for save/load filename
# concatenate data_id with number of episodes (shared by the timing file and the plot)
filename__plot = f"{DATA_ID}__ep{N_EPISODES-1}"


# CHOOSE WHAT TO RUN

start_time = time.time()
# run_episode(env, [agent1, agent2], "human")
# trial_output = offline.run_trial(env_constructor, agents, N_EPISODES, [], RETURNS_OUTPUT_EPISODES, AGENTS_OUTPUT_EPISODES, DIRECTORY_PATH, DATA_ID)
# trials_output[DATA_ID] = offline.run_trials_sequential(env_constructor, agents, N_TRIALS, N_EPISODES, RETURNS_OUTPUT_EPISODES, AGENTS_OUTPUT_EPISODES, DIRECTORY_PATH, DATA_ID, verbose=True)
trials_output[DATA_ID] = offline.run_trials_parallel(env_constructor, agents, N_TRIALS, N_EPISODES, RETURNS_OUTPUT_EPISODES, AGENTS_OUTPUT_EPISODES, DIRECTORY_PATH, DATA_ID, verbose=True)
end_time = time.time()

# Record the wall-clock duration next to the other outputs of this run
with open(os.path.join(DIRECTORY_PATH, f"{filename__plot}.txt"), "w") as f:
    print(f"Execution time: {end_time - start_time} seconds", file=f)

# Plotting
curve_label[DATA_ID] = DATA_ID
plot_trials(trials_output[DATA_ID], curve_label[DATA_ID], filename__plot)

## Loading Agents

In [None]:
def DQN_load_agents(filename_player0, filename_player1):
    """Restore two DQN agents from disk and make them share learner state.

    Each agent is restored with ``DQNAgent.load``. The second agent's
    ``policy_net``, ``target_net`` and ``memory`` attributes are then rebound
    to the first agent's objects; every other attribute of the second agent
    keeps the value loaded from its own file.

    Parameters:
        filename_player0: checkpoint path for the first agent
        filename_player1: checkpoint path for the second agent

    Returns:
        list ``[agent0, agent1]``
    """
    first = DQNAgent.load(filename_player0)
    second = DQNAgent.load(filename_player1)
    # NOTE(review): only these three attributes are shared; anything else
    # tied to the networks (e.g. an optimizer) is left as loaded -- confirm
    # that this is intended.
    for shared_attr in ("policy_net", "target_net", "memory"):
        setattr(second, shared_attr, getattr(first, shared_attr))
    return [first, second]
# def DDQN_load_agents...
# def PDDQN_load_agents...
# To do: generalize interface for loading any agents
# To do: training_offline.continue_run_trials_parallel(...)

# Checkpoints of both players from the same run / trial / episode.
checkpoint_dir = os.path.join(DIRECTORY_PATH, "8-trials-100k__DQN__1e-4__M10000__B256")
checkpoint_stem = "DQN__1e-4__M10000B256__trial5__ep99999"
filename_player0 = os.path.join(checkpoint_dir, f"{checkpoint_stem}__player0.pt")
# NOTE(review): this path previously pointed at the player0 checkpoint as
# well (apparent copy-paste slip). Assumes a matching ``__player1.pt`` file
# was saved alongside it -- confirm before running.
filename_player1 = os.path.join(checkpoint_dir, f"{checkpoint_stem}__player1.pt")
loaded_agents = DQN_load_agents(filename_player0, filename_player1)
# agents = [RandomAgent(), RandomAgent()]

## Testing Loaded Agents

In [None]:
# Report how many steps each loaded agent has recorded
for idx in (0, 1):
    print(f"Agent {idx} steps done: ", loaded_agents[idx].steps_done)
# print("Agent 0 policy_net:\n", agents[0].policy_net.state_dict())
# print("*******************************************************************")
# print("Agent 1 policy_net:\n", agents[1].policy_net.state_dict())

# trial setting
N_EPISODES = 100
N_TRIALS = 1
RETURNS_OUTPUT_EPISODES = [] # manually select episodes to save returns
AGENTS_OUTPUT_EPISODES = [] # manually select episodes to save agents
DATA_ID = None # id used for save/load filename

# Fresh two-player Hanabi environment for evaluation
env_constructor = hanabi.env
env = env_constructor(render_mode="human", players=2)
env.reset()

# run testing
trial_output = test.run_trial(
    env_constructor,
    loaded_agents,
    N_EPISODES,
    [],
    RETURNS_OUTPUT_EPISODES,
    AGENTS_OUTPUT_EPISODES,
    DIRECTORY_PATH,
    DATA_ID,
    False,
)

# mean score
print("Mean score: ", np.mean(trial_output))

# Plotting
# plot_trial(trial_output)


## Comparing all Agents

In [None]:
# plot_agents(trials_output, curve_label)

## Continue Training with DQN Agents

In [None]:
# def DQN_load_agents(filename_player0, filename_player1):
#     agent0 = DQNAgent.load(filename_player0)
#     agent0 = DQNAgent.load(filename_player1)
#     agent1.policy_net = agent0.policy_net
#     agent1.target_net = agent0.target_net
#     return [agent0, agent1]
# def DDQN_load_agents...
# def PDDQN_load_agents...
# To do: generalize interface for loading any agents
# To do: training_offline.continue_run_trials_parallel(...)


## Loading Trial

In [None]:
# trial_loaded = load(os.path.join(DIRECTORY_PATH, f"DQN__LZY__2x512__1e-4__M10000B256__trial5__ep70000__trial_output.pkl"))
# curve_label_loaded = 'DQN__LZY__2x512__1e-4__M10000B256'
# plot_trial(trial_loaded)

## Loading Trials

In [None]:
# trials_loaded = []
# for trial in range(8):
#     trials_loaded.append(load(os.path.join(DIRECTORY_PATH, f"DQN__LZY__2x512__1e-4__M10000B256__trial{trial}__ep70000__trial_output.pkl")))
# curve_label_loaded = ['DQN Lazy 1e-4 M10000B256']
# plot_trials(trials_loaded, curve_label_loaded)