In [11]:
import os
import numpy as np
import pickle
from livelossplot import PlotLosses
from collections import deque
from envs.GraphNavEnv.graph_navigation_env import GraphNavEnv

from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import random
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from envs.GraphEnv.impnode import ImpnodeEnv
import DQN_agent

In [14]:
from pytorchtools import EarlyStopping

In [15]:
# to initialize the replay buffer with some random interactions

def fill_memory(env, agent):
    for _ in range(NUM_MEM_FILL_EPS):
        N_STEP = 5
        state_history, action_history, reward_history = [], [], []
        done = False
        state, info = env.reset()

        while not done:
            action = env.action_space.sample(mask=info['node_action_mask']) # samples random action
            next_state, reward, done, truncated, info = env.step(action)
            state_history.append(state)
            action_history.append(action)
            reward_history.append(reward)

            if len(state_history) >= N_STEP:
                n_step_states = state_history[-N_STEP]
                n_step_actions = action_history[-N_STEP]
                n_step_rewards = reward_history[-N_STEP:]

                # Calculate n-step return
                n_step_return = sum(reward * (agent.discount ** i) for i, reward in enumerate(n_step_rewards))
                agent.memory.store(
                    state=n_step_states,
                    action=n_step_actions,
                    next_state=next_state,
                    reward=n_step_return,
                    done=done
                )
            state = next_state
            #agent.memory.store(state=state, action=action, next_state=next_state, reward=reward, done=done)

In [16]:
# trains the agent and plots the associated moving average rewards and epsilon values in real-time

def train_loop(env, agent, results_basepath):

    early_stopping = EarlyStopping(patience=20, verbose=True)

    liveloss = PlotLosses()
    logs = {}

    last_100_rewards = deque([], maxlen=50)

    reward_history = []
    epsilon_history = []

    step_cnt = 0
    best_score = -np.inf

    N_STEP = 5

    for ep_cnt in range(NUM_TRAIN_EPS):

        #logs['train epsilon'] = agent.epsilon # to plot current epsilon value
        state_history, action_history, reward_history = [], [], []
        done = False
        state, info = env.reset()
        ep_score = 0
        while not done:
            mask = info['node_action_mask']
            action = agent.select_action(state, mask)

            next_state, reward, done, truncated, info = env.step(action)

            state_history.append(state)
            action_history.append(action)
            reward_history.append(reward)

            if len(state_history) >= N_STEP:
                n_step_states = state_history[-N_STEP]
                n_step_actions = action_history[-N_STEP]
                n_step_rewards = reward_history[-N_STEP:]

                # Calculate n-step return
                n_step_return = sum(reward * (agent.discount ** i) for i, reward in enumerate(n_step_rewards))
                agent.memory.store(
                    state=n_step_states,
                    action=n_step_actions,
                    next_state=next_state,
                    reward=n_step_return,
                    done=done
                )
                agent.learn(BATCHSIZE)

                if step_cnt % UPDATE_FREQUENCY == 0:
                    agent.update_target_net()

                step_cnt += 1

            state = next_state
            ep_score += reward
            ##########
            #agent.memory.store(state=state, action=action, next_state=next_state, reward=reward, done=done)
            #agent.learn(BATCHSIZE)

            # if step_cnt % UPDATE_FREQUENCY == 0:
            #     agent.update_target_net()

            # state = next_state
            # ep_score += reward
            # step_cnt += 1

        agent.update_epsilon()

        last_100_rewards.append(ep_score)
        current_avg_score = np.mean(last_100_rewards) # get average of last 100 scores
        logs['train avg score'] = current_avg_score

        reward_history.append(ep_score)
        epsilon_history.append(agent.epsilon)

        if ep_cnt % 300 == 0:
            agent.data = True
            val_score_history = []
            for ep in range(100):
                ep_score = 0
                done = False
                state, info = env.reset(ep)
                while not done:
                    mask = info['node_action_mask']
                    action = agent.select_action(state, mask)
                    next_state, reward, done, truncated, _ = env.step(action)
                    ep_score += reward
                    state = next_state

                # track reward history only while running locally

                val_score_history.append(ep_score)
                agent.data = False
            val_score = np.average(val_score_history)

            early_stopping(val_score, agent)
            logs['val avg score'] = val_score
            if early_stopping.early_stop:
                print("Early stopping")
                break

            if val_score >= best_score:
                #agent.save_model('{}/dqn_model'.format(results_basepath))
                agent.save_model('{}/model.pt'.format(results_basepath))
                best_score = val_score

        # update the plots in real-time
        liveloss.update(logs)
        liveloss.send()

    # store the reward and epsilon history that was tracked while running locally

    with open('{}/train_reward_history.pkl'.format(results_basepath), 'wb') as f:
        pickle.dump(reward_history, f)

    with open('{}/train_epsilon_history.pkl'.format(results_basepath), 'wb') as f:
        pickle.dump(epsilon_history, f)

In [17]:
# variables for training the agent

NUM_TRAIN_EPS = 10 #1000 number training episodes to run
NUM_MEM_FILL_EPS = 10 #10 number of episodes to run to initialize the memory

DISCOUNT = 0.99 # gamma used for computing return

BATCHSIZE = 64 # number of transitions to sample from replay buffer for each learn step
MEMORY_CAPACITY = 50 # size of the memory buffer
UPDATE_FREQUENCY = 10 # number of interactions after which the target buffer is updated

EPS_MAX = 1.0 # initial epsilon value
EPS_MIN = 0.05 # final epsilon value
EPS_STEP = 100 # amount by which epsilon is decayed at each episode

LR = 0.01 # learning rate for the network

In [18]:
# create folder for storing the model and other files
# graph_size, layer_iterations, embedding_dim, NUM_TRAIN_EPS,NUM_MEM_FILL_EPS,DISCOUNT,BATCHSIZE,MEMORY_CAPACITY, UPDATE_FREQUENCY,EPS_MAX,EPS_MIN, EPS_STEP,LR
results_basepath_train = "results/trial_model_newest".format(
                        NUM_TRAIN_EPS,NUM_MEM_FILL_EPS,DISCOUNT,BATCHSIZE,MEMORY_CAPACITY, UPDATE_FREQUENCY,EPS_MAX,EPS_MIN, EPS_STEP,LR)
os.makedirs(results_basepath_train, exist_ok=True)

subdir = 'data/30-50'
data_path = Path.cwd()/subdir

seed = None
#env_train = ImpnodeEnv(anc='dw_nd', ba_nodes=(15, 25), ba_edges = 4,max_removed_nodes = 10, seed=seed, render_option=False, data= False,data_path=data_path, train_mode=True)
env_train = GraphNavEnv(fix_random_graphs=True)


In [21]:
# create the dqn_agent
dqn_agent_train = DQN_agent.DQNAgent(device,
                                     #env_train.observation_space.shape[0],
                                     2,#5,
                                     env_train.action_space.n,
                                     discount=DISCOUNT,
                                     eps_max=EPS_MAX,
                                     eps_min=EPS_MIN,
                                     eps_step=EPS_STEP,
                                     memory_capacity=MEMORY_CAPACITY,
                                     lr=LR,
                                     train_mode=True)



In [22]:
# initialise the memory
fill_memory(env_train, dqn_agent_train)

# train the agent
train_loop(env_train, dqn_agent_train, results_basepath_train)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (35x5 and 2x46)

In [None]:
# tests the agent through interactions with the environment and plots the associated rewards in real-time

def test_loop(env, agent, results_basepath):
    liveloss = PlotLosses()
    logs = {}

    reward_history = []
    actions = []
    for ep in range(NUM_TEST_EPS):
        ep_score = 0
        done = False
        state, info = env.reset(ep)
        while not done:
            mask = info['node_actio n_mask']
            action = agent.select_action(state, mask)
            actions.append(action)
            print(action)
            next_state, reward, done, truncated, _ = env.step(action)
            ep_score += reward
            state = next_state

        # track reward history only while running locally

            reward_history.append(reward)

        # update the plot in real-time
            logs['test score'] = reward
            liveloss.update(logs)
            liveloss.send()
    return actions

    # store the reward that was tracked while running locally

    with open('{}/test_reward_history.pkl'.format(results_basepath), 'wb') as f:
        pickle.dump(reward_history, f)

In [None]:
# variables for testing the agent
# location where the model is stored and the name of the associated environment

RESULTS_BASEPATH_TEST = 'results/new_30-50_traineps50000_epsmax1.0_epsmin0.05_epsstep10000_batchsize64_treps50000_memeps1000_memcap500000_gseedFalse'

NUM_TEST_EPS = 1 # number of test episodes to run

In [None]:
# create test environment and set associated seed

seed = 1
env_test = ImpnodeEnv(ba_nodes=random.randint(15, 25) * 2 ,ba_edges = 4,max_removed_nodes = 3, seed=seed, render_option=False, data=False, train_mode=False)

# create the dqn agent with the stored weights
dqn_agent_test = DQN_agent.DQNAgent(device=device,
                          state_size=5,#env_test.observation_space.shape[0],
                          action_size=env_test.action_space.n,
                          discount=0.0,
                          eps_max=0.0,
                          eps_min=0.0,
                          eps_step=0.0,
                          memory_capacity=0,
                          lr=0,
                          train_mode=False)
dqn_agent_test.load_model('{}/dqn_model'.format(RESULTS_BASEPATH_TEST))



In [None]:
# test the agent
actions = test_loop(env=env_test,
          agent=dqn_agent_test,
          results_basepath=RESULTS_BASEPATH_TEST)

In [None]:
actions