<a href="https://colab.research.google.com/github/speedhawk/LLM-A-/blob/main/PPO_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; every path used below
# (maps, checkpoints, configuration files) lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from IPython.display import HTML, display

def set_css():
  """Emit a stylesheet that sets `white-space: pre-wrap` on <pre> elements."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))

# Run set_css before every cell execution so the style applies to each output.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import copy
import re
import openpyxl
import random

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler, SequentialSampler
from torch.distributions import Categorical

In [None]:
from collections import deque
from collections import namedtuple

In [None]:
# One environment step as stored in the replay memory.
Transition = namedtuple('Transition', 'state action pre_prob reward next_state')

In [None]:
# AC_PPO_ini Model
class Critic(nn.Module):
  """State-value network: two tanh hidden layers with layer normalization
  followed by a linear head that outputs a single value per input row."""

  def __init__(self, input):
    super(Critic, self).__init__()
    hidden = 128
    self.dim = hidden
    self.fc1 = nn.Linear(input, hidden)
    self.fc2 = nn.Linear(hidden, hidden)
    self.state_value = nn.Linear(hidden, 1)

    # Despite the "bn" prefix these are LayerNorm modules; the attribute
    # names are part of the state_dict keys and therefore kept.
    self.bn1 = nn.LayerNorm(hidden)
    self.bn2 = nn.LayerNorm(hidden)

    self.initialization()

  def initialization(self):
    """Xavier-uniform initialization of the hidden weights with the tanh gain."""
    tanh_gain = nn.init.calculate_gain('tanh')
    for layer in (self.fc1, self.fc2):
      nn.init.xavier_uniform_(layer.weight, gain=tanh_gain)

  def forward(self, x):
    """Return the estimated state value for every row of `x`."""
    hidden = self.bn1(F.tanh(self.fc1(x)))
    hidden = self.bn2(F.tanh(self.fc2(hidden)))
    return self.state_value(hidden)

class Actor(nn.Module):
  """
  Policy network.

  INPUT: the encoded state of the agent as a 2-D batch (softmax is taken over dim 1)
  OUTPUT: tensor with one row per input row holding the probability of each action
  """
  def __init__(self, input, output):
    super(Actor, self).__init__()
    self.dim = 128
    self.fc1 = nn.Linear(input, self.dim)
    self.fc2 = nn.Linear(self.dim, self.dim)
    self.action_head = nn.Linear(self.dim, output)

    # Despite the "bn" prefix these are LayerNorm modules; the attribute
    # names are part of the state_dict keys and therefore kept.
    self.bn1 = nn.LayerNorm(self.dim)
    self.bn2 = nn.LayerNorm(self.dim)

    # Parameters of the Gaussian noise produced by noise_generate().
    # They must exist, otherwise noise_generate() raises AttributeError.
    self.mu = 0.0
    self.stdv = 0.1

    self.initialization()

  def initialization(self):
    """Xavier-uniform initialization of the hidden weights with the tanh gain."""
    nn.init.xavier_uniform_(self.fc1.weight, gain=nn.init.calculate_gain('tanh'))
    nn.init.xavier_uniform_(self.fc2.weight, gain=nn.init.calculate_gain('tanh'))

  def noise_generate(self, actions):
    """Return Gaussian noise N(mu, stdv) with the shape and device of `actions`."""
    noise = torch.normal(mean=self.mu, std=self.stdv, size=actions.size(), device=actions.device)
    return noise

  def forward(self, x):
    """Return the action probabilities for every row of `x`."""
    x = F.tanh(self.fc1(x))
    x = self.bn1(x)
    x = F.tanh(self.fc2(x))
    x = self.bn2(x)
    x = self.action_head(x)
    # Optional exploration noise on the logits (currently disabled):
    # x = x + self.noise_generate(x)
    action_prob = F.softmax(x, dim=1)
    return action_prob

In [None]:
# PPO agent
class PPO():
  """PPO agent together with the grid-world it navigates.

  The world is read from two sheets of an Excel workbook: `map_sheet` holds
  the map that is rendered later, and `reward_sheet` holds a per-cell value
  table in which -1024 marks a cell that cannot be entered. The reward of a
  move is the difference between the table values of the new and the current
  cell.

  The network input is a double one-hot encoding of the (x, y) position
  (2 * border entries) followed by 8 flags describing the neighbouring cells.
  """

  # Value in the reward sheet that marks a cell that cannot be entered.
  OBSTACLE = -1024

  # Google Drive folders used for checkpoints and for the resume files.
  CHECKPOINT_DIR = '/content/gdrive/MyDrive/PPO_demo_v4/checkpoints/'
  CONFIG_DIR = '/content/gdrive/MyDrive/PPO_demo_v4/configuration/'

  def __init__(self, path, start, goal, map_range, border, map_sheet='Sheet1', reward_sheet='Sheet2') -> None:
    """Create the agent.

    path: workbook holding the map and the reward table.
    start, goal: [x, y] start and target cells.
    map_range: spreadsheet range to read, e.g. 'A1:X24'.
    border: side length of the map; also fixes the network input size.
    map_sheet, reward_sheet: names of the two sheets inside the workbook.
    """
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # environment
    self.start_node = np.array(start)
    self.target_node = np.array(goal)
    self.cur_node = self.start_node
    self.pre_node = self.start_node

    self.file_path = path
    self.map_range = map_range
    self.map_sheet = map_sheet
    self.reward_sheet = reward_sheet
    self.map = None
    self.reward_distribution = None

    # map scope: set here (not only in vectorized_start) so situation() and
    # get_next_obs() also work when no training episode has been started.
    self.max_border = border
    self.x_min = 0
    self.x_max = border - 1
    self.y_min = 0
    self.y_max = border - 1

    # action index -> (dx, dy) displacement: 4 straight and 4 diagonal moves
    self.action_space = {0: [1, 0],
                         1: [-1, 0],
                         2: [0, 1],
                         3: [0, -1],
                         4: [-1, 1],
                         5: [1, 1],
                         6: [-1, -1],
                         7: [1, -1]}

    # learning param:
    self.memory = {}  # slot index -> list of transitions of one episode, newest first
    self.memory_len = 50
    self.batch_size = 128
    self.actor_max_grad = 0.05
    self.actor = Actor(2 * border + 8, 8).to(self.device)
    self.critic = Critic(2 * border + 8).to(self.device)
    self.gamma = 0.99
    self.lamda = 0.95
    self.a_lr = 2e-4
    self.c_lr = 5e-3
    self.clip = 0.3
    self.entropy_coe = 0.01

    self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.a_lr)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.c_lr)

    # iteration param:
    self.pre_reward_sign = float(0)
    self.sign_record = 0
    self.pre_unreshaped_reward = 0.0

    self.step_thr = 200
    self.steps = 0

    # statistic param:
    self.avg = []
    self.total_avg = []
    self.avg_step = []
    self.total_step = []

  def _read_sheet(self, sheet_name):
    """Return the cells of `self.map_range` on sheet `sheet_name` as a 2-D array."""
    wb = openpyxl.load_workbook(self.file_path)
    try:
      ws = wb[sheet_name]
      return np.array([[cell.value for cell in row] for row in ws[self.map_range]])
    finally:
      wb.close()

  def generate_map(self):
    """Load the map sheet into `self.map`."""
    self.map = self._read_sheet(self.map_sheet)

  def generate_dis_table(self):
    """Load the reward sheet into `self.reward_distribution`."""
    self.reward_distribution = self._read_sheet(self.reward_sheet)

  def save_checkpoints(self):
    """Write the actor and critic weights to the checkpoint folder."""
    torch.save(self.actor.state_dict(), self.CHECKPOINT_DIR + 'checkpoints_actor.pt')
    torch.save(self.critic.state_dict(), self.CHECKPOINT_DIR + 'checkpoints_critic.pt')

  def load_checkpoints(self):
    """Restore the actor and critic weights from the checkpoint folder."""
    self.actor.load_state_dict(torch.load(self.CHECKPOINT_DIR + 'checkpoints_actor.pt', map_location=self.device))
    self.critic.load_state_dict(torch.load(self.CHECKPOINT_DIR + 'checkpoints_critic.pt', map_location=self.device))

  def save_trans(self, index, trans):
    """Store `trans` at the front of memory slot `index` (newest first)."""
    self.memory[index].insert(0, trans)

  def vectorized_start(self, episode):
    """Return a uniformly sampled non-obstacle cell as np.array([x, y]).

    Also resets the map bounds. `episode` is accepted for interface
    compatibility and is not used.
    """
    self.x_min = 0
    self.x_max = self.max_border - 1
    self.y_min = 0
    self.y_max = self.max_border - 1

    while True:
      x = random.randint(self.x_min, self.x_max)
      y = random.randint(self.y_min, self.y_max)
      if self.reward_distribution[x][y] != self.OBSTACLE:
        return np.array([x, y])

  def reset(self):
    """Move the agent back to the start node."""
    self.cur_node = self.start_node

  def get_noise(self, actions):
    """Return Gaussian noise N(0, 0.1) with the shape of `actions`."""
    sigma = 0.1
    mu = 0.0
    noise_vector = torch.normal(mean=mu, std=sigma, size=actions.size())
    # The original computed the noise but never returned it.
    return noise_vector

  def double_hot(self, obs):
    """Encode position `obs` as two concatenated one-hot vectors (x, then y)."""
    double_hot_state = [0] * (self.max_border * 2)
    coords = obs.tolist()
    double_hot_state[coords[0]] = 1
    double_hot_state[self.max_border + coords[1]] = 1
    return double_hot_state

  def _is_blocked(self, x, y):
    """Return True when (x, y) lies outside the map or on an obstacle cell."""
    # The bounds test must come first: a negative index would otherwise wrap
    # around in the numpy lookup below.
    if x < self.x_min or x > self.x_max or y < self.y_min or y > self.y_max:
      return True
    return self.reward_distribution[x][y] == self.OBSTACLE

  def situation(self, obs):
    """Return 8 flags, one per action: 1 if that neighbour of `obs` can be
    entered, 0 if it is outside the map or an obstacle."""
    situation = []
    for i in self.action_space:
      sur_obs = (obs + np.array(self.action_space[i])).tolist()
      # Fixed: the upper y bound used to be compared against y_min, which
      # flagged every neighbour with y > 0 as blocked.
      situation.append(0 if self._is_blocked(sur_obs[0], sur_obs[1]) else 1)
    return situation

  def get_next_obs(self, action):
    """Apply `action` to the current node.

    Returns (next_node, collide_check). On a collision the agent stays where
    it is and collide_check is 1; otherwise collide_check is 0.
    """
    obs = (self.cur_node + np.array(self.action_space[action])).tolist()
    collide_check = 0

    if self._is_blocked(obs[0], obs[1]):
      obs = self.cur_node
      collide_check = 1

    return (np.array(obs), collide_check)

  def action_select(self, obs):
    """Sample an action from the actor's distribution for state tensor `obs`.

    Returns (action index, probability the actor assigned to that action).
    """
    possibilities = self.actor(obs).cpu().detach()

    distrb = Categorical(possibilities)   # attention: Categorical requires a tensor input!
    act = distrb.sample().item()

    posb = possibilities.numpy().item(act)

    return act, posb

  def get_reward(self, obs):
    """Return the raw reward for `obs`, a (next_node, collide_check) pair.

    A collision yields 0.0; otherwise the reward is the reward-table value of
    the next node minus that of the current node.
    """
    if obs[1] == 1:
      return 0.0

    next_node = obs[0]
    cur_node = self.cur_node.tolist()
    return float(self.reward_distribution[next_node[0]][next_node[1]]
                 - self.reward_distribution[cur_node[0]][cur_node[1]])

  def reward_reshaping(self, reward):
    """
    Add a bonus of 0.1 per consecutive positive reward seen before this one.

    Besides reshaping the reward, two attributes are maintained here:
      1. pre_reward_sign: sign of the previous raw reward.
      2. sign_record: length of the current run of positive rewards,
         not counting the first reward of the run.
    """
    r_sign = float(np.sign(reward))
    if r_sign == self.pre_reward_sign and r_sign == 1.0:
      self.sign_record += 1
    else:
      self.sign_record = 0

    index = 0.1 * (float(self.sign_record))
    reward += index
    self.pre_reward_sign = r_sign

    return reward

  def is_done(self, reward):
    """Return True when the (reshaped) reward equals exactly -10.0."""
    # NOTE(review): presumably -10 is a terminal penalty encoded in the
    # reward sheet - confirm against the map files.
    if reward == -10.0:
      return True

    return False

  def step(self, act):
    """Simulate action `act` without moving the agent.

    Returns ((next_node, collide_check), reward, done, info_string); the
    caller is responsible for updating `cur_node`.
    """
    obs = self.get_next_obs(act)
    reward = self.get_reward(obs)
    reward = self.reward_reshaping(reward)
    done = self.is_done(reward)
    inf = 'the current step is ' + str(done)
    return obs, reward, done, inf

  def critic_train(self):
    """Fit the critic to the discounted returns of all stored episodes."""
    states, returns = [], []
    for episode in self.memory.values():
      G = 0.0
      # Episodes are stored newest-first (see save_trans), so this walks
      # backwards in time and G accumulates the discounted return.
      for tran in episode:
        G = tran.reward + self.gamma * G
        states.append(tran.state.tolist())
        returns.append(G)

    n = len(returns)
    if n == 0:
      return

    # view(n, -1) flattens the per-transition leading batch dimension.
    states = torch.tensor(states, dtype=torch.float).view(n, -1).to(self.device)
    G = torch.tensor(returns, dtype=torch.float).view(-1, 1).to(self.device)

    # Number of passes grows with the amount of data (two per full batch).
    for _ in range(2 * n // self.batch_size):
      for index in BatchSampler(SubsetRandomSampler(list(range(n))), batch_size=self.batch_size, drop_last=False):
        v_s = self.critic(states[index]).view(-1, 1)
        critic_loss = F.mse_loss(v_s, G[index])

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

  def actor_train(self):
    """Update the actor with the clipped PPO objective plus an entropy bonus.

    Advantages are generalized advantage estimates computed from the current
    critic over all stored episodes.
    """
    states, actions, old_probs, advantages = [], [], [], []
    with torch.no_grad():
      for episode in self.memory.values():
        adv = 0.0
        # Newest-first order makes this the usual backward GAE recursion.
        for tran in episode:
          next_v = self.critic(tran.next_state.to(self.device)).item()
          v_s = self.critic(tran.state.to(self.device)).item()
          delta = tran.reward + self.gamma * next_v - v_s   # r + gamma * V(s') - V(s)
          adv = delta + self.gamma * self.lamda * adv
          states.append(tran.state.tolist())
          actions.append(tran.action)
          old_probs.append(tran.pre_prob)
          advantages.append(adv)

    n = len(advantages)
    if n == 0:
      return

    states = torch.tensor(states, dtype=torch.float).view(n, -1).to(self.device)
    actions = torch.tensor(actions, dtype=torch.long).view(-1, 1).to(self.device)
    advs = torch.tensor(advantages, dtype=torch.float).view(-1, 1).to(self.device)
    pre_probs = torch.tensor(old_probs, dtype=torch.float).view(-1, 1).to(self.device)

    for _ in range(2 * n // self.batch_size):
      # A random sampler reshuffles the batches on every pass; shuffling the
      # Python list after the tensors were built had no effect.
      for index in BatchSampler(SubsetRandomSampler(list(range(n))), batch_size=self.batch_size, drop_last=False):
        p = self.actor(states[index])
        cur_probs = p.gather(1, actions[index])

        # Keep the entropy inside the autograd graph so the bonus actually
        # regularizes the policy.
        entropy = Categorical(probs=p).entropy().mean()

        ratio = cur_probs / pre_probs[index]

        # Normalize advantages per batch; a single-element batch has no
        # standard deviation, and the epsilon guards constant advantages.
        n_advs = advs[index]
        if n_advs.numel() > 1:
          n_advs = (n_advs - n_advs.mean()) / (n_advs.std() + 1e-8)

        sel_a = ratio * n_advs
        sel_b = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * n_advs

        actor_loss = -torch.min(sel_a, sel_b).mean() - self.entropy_coe * entropy

        self.actor_opt.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.actor_max_grad)
        self.actor_opt.step()

  # The methods below persist the training progress so that a run can be
  # resumed after the notebook session was disconnected.

  @staticmethod
  def _parse_number(text):
    """Parse one record line as int when possible, otherwise as float."""
    text = text.strip()
    try:
      return int(text)
    except ValueError:
      return float(text)

  @staticmethod
  def _append_lines(file_path, values):
    """Append one line per value to `file_path`."""
    with open(file_path, 'a') as file_iter:
      file_iter.writelines([str(value) + '\n' for value in values])

  def load_episodes(self):
    """Return the stored episode counter; an empty file counts as 0."""
    with open(self.CONFIG_DIR + 'episodes.txt', 'r') as file_iter:
      episode = file_iter.read().strip()
    return int(episode) if episode else 0

  def save_episodes(self):
    """Store the number of recorded episodes."""
    with open(self.CONFIG_DIR + 'episodes.txt', 'w') as file_iter:
      file_iter.write(str(len(self.avg)))

  def load_records(self):
    """Append the stored statistics to avg, avg_step, total_avg and total_step."""
    cfg = self.CONFIG_DIR
    with open(cfg + 'avg_scores.txt', 'r') as file_avg_score, \
         open(cfg + 'avg_steps.txt', 'r') as file_avg_step, \
         open(cfg + 'scores.txt', 'r') as file_score, \
         open(cfg + 'steps.txt', 'r') as file_step:
      # The four files are written in lockstep, one line per episode.
      for a_score, a_step, score, step in zip(file_avg_score, file_avg_step, file_score, file_step):
        if not a_score.strip():
          break
        # Plain number parsing replaces eval(), which would execute whatever
        # text the files contain.
        self.avg.append(self._parse_number(a_score))
        self.avg_step.append(self._parse_number(a_step))
        self.total_avg.append(self._parse_number(score))
        self.total_step.append(self._parse_number(step))

  def save_records(self):
    """Append the statistics of the last `memory_len` episodes to the record files."""
    cfg = self.CONFIG_DIR
    self._append_lines(cfg + 'avg_scores.txt', self.avg[-self.memory_len:])
    self._append_lines(cfg + 'avg_steps.txt', self.avg_step[-self.memory_len:])
    self._append_lines(cfg + 'scores.txt', self.total_avg[-self.memory_len:])
    self._append_lines(cfg + 'steps.txt', self.total_step[-self.memory_len:])


**Select an environment**

In [None]:
from IPython.core.magics.script import default

# Environment 'Aisle_24': a 24x24 grid read from cells A1:X24.
map_name = 'Aisle_24'
border = 24
map_range = 'A1:X24'
start, goal = [1, 22], [21, 2]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Canyon_24': a 24x24 grid read from cells A1:X24.
map_name = 'Canyon_24'
border = 24
map_range = 'A1:X24'
start, goal = [2, 17], [15, 2]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Double_door_24': a 24x24 grid read from cells A1:X24.
map_name = 'Double_door_24'
border = 24
map_range = 'A1:X24'
start, goal = [1, 22], [19, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Double_door': a 32x32 grid read from cells A1:AF32.
map_name = 'Double_door'
border = 32
map_range = 'A1:AF32'
start, goal = [1, 29], [28, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Canyon': a 32x32 grid read from cells A1:AF32.
map_name = 'Canyon'
border = 32
map_range = 'A1:AF32'
start, goal = [3, 23], [21, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Aisle': a 32x32 grid read from cells A1:AF32.
map_name = 'Aisle'
border = 32
map_range = 'A1:AF32'
start, goal = [1, 30], [29, 2]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Double_door_16': a 16x16 grid read from cells A1:P16.
map_name = 'Double_door_16'
border = 16
map_range = 'A1:P16'
start, goal = [1, 15], [14, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Canyon_16': a 16x16 grid read from cells A1:P16.
map_name = 'Canyon_16'
border = 16
map_range = 'A1:P16'
start, goal = [2, 10], [11, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
from IPython.core.magics.script import default

# Environment 'Aisle_16': a 16x16 grid read from cells A1:P16.
map_name = 'Aisle_16'
border = 16
map_range = 'A1:P16'
start, goal = [1, 13], [14, 1]
path = '/content/gdrive/MyDrive/PPO_demo_v4/maps/' + map_name + '.xlsx'

# Build the agent, then load the map and reward sheets from the workbook.
agent = PPO(path, start, goal, map_range, border)
agent.generate_map()
agent.generate_dis_table()
device = agent.device

In [None]:
# print(path)

/content/gdrive/MyDrive/PPO_demo_v4/maps/Double_door_24.xlsx


***A long training run that risks being interrupted is usually not necessary here. However, the cells below can be run before training to guard against a sudden disconnect. Please create the backup folder first so that these backup files have somewhere to be stored.***

In [None]:
# Run this cell to delete the existing record files (the episode counter and
# the four statistics files) before starting a fresh training run.
!rm /content/gdrive/MyDrive/PPO_demo_v4/configuration/avg_scores.txt
!rm /content/gdrive/MyDrive/PPO_demo_v4/configuration/avg_steps.txt
!rm /content/gdrive/MyDrive/PPO_demo_v4/configuration/episodes.txt
!rm /content/gdrive/MyDrive/PPO_demo_v4/configuration/scores.txt
!rm /content/gdrive/MyDrive/PPO_demo_v4/configuration/steps.txt

In [None]:
# Run this cell to create empty record files when starting a new training run
# without existing records. episodes.txt is created empty as well; write a
# '0' into it before loading the episode counter.
!touch /content/gdrive/MyDrive/PPO_demo_v4/configuration/avg_scores.txt
!touch /content/gdrive/MyDrive/PPO_demo_v4/configuration/avg_steps.txt
!touch /content/gdrive/MyDrive/PPO_demo_v4/configuration/episodes.txt
!touch /content/gdrive/MyDrive/PPO_demo_v4/configuration/scores.txt
!touch /content/gdrive/MyDrive/PPO_demo_v4/configuration/steps.txt

In [None]:
# Restore the episode counter and the statistics written by earlier runs.
# If episodes.txt was newly created, open it and put a '0' in it first.
e = agent.load_episodes()
agent.load_records()

# Continue the running totals from the last stored entry, or start at zero.
total_avg = agent.total_avg[-1] if agent.total_avg else 0
steps = agent.total_step[-1] if agent.total_step else 0


In [None]:
# Show the restored totals and the episode to resume from, one per line.
print(steps, total_avg, e, sep='\n')

In [None]:
# Be careful: if a training run was interrupted (the printed e is not 0),
# run this cell to restore the saved actor/critic weights before continuing.
agent.load_checkpoints()

**Training**

In [None]:
# Training loop. Expects `agent`, `device`, `e`, `steps` and `total_avg` to
# have been set by the cells above.
episodes = 2000
# dr = 0.5

default_is_solved = False

default_mode = 'Train'

path_nodes = [agent.start_node.tolist()]
searched_space = []   # every cell the agent has stood on during training

agent.actor.train()
agent.critic.train()

# Linear schedules applied once per episode from episode 500 on.
lr_step = (2e-4) / 1500
entropy_step = 1.5e-5

for e in range(e, episodes):

  # Each episode starts from a random free cell.
  agent.start_node = agent.vectorized_start(e)
  agent.reset()
  total_step = 0
  score = 0.0
  done_time = 0

  # The memory is a ring of `memory_len` episode slots.
  index = e % agent.memory_len
  agent.memory[index] = []

  while list(agent.cur_node) != list(agent.target_node) and total_step < agent.step_thr:

    obs = np.array(agent.double_hot(agent.cur_node) + agent.situation(agent.cur_node))
    obs = torch.from_numpy(obs).type(torch.FloatTensor).unsqueeze(0).to(device)

    act, posb = agent.action_select(obs)

    # next_obs is a (node, collide_check) pair.
    next_obs, reward, done, _ = agent.step(act)
    n_o = np.array(agent.double_hot(next_obs[0]) + agent.situation(next_obs[0]))
    n_o = torch.from_numpy(n_o).type(torch.FloatTensor).unsqueeze(0).to(device)

    trans = Transition(obs, act, posb, reward, n_o)
    agent.save_trans(index, trans)
    score += reward

    # A "done" step is only counted; the agent stays in place and the
    # episode continues.
    if done:
      done_time += 1
    else:
      agent.pre_node = agent.cur_node
      agent.cur_node = next_obs[0]

    total_step += 1

    if agent.cur_node.tolist() not in searched_space:
      searched_space.append(agent.cur_node.tolist())

  # record the total scores and avg-scores of current episodes
  avg = score / (total_step+1)
  total_avg += avg
  agent.total_avg.append(total_avg)

  avg_score = total_avg / (e+1)
  agent.avg.append(avg_score)

  # record the total steps and avg-steps of current episodes
  steps += total_step
  agent.total_step.append(steps)

  avg_step = steps / (e+1)
  agent.avg_step.append(avg_step)

  # Train and persist the progress once every memory slot has been refilled.
  if (e+1) % agent.memory_len == 0:
    agent.critic_train()
    agent.actor_train()

    agent.save_episodes()
    agent.save_records()

  # update learning rate and entropy coefficient
  if e >= 500 and agent.a_lr > 0.0 and agent.a_lr - lr_step >= 0:
    # Clamp at zero: a negative coefficient would penalize entropy.
    agent.entropy_coe = max(0.0, agent.entropy_coe - entropy_step)
    agent.a_lr -= lr_step
    # The optimizer keeps its own copy of the learning rate, so the decayed
    # value has to be written into its parameter groups to take effect.
    for group in agent.actor_opt.param_groups:
      group['lr'] = agent.a_lr

  # save checkpoints
  agent.save_checkpoints()

  # output records of each episodes (avg_score is the value just appended to
  # agent.avg, so this does not depend on the list length matching e)
  print(f"episode {e+1}: start node: {agent.start_node.tolist()}, score: {round(score/(total_step+1), 2)}, avg score: {round(avg_score, 2)}, step: {total_step}, avg step: {round(avg_step, 2)}, done_time: {done_time}")

In [None]:
# Aliases for the per-episode running averages recorded by the agent.
avg_scores, avg_steps = agent.avg, agent.avg_step

In [None]:
# x axis for the plots: one index per recorded episode.
episodes = list(range(len(avg_steps)))

**Generate statistic images**

In [None]:
# Plot the running average score per episode. The data is read straight from
# the agent and the x axis is derived from it, so this cell does not depend
# on `avg_scores` / `episodes` having been set by other cells.
avg_score = agent.avg
# The label gives plt.legend() an entry to show.
l1 = plt.plot(range(len(avg_score)), avg_score, 'r--', label='average score')
plt.title('Score')
plt.xlabel('episodes')
plt.ylabel('average score')
plt.legend()
plt.show()

In [None]:
# Plot the running average number of steps per episode. The data is read
# straight from the agent and the x axis is derived from it, so this cell
# does not depend on `avg_steps` / `episodes` having been set by other cells.
avg_step = agent.avg_step
# The label gives plt.legend() an entry to show.
l2 = plt.plot(range(len(avg_step)), avg_step, 'b--', label='average steps')
plt.title('Steps')
plt.xlabel('episodes')
plt.ylabel('average steps')
plt.legend()
plt.show()

**Generate result images**

In [None]:
# Test process

# Test process: roll out the saved policy from `start` towards `goal` and
# collect the distinct cells it visits in `path_nodes`.
default_mode = 'Test'
agent.load_checkpoints()
agent.actor.eval()
agent.critic.eval()
path_nodes = [start]
test_steps = 0
agent.start_node = np.array(start)
agent.target_node = np.array(goal)
agent.cur_node = agent.start_node

# The map bounds are otherwise only set when a training episode starts; set
# them here so this cell also works in a session that only loads checkpoints.
agent.x_min = agent.y_min = 0
agent.x_max = agent.y_max = agent.max_border - 1

done_time = 0

# Actions are sampled, so a poor policy may never reach the goal; cap the
# rollout instead of looping forever.
max_test_steps = 100 * agent.step_thr

while list(agent.cur_node) != list(agent.target_node) and test_steps < max_test_steps:
  obs = np.array(agent.double_hot(agent.cur_node) + agent.situation(agent.cur_node))
  obs = torch.from_numpy(obs).type(torch.FloatTensor).unsqueeze(0).to(device)
  act, posb = agent.action_select(obs)

  # next_obs is a (node, collide_check) pair.
  next_obs, reward, done, _ = agent.step(act)

  if done:
    done_time += 1
  else:
    agent.pre_node = agent.cur_node
    agent.cur_node = next_obs[0]
    if agent.cur_node.tolist() not in path_nodes:
      path_nodes.append(agent.cur_node.tolist())

  test_steps += 1

if list(agent.cur_node) != list(agent.target_node):
  print(f"Goal not reached within {max_test_steps} steps")
print(f"Total step: {test_steps}")

In [None]:
# generate pic (beta)

# Build an RGB image of the map: three independent copies of the grid become
# the red, green and blue channels.
map_r = copy.deepcopy(agent.map)
map_g = copy.deepcopy(map_r)
map_b = copy.deepcopy(map_g)
map = np.dstack((map_r, map_g, map_b)).astype(float)

# Every cell visited during training is painted pure green.
for node in searched_space:
  x, y = node[0], node[1]
  map[x][y] = (0.0, 1.0, 0.0)

# The test path fades from blue (first node) towards red (last node); while
# walking it, accumulate the turning angle between consecutive moves and
# count the moves that point away from the start-to-goal direction.
delta_color = round(1.0 / len(path_nodes), 4)
color_0, color_2 = 0.0, 1.0
bas_dir_vector = np.array([0.0, 1.0])   # direction of the previous move
disp_vector = np.array(goal) - np.array(start)
sum_angle = avg_angle = 0.0
count = max_dev_times = 0

for i, node in enumerate(path_nodes):

  x, y = node[0], node[1]
  map[x][y] = (color_0, 0.0, color_2)

  color_0 += delta_color
  color_2 -= delta_color

  # The first node has no predecessor, hence no move to measure.
  if i > 0:
    count += 1
    c_node, p_node = node, path_nodes[i-1]

    dir_vector = np.array(c_node) - np.array(p_node)
    turn_cos = np.dot(dir_vector, bas_dir_vector)/(np.linalg.norm(dir_vector)*np.linalg.norm(bas_dir_vector))
    goal_cos = np.dot(dir_vector, disp_vector)/(np.linalg.norm(dir_vector)*np.linalg.norm(disp_vector))
    # Clipping keeps arccos defined when rounding pushes the cosine past +-1.
    angle = round(abs(np.arccos(np.clip(turn_cos, -1.0, 1.0))), 3)
    dis_angle = round(abs(np.arccos(np.clip(goal_cos, -1.0, 1.0))), 3)

    sum_angle += angle
    avg_angle = round(sum_angle / float(count), 2)

    bas_dir_vector = dir_vector

    # More than 90 degrees off the start-to-goal direction.
    if dis_angle > math.pi / 2.0:
      max_dev_times += 1

# (Disabled) highlight the start cell in magenta and the goal cell in yellow:
# map[start[0]][start[1]] = (1.0, 0.0, 1.0)
# map[goal[0]][goal[1]] = (1.0, 1.0, 0.0)

# Scale the channels to integer values for display.
path_map = (map * 255.0).astype(int)

In [None]:
# Show the rendered path map on a single axes without ticks or frame.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=[12, 4])
axes.imshow(path_map)
axes.axis('off')