In [None]:
# Import libraries
from mlagents_envs.environment import UnityEnvironment
from torchsummary import summary
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Also report the name of the GPU at index 0
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

# Define NN

In [None]:
# Size of one agent's observation: delta X, delta Z, theta and 5 range-sensor readings
N_STATES: int = 8
# Number of discrete actions: move forward, turn right, turn left
N_ACTIONS: int = 3

In [None]:
class Net(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Architecture: N_STATES -> 50 (sigmoid) -> 25 (ReLU) -> 20 (ReLU) -> N_ACTIONS,
    followed by a softmax over the action (last) axis. All weight matrices are
    initialised from a normal distribution with mean 0 and std 0.2; biases keep
    the default ``nn.Linear`` initialisation.

    NOTE(review): because of the final softmax the outputs lie in [0, 1] and
    sum to 1 per sample. The training cell regresses these outputs against
    ``reward + GAMMA * max(next output)`` -- confirm the softmax is intended
    for values that are used as Q-values.
    """

    def __init__(self):
        super().__init__()
        # Each layer is re-initialised right after it is created so the order
        # in which random numbers are drawn stays the same as before.
        self.layer1 = nn.Linear(N_STATES, 50)
        nn.init.normal_(self.layer1.weight, mean=0.0, std=0.2)
        self.layer2 = nn.Linear(50, 25)
        nn.init.normal_(self.layer2.weight, mean=0.0, std=0.2)
        self.layer3 = nn.Linear(25, 20)
        nn.init.normal_(self.layer3.weight, mean=0.0, std=0.2)
        self.out = nn.Linear(20, N_ACTIONS)
        nn.init.normal_(self.out.weight, mean=0.0, std=0.2)

    def forward(self, x):
        """Return the softmax-normalised action values for ``x``.

        Args:
            x: float tensor whose last dimension has N_STATES entries, either a
                single state ``(N_STATES,)`` or a batch ``(batch, N_STATES)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and N_ACTIONS
            entries in the last dimension.
        """
        x = torch.sigmoid(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        x = self.out(x)
        # dim=-1 equals dim=1 for batched 2-D input and also supports a single
        # un-batched state vector, for which dim=1 raised an IndexError.
        return F.softmax(x, dim=-1)

In [None]:
# Evaluation network: the one that is optimised and used to choose actions
eval_net = Net()
eval_net.to(device)  # nn.Module.to moves the parameters in place
print(eval_net)

In [None]:
# Optimise the evaluation network with Adam; the loss is the mean squared error
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(params=eval_net.parameters(), lr=1e-3)
print(optimizer, loss_func, sep="\n")

In [None]:
# Target network: provides the next-state values used to build the Q-targets
target_net = Net().to(device)
# Start the target as an exact copy of the evaluation network. Two independently
# initialised networks would make the first targets unrelated to the network
# being trained, and the small soft-update factor would only fix that slowly.
target_net.load_state_dict(eval_net.state_dict())
print(target_net)

# Define replay buffer

In [None]:
# Replay memory: one row per transition, laid out as
# [state (N_STATES) | action (1) | reward (1) | next state (N_STATES)]
MEMORY_CAPACITY = 10000
MEMORY = np.zeros(shape=(MEMORY_CAPACITY, 2 * N_STATES + 2))
print(MEMORY.shape)

# Connect to Unity VE 

In [None]:
# Connect with Unity. Sometimes Ubuntu doesn't release the port immediately and
# the connection attempt throws, so retry up to max_attempts times, one second apart.
max_attempts = 12
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        env = UnityEnvironment(file_name=None, base_port=5004)
        env.reset()
        group_name = env.get_agent_groups()[0]
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop the retry loop.
        last_error = err
        time.sleep(1)
    else:
        print('Connected...')
        break
else:
    # Every attempt failed: report it together with the cause of the last failure
    print('Connection failed...')
    print('Last error:', repr(last_error))

In [None]:
# Work with the first agent group reported by the environment and look up its specification
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)
print(group_name, group_spec, sep="\n")

# NN interacts with Unity VE to learn 

### Interact with the environment to fill the replay memory (MEMORY_CAPACITY transitions) before learning

In [None]:
# Warm start: restore previously saved weights to restart training.
# map_location=device remaps the stored tensors onto the device in use, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
eval_net.load_state_dict(torch.load('dqn_a.dat', map_location=device))
# Bring the target network in line with the restored weights; otherwise it keeps
# its unrelated random initialisation and only drifts towards them slowly.
target_net.load_state_dict(eval_net.state_dict())
print(eval_net)

In [None]:
# Mini-batch size: number of transitions sampled from the replay memory for each learning update
BATCH_SIZE = 128

In [None]:
# Smoke test: read the current observation, pass it through the evaluation
# network and print the result, to confirm that the VE and Python communicate
step_result = env.get_step_result(group_name)
sAgentNo = step_result.n_agents()
s = torch.FloatTensor(step_result.obs[0]).to(device)
action = eval_net(s)
# Index of the largest network output for every agent, kept as a column
MaxIdxOfEachAgent = action.max(dim=1, keepdim=True)[1]
ActionIdxArray = MaxIdxOfEachAgent.detach().cpu().numpy()
print('Delta X:', s[0, 0], '/ Delta Z:', s[0, 1], '/ Facing angle:', s[0, 2])
print('Agent:', sAgentNo)
print('NN output:', action)
print('Action to take:', ActionIdxArray)
print('Current reward:', step_result.reward)

In [None]:
# Fill the replay memory with MEMORY_CAPACITY transitions, collected by always
# taking the action with the largest output of eval_net.
MemoryIdx = 0
step_result = env.get_step_result(group_name)
s = torch.FloatTensor(step_result.obs[0]).to(device)
sAgentNo = step_result.n_agents()
# Bound the loop by MEMORY_CAPACITY (not a literal) so the buffer is always
# filled completely, whatever capacity was configured.
while MemoryIdx < MEMORY_CAPACITY:
    # Action selection needs no gradients
    with torch.no_grad():
        action = eval_net(s)
    MaxIdxOfEachAgent = torch.unsqueeze(torch.max(action, 1)[1], 1)
    ActionIdxArray = MaxIdxOfEachAgent.cpu().numpy()
    env.set_actions(group_name, ActionIdxArray)
    env.step()
    step_result = env.get_step_result(group_name)
    s_ = torch.FloatTensor(step_result.obs[0]).to(device)
    s_AgentNo = step_result.n_agents()
    reward = step_result.reward

    # Store the step only when the number of agents did not change between
    # the two observations (rows of s and s_ are paired by index).
    if sAgentNo == s_AgentNo:
        for agentIdx in range(sAgentNo):
            if MemoryIdx >= MEMORY_CAPACITY:
                break
            # Row layout: [state | action | reward | next state]
            MEMORY[MemoryIdx, :] = np.hstack((s[agentIdx].cpu().numpy(),
                                              ActionIdxArray[agentIdx], reward[agentIdx],
                                              s_[agentIdx].cpu().numpy()))
            MemoryIdx += 1
    s = s_
    sAgentNo = s_AgentNo
print('Done...!')

## Learn

In [None]:
# Connect with Unity, retrying up to max_attempts times (one second apart)
max_attempts = 12
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        env = UnityEnvironment(file_name=None, base_port=5004)
        env.reset()
        group_name = env.get_agent_groups()[0]
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop the retry loop.
        last_error = err
        time.sleep(1)
    else:
        print('Connected...')
        break
else:
    # Every attempt failed: report it together with the cause of the last failure
    print('Connection failed...')
    print('Last error:', repr(last_error))

In [None]:
# Training hyperparameters
GAMMA = 0.95 # Discount factor applied to the next-state value when building the Q-target
UPDATE_RATE = 10  # Run one learning update every UPDATE_RATE environment steps
EPSILON=0.3 # Probability of replacing the greedy actions with random ones (decayed at the start of every episode)
EPSILON_Min=0 # Lower bound that EPSILON is never decayed below
EPSILON_DECAY=0.90 # Factor EPSILON is multiplied by at the start of every episode
TAU = 2e-3 # Soft-update factor: share of the evaluation NN weights blended into the target NN after each update
episodes = 300 # Number of training episodes
# Reset the environment before training starts
env.reset()

In [None]:
# DQN training loop. Relies on state left by earlier cells: MEMORY must already
# be full and MemoryIdx continues from the replay-memory fill cell.
LossLst = []    # Mean loss per learning update, one entry per episode
RewardLst = []  # Mean per-step reward (averaged over agents), one entry per episode
StepLst = []    # Number of environment steps taken in each episode
print("Episode:", end = "")
for episodeIdx in range(episodes):
    if episodeIdx % 10 == 0:
        print(episodeIdx, end = ", ")

    # Decay the exploration probability, never going below EPSILON_Min
    EPSILON = max(EPSILON_Min, EPSILON*EPSILON_DECAY)

    # Start a new episode and read the initial state of the agents
    env.reset()
    step_result = env.get_step_result(group_name)
    s = torch.FloatTensor(step_result.obs[0]).to(device)
    # Read the agent count together with the state it belongs to; a value left
    # over from before the reset could mismatch the rows of s.
    sAgentNo = step_result.n_agents()

    EpochDone = False
    rewardSum = 0
    lossSum = 0
    steps = 0    # environment steps taken in this episode
    updates = 0  # learning updates performed in this episode
    while not EpochDone:
        # Greedy action per agent: index of the largest network output.
        # Action selection needs no gradients.
        with torch.no_grad():
            action = eval_net(s)
        MaxIdxOfEachAgent = torch.unsqueeze(torch.max(action, 1)[1], 1)
        ActionIdxArray = MaxIdxOfEachAgent.cpu().numpy()
        # Epsilon-greedy rule: with probability EPSILON every agent takes a
        # random action for this step (a single draw decides for all agents).
        if np.random.uniform() < EPSILON:
            for i in range(ActionIdxArray.shape[0]):
                ActionIdxArray[i] = np.random.randint(N_ACTIONS)
        env.set_actions(group_name, ActionIdxArray)
        env.step()
        steps += 1
        step_result = env.get_step_result(group_name)
        s_ = torch.FloatTensor(step_result.obs[0]).to(device)
        s_AgentNo = step_result.n_agents()
        reward = step_result.reward
        rewardSum = rewardSum + np.average(reward)
        done = step_result.done
        # The episode ends as soon as any agent reports done
        if np.any(done):
            EpochDone = True

        # Store the step only when the number of agents did not change
        if sAgentNo == s_AgentNo:
            for agentIdx in range(sAgentNo):
                # Row layout: [state | action | reward | next state]
                transition = np.hstack((s[agentIdx].cpu().numpy(), ActionIdxArray[agentIdx],
                                        reward[agentIdx], s_[agentIdx].cpu().numpy()))
                # Wrap around so the oldest transitions are overwritten first
                MEMORY[MemoryIdx % MEMORY_CAPACITY, :] = transition
                MemoryIdx += 1
        s = s_
        sAgentNo = s_AgentNo

        # Learn every UPDATE_RATE environment steps
        if steps % UPDATE_RATE == 0:
            # Sample a random mini-batch from the replay memory. Random sampling
            # keeps the network from fitting only the most recent steps.
            sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
            b_memory = MEMORY[sample_index, :]
            b_s = torch.FloatTensor(b_memory[:, :N_STATES]).to(device)
            b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int)).to(device)
            b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2]).to(device)
            b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:]).to(device)

            # Value of the action actually taken, from the evaluation network
            q_eval = eval_net(b_s).gather(1, b_a)
            # Targets come from the target network and carry no gradient
            with torch.no_grad():
                q_next = target_net(b_s_)
            # NOTE(review): the replay memory stores no done flag, so terminal
            # transitions also bootstrap from the next state -- confirm intended.
            q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)
            loss = loss_func(q_eval, q_target)
            lossSum = lossSum + loss.item()
            updates += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Soft update: move the target network a fraction TAU towards the evaluation network
            with torch.no_grad():
                for target_param, local_param in zip(target_net.parameters(), eval_net.parameters()):
                    target_param.copy_(TAU*local_param + (1.0-TAU)*target_param)

    # Record data. The loss is averaged over the updates that actually ran
    # (0.0 when the episode was too short for any update).
    LossLst.append(lossSum/updates if updates else 0.0)
    RewardLst.append(rewardSum/steps)
    StepLst.append(steps)

In [None]:
# Once the agent has learnt, close the connection to the Unity environment
env.close()

In [None]:
# Training curves, one panel each: reward, steps and loss per episode
fig = plt.figure(figsize=(18,18))
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)

axes = fig.subplots(3, 1)
panels = ((RewardLst, "Reward"), (StepLst, "Steps"), (LossLst, "Loss"))
for ax, (series, label) in zip(axes, panels):
    ax.plot(series)
    ax.grid(True)
    ax.set_ylabel(label)

plt.show()

In [None]:
# Save the evaluation network's parameters (its state_dict) to "dqn_as.dat" if needed
torch.save(eval_net.state_dict(), "dqn_as.dat")

# Test NN performance

In [None]:
# Create the testing network and load the saved parameters into it.
# map_location=device remaps the stored tensors onto the device in use, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
test_net = Net().to(device)
test_net.load_state_dict(torch.load('dqn_as.dat', map_location=device))
print(test_net)

In [None]:
# Connect with Unity, retrying up to max_attempts times (one second apart)
max_attempts = 12
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        env = UnityEnvironment(file_name=None, base_port=5004)
        env.reset()
        group_name = env.get_agent_groups()[0]
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop the retry loop.
        last_error = err
        time.sleep(1)
    else:
        print('Connected...')
        break
else:
    # Every attempt failed: report it together with the cause of the last failure
    print('Connection failed...')
    print('Last error:', repr(last_error))

In [None]:
# Run the trained network greedily (no exploration, no learning) for a fixed
# number of episodes and record reward and length of each one.
RewardLst1 = []  # Mean per-step reward (averaged over agents), one entry per episode
StepLst1 = []    # Number of environment steps in each episode
print("Episode:", end = "")
time.sleep(1)
for episodeIdx in range(15):
    env.reset()
    time.sleep(0.5)
    print(episodeIdx, end = ", ")

    # Get initial state of the agents
    step_result = env.get_step_result(group_name)
    s = torch.FloatTensor(step_result.obs[0]).to(device)

    EpochDone = False
    rewardSum = 0
    steps = 0  # environment steps taken in this episode
    while not EpochDone:
        # Inference only: no gradients are needed
        with torch.no_grad():
            action = test_net(s)
        MaxIdxOfEachAgent = torch.unsqueeze(torch.max(action, 1)[1], 1)
        ActionIdxArray = MaxIdxOfEachAgent.cpu().numpy()
        env.set_actions(group_name, ActionIdxArray)
        env.step()
        steps += 1
        step_result = env.get_step_result(group_name)
        s_ = torch.FloatTensor(step_result.obs[0]).to(device)
        reward = step_result.reward
        rewardSum = rewardSum + np.average(reward)
        done = step_result.done
        # The episode ends as soon as any agent reports done
        if np.any(done):
            EpochDone = True
        s = s_
    # Record data; steps is the exact number of environment steps taken
    RewardLst1.append(rewardSum/steps)
    StepLst1.append(steps)
env.close()

In [None]:
# Evaluation curves, one panel each: reward and steps per episode
fig = plt.figure(figsize=(18,18))
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)

axes = fig.subplots(2, 1)
panels = ((RewardLst1, "Reward"), (StepLst1, "Steps"))
for ax, (series, label) in zip(axes, panels):
    ax.plot(series)
    ax.grid(True)
    ax.set_ylabel(label)

plt.show()

In [None]:
# Print a layer-by-layer summary of the NN.
# input_size is the shape of ONE input sample without the batch dimension, i.e.
# a vector of N_STATES features (not the list of layer widths). device is passed
# explicitly so the dummy input is created where the network lives.
summary(test_net, input_size=(N_STATES,), device=device.type)