In [1]:
%load_ext autoreload
%autoreload 2

Deep Q-Network (DQN) for the uav-v0 MIMO Environment

In this notebook, a Deep Q-Network (DQN) agent is implemented, trained, and tested on the OpenAI Gym UAV environment (`uav-v0`).



In [2]:
import gym
import gym_uav
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import math

from tensorboardX import SummaryWriter

ModuleNotFoundError: No module named 'tensorboardX'

In [None]:
# Set gym's logger level to INFO.
log = gym.logger
gym.logger.set_level(gym.logger.INFO)

# TensorBoard writer, created with its default arguments.
writer = SummaryWriter()

# Build the environment and fix its seed.
env = gym.make('uav-v0')
env.seed(0)

# Network dimensions are read from the environment's observation/action spaces.
obs_space, act_space = env.obs_space, env.act_space
state_size = obs_space.shape[0]
action_size = act_space.n
print('State shape: ', obs_space.shape)
print('Number of actions: ', action_size)

In [None]:
from Source.dqn_agent import Agent

agent = Agent(state_size=state_size, action_size=action_size, seed=0)

# Smoke test: step the untrained agent through at most 5 environment steps.
state = env.reset()
print(state)
for step_idx in range(5):
    # agent.act returns the chosen action together with a Q-value.
    action, qval = agent.act(state)
    state, reward, done, _ = env.step(action)
    if done:
        break

# NOTE(review): the same `env` object is reset and stepped again by later cells
# after this close() call — confirm that close() leaves the env reusable.
env.close()

Train the Agent with DQN

In [None]:
def dqn_train(n_episodes=8000, eps_start=1.000, eps_end=.01, eps_decay=0.995,
              max_t=5, checkpoint_path='checkpoint.pth'):
    """Deep Q-Learning.

    Trains the notebook-level ``agent`` on the notebook-level ``env`` with an
    epsilon-greedy policy, logs per-episode statistics to the notebook-level
    TensorBoard ``writer`` and, once all episodes have run, saves the local
    Q-network weights to ``checkpoint_path``.

    Params
    ======
        n_episodes (int): number of training episodes to run
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        max_t (int): maximum number of environment steps per episode
        checkpoint_path (str): file the trained weights are written to

    Returns
    =======
        list: the total (undiscounted) reward of every episode, in order
    """
    scores = []                        # total reward of each episode
    scores_window = deque(maxlen=100)  # last 100 scores, for the running average
    eps = eps_start
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0.0
        ep_qval = []   # Q-value reported by agent.act at every step
        ep_loss = []   # losses reported by agent.step (it may return None)
        for t in range(max_t):
            action, qval = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            loss = agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            ep_qval.append(qval)
            if loss is not None:
                ep_loss.append(loss)
            if done:
                break
        scores_window.append(score)
        scores.append(score)

        avg_score = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        writer.add_scalar("train/avg_ep_qval", np.mean(ep_qval), i_episode)
        writer.add_scalar("train/epsilon", eps, i_episode)
        writer.add_scalar("train/ep_score", score, i_episode)
        writer.add_scalar("train/ep_loss", np.sum(ep_loss), i_episode)

        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        if i_episode % 100 == 0:
            # Re-print with a trailing newline so every 100th line stays visible.
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

    # There is no "solved" criterion: training always runs for n_episodes, so
    # report completion (not success) and checkpoint the final weights.
    if scores:
        print('\nTraining finished after {:d} episodes!\tAverage Score: {:.2f}'.format(len(scores), np.mean(scores_window)))
        torch.save(agent.qnetwork_local.state_dict(), checkpoint_path)
    return scores

scores = dqn_train()

# Score obtained in each training episode, plotted against the episode index.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Testing the DQN Agent

In [None]:
#Load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

test_episodes = 20
max_t = 5
gamma = 0.99

# Per-episode results; exactly one entry is appended per test episode so the
# lists line up with np.arange(test_episodes) in the plotting cells.
rate_acc_los_scores = []   # sum of learnt rates / sum of env.get_Los_Rate rates
rate_acc_exh_scores = []   # sum of learnt rates / sum of env.get_Exh_Rate rates
learnt_val_fns = []        # gamma-discounted sum of learnt rates
true_val_fns = []          # gamma-discounted sum of env.get_Exh_Rate rates
test_scores = []           # number of steps with learnt rate > env.rate_threshold
agent.qnetwork_local.eval()
for i_episode in range(1, test_episodes+1):

    learnt_rates = 0.0
    los_rates = 0.0
    exh_rates = 0.0
    state = env.reset()
    learnt_val_fn = 0.0
    true_val_fn = 0.0
    perf_score = 0
    for t in range(max_t):
        action, qval = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Scale the observations by env.high_obs and convert (state[0], state[1]),
        # used here as (radius, angle), to Cartesian coordinates. The locations
        # are only needed by the debug print below.
        state = state * env.high_obs
        next_state = next_state * env.high_obs

        curr_loc = (state[0]*np.cos(state[1]), state[0]*np.sin(state[1]))
        next_loc = (next_state[0]*np.cos(next_state[1]), next_state[0]*np.sin(next_state[1]))
        #print("Test Episode: {2}, Current Location: {0}, Next Location: {1}".format(curr_loc, next_loc, i_episode))

        # Undo the scaling before handing next_state back to the environment helpers.
        next_state = next_state / env.high_obs

        #rate measurements
        learnt_rate = env.get_Rate()
        _,los_rate = env.get_Los_Rate(next_state)
        exh_bdir,exh_rate = env.get_Exh_Rate(next_state)

        if (learnt_rate > env.rate_threshold):
            perf_score +=1
        learnt_val_fn += ((gamma**t) * (learnt_rate))
        true_val_fn += ((gamma**t) * (exh_rate))

        print("exh dir: {0}, learnt bdir: {1}".format(exh_bdir, env.BeamSet[action]))
        print("Rwd: {3}, Learnt Rate: {0}, Exh_Rate: {1}, Los_Rate: {2}".format(learnt_rate, exh_rate, los_rate, reward))

        #Move to next_state
        state = next_state

        learnt_rates += learnt_rate
        los_rates += los_rate
        exh_rates += exh_rate

        if done:
            break

    # Record the episode whether it ended through `done` or by reaching max_t;
    # recording only on `done` would silently drop truncated episodes.
    rate_acc_los = learnt_rates/ los_rates
    rate_acc_exh = learnt_rates / exh_rates
    rate_acc_los_scores.append(rate_acc_los)
    rate_acc_exh_scores.append(rate_acc_exh)
    learnt_val_fns.append(learnt_val_fn)
    true_val_fns.append(true_val_fn)
    test_scores.append(perf_score)
    writer.add_scalar("test/rate_acc_los", rate_acc_los, i_episode)
    writer.add_scalar("test/rate_acc_exh", rate_acc_exh, i_episode)
    # The stray end="" keyword previously passed to str.format (where it was
    # ignored) has been dropped; the printed text is unchanged.
    print('\rEpisode {}\tRate_acc_los Score: {:.2f}\tRate_acc_exh Score: {:.2f}\t perf_score: {:.2f}'.format(i_episode, rate_acc_los, rate_acc_exh, perf_score))
    print("\n\n")

print("Total Epsiodes {},Average Rate_Acc_los: {:.2f}, Average Rate_Acc_exh: {:.2f}\n".format(test_episodes, np.mean(rate_acc_los_scores), np.mean(rate_acc_exh_scores)))



In [None]:
# Grouped bar chart of the learnt vs. exhaustive value function per test episode.
w=0.3  # bar width; also read by the perf-score plotting cell
x_axis = np.arange(test_episodes)
fig, ax = plt.subplots(figsize=(20,20))
rects1 = ax.bar(x_axis-w, learnt_val_fns, w, color='b')
rects2 = ax.bar(x_axis, true_val_fns, w, color='g')
ax.legend((rects1[0], rects2[0]), ('rl_val_fn', 'exh_val_fn'), loc='upper right')
ax.set_ylabel('Value Function')
ax.set_xlabel('Episode #')
ax.set_xticks(x_axis)

def autolabel(rects):
    """Annotate every bar in ``rects`` with its height, formatted to two decimals.

    The text is drawn on the notebook-level ``ax``, horizontally centred on the
    bar and anchored at 1.02 times the bar height.
    """
    for bar in rects:
        height = bar.get_height()
        x_center = bar.get_x()+bar.get_width()/2.
        label = '%.2f'%np.around(height, decimals=2)
        ax.text(x_center, 1.02*height, label, ha='center', va='bottom')

# Dashed horizontal lines mark the mean of each series.
for series, colour in ((learnt_val_fns, 'b'), (true_val_fns, 'g')):
    ax.axhline(np.mean(series), color=colour, linewidth=0.75, linestyle='--')
for bars in (rects1, rects2):
    autolabel(bars)


writer.add_figure("test/hist_val_fns", fig)
plt.show()

In [None]:
# Bar chart of the per-episode performance score collected in `test_scores`.
# The bar width `w` comes from the value-function plotting cell.
x_axis = np.arange(test_episodes)
fig, ax = plt.subplots(figsize=(20,20))
rects1 = ax.bar(x_axis, test_scores, w, color='b')

ax.set_ylabel('Test Episode Score')
ax.set_xlabel('Episode #')
ax.set_xticks(x_axis)

# Dashed horizontal line at the mean score over all test episodes.
mean_test_score = np.mean(test_scores)
ax.axhline(mean_test_score, color='b', linewidth=0.75, linestyle='--')

writer.add_figure("test/ep_perf_score", fig)
plt.show()

In [None]:
#Load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

test_episodes = 20
max_t = 5
gamma = 0.99


agent.qnetwork_local.eval()

#Plotting a random Test episode

# Per-step measurements of this single episode, used for the rate plot below.
learnt_rates = []
los_rates = []
exh_rates = []
ue_loc = []   # string form of the (x, y) location reached after each step
state = env.reset()
learnt_val_fn = 0.0
true_val_fn = 0.0
for t in range(max_t):
    action, qval = agent.act(state)
    next_state, reward, done, _ = env.step(action)

    # Scale the observations by env.high_obs and convert (state[0], state[1]),
    # used here as (radius, angle), to Cartesian coordinates.
    state = state * env.high_obs
    next_state = next_state * env.high_obs

    curr_loc = (state[0]*np.cos(state[1]), state[0]*np.sin(state[1]))
    next_loc = (next_state[0]*np.cos(next_state[1]), next_state[0]*np.sin(next_state[1]))

    # Undo the scaling before handing next_state back to the environment helpers.
    next_state = next_state / env.high_obs

    #rate measurements
    learnt_rate = env.get_Rate()
    _,los_rate = env.get_Los_Rate(next_state)
    exh_bdir,exh_rate = env.get_Exh_Rate(next_state)

    learnt_val_fn += ((gamma**t) * (learnt_rate))
    true_val_fn += ((gamma**t) * (exh_rate))

    print("exh dir: {0}, learnt bdir: {1}".format(exh_bdir, env.BeamSet[action]))
    print("Rwd: {3}, Learnt Rate: {0}, Exh_Rate: {1}, Los_Rate: {2}".format(learnt_rate, exh_rate, los_rate, reward))

    learnt_rates.append(learnt_rate)
    exh_rates.append(exh_rate)
    los_rates.append(los_rate)
    ue_loc.append(str([np.around(next_loc[0], decimals=3), np.around(next_loc[1], decimals=3)]))

    # Advance to the new observation. Without this the agent would keep acting
    # on the first observation, rescaled by env.high_obs again on every step.
    state = next_state

    # Stop stepping once the environment reports the episode is over.
    if done:
        break

fig=plt.figure(figsize=(12,12))
plt.plot(ue_loc, learnt_rates,'bx-', ue_loc, los_rates, 'r', ue_loc, exh_rates, 'g.-')
plt.legend(["rl_rate", "los_rate", "exh_rate"])
plt.xlabel('UE location [x, y]')
plt.ylabel('Rate')
plt.show()


In [None]:
# Dump the weights of the first two hidden layers and of the output layer
# of the local Q-network.
q_net = agent.qnetwork_local
parameters = q_net.parameters()
for layer_idx in (0, 1):
    print('layer{}: '.format(layer_idx + 1), q_net.hidden_layers[layer_idx].weight)
print('output layer: ', q_net.output.weight)