In [None]:
## Intelligent Trajectory Design in UAV-Aided Communications With Reinforcement Learning

In [None]:
import collections
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from google.colab import drive
# Google Drive has to be mounted (account linked) before anything under
# /content/drive can be read or written.
drive.mount('/content/drive')

# Checkpoint files: policy (actor) weights and value (critic) weights.
save_model_path = "/content/drive/MyDrive/Colab Notebooks/results/uav.pt"
save_model_path2 = "/content/drive/MyDrive/Colab Notebooks/results/uav_v_net.pt"

In [None]:
# Hyperparameters read by the replay buffer and by both Adam optimizers.
MINI_batch = 64        # number of transitions drawn per gradient update
buffer_size = 100_000  # capacity of the replay buffer (deque maxlen)
lr = 5e-4              # learning rate for the policy and value optimizers

# Preferred torch device string; not referenced by the other visible cells.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
import random as rd
class env_uav():
  """Single-UAV trajectory environment with K ground users.

  The trajectory is kept in ``self.q`` as a list of ``[x, y, z]`` positions,
  one entry per time step, starting at ``[20., 20., 200.]``. The agent's state
  is a list of the three most recent changes of the scaled sum rate
  (``state_rate``); the reward is the unscaled sum rate (``cal_rate``).
  """

  def __init__(self):
    self.q = [[20.,20.,200.]]  # trajectory; q[t] is the UAV position at step t
    self.V = 6                 # scale applied to the normalised speed action
    self.L = 3                 # number of random warm-up moves (= state length)
    self.K = 5                 # number of ground users
    self.T = 60                # step counter value at which the episode is done
    self.N = 60                # not used inside this class
    self.user = []             # user positions, filled by user_set()
    self.done = False
    self.t = self.L            # step counter; starts after the warm-up moves
    self.state = []            # not used inside this class

  def user_set(self):
    """Append K users at random integer x, y in [150, 180] (z = 0) and return the list."""
    for _ in range(self.K):
      self.user.append([rd.randint(150,180),rd.randint(150,180),0])
    return self.user

  def cal_rate(self,t):
    """Return the sum over users of log2(1 + 1/d), d = distance from q[t] to the user.

    This value is used as the reward.
    """
    x, y, z = self.q[t]
    rate = 0
    for k in range(self.K):
      ux, uy, uz = self.user[k]
      dist = np.sqrt(np.power(x-ux,2)+np.power(y-uy,2)+np.power(z-uz,2))
      rate += np.log2(1+1/dist)
    return rate

  def state_rate(self,t):
    """Return ``cal_rate(t)`` scaled by 1000; used to build the state deltas."""
    return self.cal_rate(t)*1000

  def q_location(self,a,t):
    """Append the position reached from q[t] by applying action ``a``.

    Args:
      a: iterable of three tensors (speed, polar, azimuth), each scaled here
        by ``V``, 180 degrees and 360 degrees respectively.
      t: index of the position the move starts from. The new position is
        appended, so it lands at index t+1 only when q[t] is the last entry.
    """
    v , seta , low = a
    v = v.detach().numpy()*self.V
    seta = seta.detach().numpy()*180
    low = low.detach().numpy()*360

    sin_polar = np.sin(seta*np.pi/180)
    cos_polar = np.cos(seta*np.pi/180)
    sin_azim = np.sin(low*np.pi/180)
    cos_azim = np.cos(low*np.pi/180)

    # Spherical-to-Cartesian displacement of length v.
    self.q.append([self.q[t][0] + sin_polar*cos_azim*v,
                   self.q[t][1] + sin_azim*sin_polar*v,
                   self.q[t][2] + cos_polar*v])

  def init_state(self):
    """Take L uniformly random moves and return the L resulting scaled-rate deltas."""
    s_0 = self.state_rate(0)  # scaled rate at the start position q[0]
    state = []
    for i in range(self.L):
      init_action = torch.rand(3)  # random (speed, polar, azimuth) in [0, 1)
      self.q_location(init_action,i)
      s_1 = self.state_rate(i+1)
      state.append(s_1 - s_0)
      s_0 = s_1
    return state

  def reset(self):
    """Start a new episode: fresh users, start position, step counter and done flag.

    Returns:
      The newly generated list of user positions.
    """
    self.user = []
    self.q = [[20.,20.,200.]]
    # Without restoring the counter, every episode after the first would
    # report done=True from its very first step.
    self.t = self.L
    self.done = False
    return self.user_set()

  def current_uav(self):
    """Return the trajectory list itself (not a copy)."""
    return self.q

  def step(self,a,s,t):
    """Apply action ``a`` at step ``t`` and return ``(next_state, reward, done)``.

    Args:
      a: action tensor, see ``q_location``.
      s: current state, a length-3 sequence of scaled-rate deltas.
      t: index of the current position; q[t] must be the last trajectory entry.

    Returns:
      next_state: ``[s[1], s[2], new_delta]`` (oldest delta dropped).
      reward: ``cal_rate(t)``, i.e. evaluated at q[t], before this move.
      done: True once the internal step counter has reached ``T``.
    """
    _, delta_2, delta_3 = s

    s_0 = self.state_rate(t)    # scaled rate before the move
    self.q_location(a,t)        # appends the position for t+1
    s_1 = self.state_rate(t+1)  # scaled rate after the move
    delta_1 = s_1 - s_0

    state = [delta_2, delta_3, delta_1]
    r = self.cal_rate(t)

    self.t += 1
    if self.t >= self.T:
      self.done = True
    return state, r, self.done

  def location(self,init_s,policy,q_lo):
    """Replay an episode with noise-free ``policy`` actions and return the summed reward.

    Args:
      init_s: state returned by ``init_state`` for this episode.
      policy: callable mapping a float state tensor to an action tensor.
      q_lo: trajectory whose first L+1 entries are the start position and the
        warm-up moves. It may be the (already longer) list returned by
        ``current_uav``; only the first L+1 entries are used and the list
        itself is left untouched.

    After the call ``self.q`` holds the replayed, noise-free trajectory.
    """
    s = init_s
    # Restart from the warm-up positions instead of aliasing q_lo, so each
    # step appends right after the position it starts from.
    self.q = [list(p) for p in q_lo[:self.L+1]]

    q_rate = 0
    for t in range(self.L, self.T):
      a = policy(torch.tensor(s).float())
      s, r, _ = self.step(a,s,t)
      q_rate += r

    return q_rate

In [None]:
class VN(nn.Module):
  """Value network: 6 inputs, five 256-unit ReLU layers, one linear output.

  Owns its own Adam optimizer, built with the module-level ``lr``.
  """

  def __init__(self):
    super().__init__()
    # Layers are registered as fc_1 ... fc_6, in this order.
    widths = [6, 256, 256, 256, 256, 256, 1]
    for idx in range(1, len(widths)):
      setattr(self, "fc_{}".format(idx), nn.Linear(widths[idx-1], widths[idx]))

    self.optimizer = optim.Adam(self.parameters(),lr = lr)

  def forward(self,x):
    """Return the network output for ``x`` (last dimension of size 6)."""
    hidden = x
    for layer in (self.fc_1, self.fc_2, self.fc_3, self.fc_4, self.fc_5):
      hidden = F.relu(layer(hidden))
    return self.fc_6(hidden)

class PN(nn.Module):
  """Policy network with its replay buffer and the actor/critic update step.

  Maps a 3-element state to a 3-element action squashed into (0, 1) by a
  sigmoid. The buffer capacity, learning rate and batch size come from the
  module-level ``buffer_size``, ``lr`` and ``MINI_batch``.
  """

  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(3,256)
    self.fc2 = nn.Linear(256,256)
    self.fc3 = nn.Linear(256,256)
    # fc4 and fc5 are created (so they are part of the state dict) but are
    # not used by forward().
    self.fc4 = nn.Linear(256,256)
    self.fc5 = nn.Linear(256,256)
    self.fc6 = nn.Linear(256,3)

    self.buffer = collections.deque(maxlen = buffer_size)
    self.optimizer = optim.Adam(self.parameters(),lr = lr)

  def forward(self,x):
    """Return the action for state ``x``; every component lies in (0, 1)."""
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    return torch.sigmoid(self.fc6(x))

  def put_data(self,transition):
    """Store one ``(s, a, r, s_prime, done)`` transition in the replay buffer."""
    self.buffer.append(transition)

  def sampling(self,batch_size):
    """Draw ``batch_size`` transitions at random, without replacement.

    Returns:
      Float tensors ``(s, a, r, s_prime, done)``. ``r`` and ``done`` have
      shape (batch_size, 1); the others keep one row per transition.
    """
    mini_batch = rd.sample(self.buffer,batch_size)
    s, a, r, s_prime, done = zip(*mini_batch)

    def rows(values):
      # Going through one ndarray avoids the slow list-of-ndarrays path of
      # torch.tensor.
      return torch.tensor(np.array(values),dtype=torch.float)

    def column(values):
      return rows(values).reshape(-1,1)

    return rows(s), rows(a), column(r), rows(s_prime), column(done)

  def size(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

  def train(self,V_net=True,V_target=None,P_target=None):
    """Run 10 critic/actor updates, or switch the module's train/eval mode.

    This method shadows ``nn.Module.train(mode)``, which ``eval()`` and parent
    modules call with a single boolean. To keep those working, a boolean
    first argument is forwarded to ``nn.Module.train``.

    Args:
      V_net: value network being trained (must expose ``.optimizer``), or a
        bool to set the training mode.
      V_target: target value network used for the bootstrap term.
      P_target: target policy network used to pick the next action.

    Returns:
      ``(V_loss, P_loss)`` of the last update as Python floats, or ``self``
      when called with a boolean.

    Raises:
      TypeError: if a network is passed without both target networks.
    """
    if isinstance(V_net, bool):
      return super().train(V_net)
    if V_target is None or P_target is None:
      raise TypeError("train(V_net, V_target, P_target) requires both target networks")

    for _ in range(10):
      s, a, r, s_prime, done = self.sampling(MINI_batch)

      # Targets are constants for the critic loss, so no graph is needed.
      with torch.no_grad():
        a_prime = P_target(s_prime)
        value_s_prime = V_target(torch.hstack((s_prime,a_prime)))
        # The stored `done` value multiplies the bootstrap term, so it acts
        # as a continuation mask; no discount factor is applied.
        td_target = r + value_s_prime*done

      value_s = V_net(torch.hstack((s,a)))
      V_loss = F.smooth_l1_loss(value_s,td_target)
      V_net.optimizer.zero_grad()
      V_loss.backward()
      V_net.optimizer.step()

      # Actor update: raise the critic's value of the policy's own action.
      a_P = self.forward(s)
      P_loss = -torch.mean(V_net(torch.hstack((s,a_P))))
      self.optimizer.zero_grad()
      P_loss.backward()
      self.optimizer.step()

    return V_loss.item(), P_loss.item()

In [None]:
# Build the environment and the online/target networks. Training resumes
# from the saved checkpoints when they exist; otherwise it starts from the
# freshly initialised weights, so a first run does not fail.
env = env_uav()
policy = PN()
if os.path.exists(save_model_path):
  policy.load_state_dict(torch.load(save_model_path))
policy_target = PN()

value = VN()
if os.path.exists(save_model_path2):
  value.load_state_dict(torch.load(save_model_path2))
value_target = VN()

policy_target.load_state_dict(policy.state_dict())
value_target.load_state_dict(value.state_dict())


M = 50000      # maximum number of episodes
avgv_loss =0
avgp_loss =0
sum_rate =0
avg_t = 100    # episodes between target-network syncs / log lines
V_loss = 0
P_loss = 0
temp = []      # noise-free episode returns since the last log line
action_list = []

for m in range(M):
  # New users, start position and done flag.
  user = env.reset()
  action_list = []
  # Steps 1..L are random warm-up moves that produce the initial state.
  s = env.init_state() #(3,)
  init_s = s
  q_lo = env.current_uav()
  done = False

  for t in range(env.L, env.T):
    action = policy(torch.tensor(s).float())
    # Copy, so the stored noise-free action does not share memory with the
    # tensor that is perturbed below.
    action_list.append(action.detach().numpy().copy())
    noise = torch.cat((torch.randn(1)/6, torch.rand(1)/180, torch.rand(1)/360))
    a = action.detach() + noise
    s_prime , reward, _ = env.step(a,s,t)

    # done_num multiplies the bootstrap value in PN.train: 1.0 keeps it,
    # 0.0 drops it on the last step of the episode.
    done_num = 1.0
    if t == env.T - 1:
      done_num = 0.0
      done = True

    policy.put_data((s,a.numpy(),reward, s_prime,done_num)) # executed (noisy) action
    s = s_prime

    if done:
      break

  if policy.size() > 10000:
    V_loss,P_loss = policy.train(value,value_target,policy_target)

  avgv_loss += V_loss
  avgp_loss += P_loss
  # Score the current policy without exploration noise.
  epi_rate = env.location(init_s,policy,q_lo)

  temp.append(epi_rate)
  sum_rate = np.sum(temp)
  temp_rate = np.max(temp)

  if temp_rate > 100.0 : # stop and checkpoint once a noise-free return exceeds the target
    torch.save(policy.state_dict(),save_model_path)
    torch.save(value.state_dict(),save_model_path2)
    print("epi_rate",temp_rate)
    break

  if m%avg_t == 0 and m!=0 :
    policy_target.load_state_dict(policy.state_dict())
    value_target.load_state_dict(value.state_dict())
    print("episode : {}, buffer_size : {}, v_loss : {}, p_loss : {}, avg_sum_rate : {} , epi_rate = {}"\
          .format(m,policy.size(),avgv_loss/avg_t,avgp_loss/avg_t,sum_rate/avg_t,temp_rate))
    avgv_loss = 0
    avgp_loss = 0
    sum_rate = 0
    temp = []


In [None]:
# Trajectory currently held by the environment, as an array with one row per position.
location = np.array(env.current_uav())
print(location)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=[10,9])
ax = fig.add_subplot(1,1,1, projection='3d')

loc = np.array(location)
user = np.array(user)
print(loc.shape)
print(user.shape)
# The 4th positional argument of the 3D scatter is `zdir`, not a format
# string, so colour and marker have to be passed by keyword.
ax.scatter(loc[:,0],loc[:,1],loc[:,2],c='r',marker='o',label='UAV trajectory')
ax.scatter(user[:,0],user[:,1],user[:,2],c='b',marker='*',label='users')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.legend()
plt.show()

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


In [None]:
# Instantiate the policy class, then load the trained parameters into it.
model = PN()
trained_weights = torch.load(save_model_path)
model.load_state_dict(trained_weights)

In [None]:
# Fresh environment: new users, start position and done flag.
env = env_uav()
user = env.reset()
# Steps 1..L are random warm-up moves that produce the initial state.
s = env.init_state()

# Roll the trained policy out without exploration noise. No gradients are
# needed at evaluation time.
with torch.no_grad():
  for t in range(env.L, env.T):
    action = model(torch.tensor(s).float())
    s_prime , reward, _ = env.step(action,s,t)
    s = s_prime

# Sum rate over the first T positions of the trajectory.
sum_rate = sum(env.cal_rate(i) for i in range(env.T))

location = np.array(env.current_uav())


In [None]:
# Sum rate accumulated over the evaluated trajectory.
print(sum_rate)

In [None]:
# User positions and the UAV trajectory from the evaluation rollout.
print(user)
print(location)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=[10,9])
ax = fig.add_subplot(1,1,1, projection='3d')

loc = np.array(location)
user = np.array(user)

# The 4th positional argument of the 3D scatter is `zdir`, not a format
# string, so colour and marker have to be passed by keyword.
ax.scatter(loc[:,0],loc[:,1],loc[:,2],c='r',marker='o',label='UAV trajectory')
ax.scatter(user[:,0],user[:,1],user[:,2],c='b',marker='*',label='users')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.legend()
plt.show()