In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [132]:
#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20
class Agent(nn.Module):
    def __init__(self, state_dim,action_dim,learning_rate):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.tau = 0.001
        super(Agent,self).__init__()
        self.memory = []

        self.fc1 = nn.Linear(self.state_dim,256)
        self.policy = nn.Linear(256, self.action_dim)
        
        self.fc2 = nn.Linear(self.state_dim,256)
        self.value = nn.Linear(256, 5)
        
        self.fc2_target = nn.Linear(self.state_dim,256)
        self.value_target = nn.Linear(256,5)
        
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
        self.fc2_target.bias = self.fc2.bias
        self.fc2_target.weight = self.fc2.weight
        
        self.value_target.bias = self.value.bias
        self.value_target.weight = self.value.weight
    def get_action(self,x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = self.policy(x)
        prob = F.softmax(x, dim = softmax_dim)
        return prob
    
    def get_value(self,x):
        x = F.relu(self.fc2(x))
        x = self.value(x)
        return x
    
    def get_target_value(self,x):
        x = F.relu(self.fc2_target(x))
        x = self.value_target(x)
        return x
    
    def put_data(self,data):
        self.memory.append(data)
    
    
        '''    
        def soft_update(self):
            print((self.fc2_target.bias * (1.0 - self.tau) + self.fc2.bias * self.tau).detach())
            self.fc2_target.bias = (self.fc2_target.bias * (1.0 - self.tau) + self.fc2.bias * self.tau).detach()

            self.fc2_target.weight = self.fc2.weight.bias * (1.0 - self.tau) + self.fc2.weight * self.tau

            self.value_target.bias = self.value.bias * (1.0 - self.tau) + self.value.bias * self.tau
            self.value_target.weight = self.value.weight * (1.0 - self.tau) + self.value.weight * self.tau
        '''
    def make_batch(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],[],[],[],[],[]
        for data in self.memory:
            state,action,reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            done_mask = 0 if done else 1
            done_list.append([done_mask])
        self.memory = []
        
        s,a,r,next_s,done_mask,prob = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),torch.tensor(reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,r,next_s,done_mask,prob
    
    def train(self):
        state,action,reward, next_state,done_mask,action_prob = self.make_batch()

        for i in range(K_epoch):
            global a
            global b
            next_get_value = self.get_value(next_state)
            now_get_value = self.get_value(state)
            td_error = reward + gamma * next_get_value.mean(-1) * done_mask
            delta = td_error - now_get_value.mean(-1)
            delta = delta.detach().numpy()
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            
            now_action = self.get_action(state,softmax_dim = 1)
            now_action = now_action.gather(1,action)
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            
            
            target_net_td_error = reward + gamma * self.get_target_value(next_state) * done_mask
            target_net_td_error =target_net_td_error.reshape(-1,5,1).repeat(1,1,5)
            normal_net = self.get_value(state)
            normal_net = normal_net.reshape(-1,1,5).repeat(1,5,1)
            
            value_loss = target_net_td_error - normal_net
            
            
            hubber_1 = torch.where(value_loss>=0,torch.tensor([0]),torch.tensor([1])).float() * torch.tensor([0.1-1,0.3-1,0.5-1,0.7-1,0.9-1])
            hubber_2 = torch.where(value_loss<0,torch.tensor([0]),torch.tensor([1])).float() * torch.tensor([0.1,0.3,0.5,0.7,0.9]).float()
            hubber = hubber_1 + hubber_2
            
            value_loss = value_loss * hubber
            
            
            value_loss = torch.sum(value_loss,dim = -1)
            value_loss = value_loss.mean()#torch.mean()
            
            loss = - torch.min(surr1,surr2).mean() + value_loss
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
    

In [133]:
import numpy as np
import os
from Building import Building
#from Agent import Agent
import time
#====================================================================================


#====================================================================================
#Building Setting
lift_num = 1
buliding_height = 5
max_people_in_floor = 8
max_people_in_elevator = 10

add_people_at_step = 25
add_people_prob = 0.8

#Create building with 4 elevators, height 10, max people 30 in each floor
building = Building(lift_num, buliding_height, max_people_in_floor,max_people_in_elevator)

#Agent controls each elevator
#agent = Agent(buliding_height, lift_num, 4)
#agent.reload(280)
#The goal is to bring down all the people in the building to the ground floor

epochs = 1000
max_steps = 100
global_step = 0
T_horizon = 20
reward_list = []
print_interval = 20

In [148]:
model = Agent((buliding_height)+ max_people_in_elevator + (lift_num *2),4,learning_rate)
print_interval = 20
ave_reward = 0

def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau
        )



In [149]:
for epoch in range(epochs):
    building.empty_building()
    while building.target == 0 :
        building.generate_people(add_people_prob)
    first_state = building.target
    state = building.get_state()
    done = False
    global_step = 0
    while not done:
        for t in range(T_horizon):
            global_step += 1
            if (global_step % 25 == 0) & global_step > 0 :
                #building.generate_people(add_people_prob/2)
                pass
            action_prob = model.get_action(torch.from_numpy(np.array(state)).float())
            m = Categorical(action_prob)
            action = m.sample().item()
            building.perform_action([action])
            reward = building.get_reward() 
            
            next_state = building.get_state()
            finished = next_state.copy()
            del finished[-2:]
            if (sum(finished) == 0.0) :
                reward = 100. #* building.target
                done = True
            #print(sum(finished))
            #print('global_step : ',global_step,'state : ',state, 'action : ', action, 'reward : ',reward/float(first_state), 'done : ',done)
            #print('global_step : ',global_step,'state : ',state, 'action : ', action, 'reward : ',reward/10., 'done : ',done)
            #model.put_data((state, action, reward/float(first_state), next_state, action_prob[action].item(), done))
            model.put_data((state, action, reward/100.0, next_state, action_prob[action].item(), done))
            state = next_state
            
            if done or (global_step > 300):
                done = True
                break

        model.train()
        soft_update(model.fc2,model.fc2_target,0.001)
        soft_update(model.value,model.value_target,0.001)
    ave_reward += global_step 
    #print("Epoch: %d Step: %d Average Reward: %.4f"%(epoch, global_step, ave_reward/global_step))
    if epoch%print_interval==0 and epoch!=0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch, ave_reward/print_interval))
        ave_reward = 0
    if (epoch % 100 == 0 )& (epoch != 0):
        torch.save(model.state_dict(), './model_weights/model_'+str(epoch))
    reward_list.append(global_step)

# of episode :20, avg score : 191.2
# of episode :40, avg score : 243.2
# of episode :60, avg score : 169.1
# of episode :80, avg score : 207.2
# of episode :100, avg score : 214.7
# of episode :120, avg score : 126.0
# of episode :140, avg score : 143.0
# of episode :160, avg score : 87.7
# of episode :180, avg score : 92.0
# of episode :200, avg score : 69.7
# of episode :220, avg score : 78.7
# of episode :240, avg score : 101.2
# of episode :260, avg score : 129.4
# of episode :280, avg score : 85.0
# of episode :300, avg score : 138.5
# of episode :320, avg score : 76.9
# of episode :340, avg score : 117.7
# of episode :360, avg score : 77.0
# of episode :380, avg score : 96.2
# of episode :400, avg score : 80.2
# of episode :420, avg score : 80.2
# of episode :440, avg score : 68.9
# of episode :460, avg score : 78.3
# of episode :480, avg score : 82.2
# of episode :500, avg score : 72.5
# of episode :520, avg score : 98.2
# of episode :540, avg score : 93.8
# of episode :560, av

KeyboardInterrupt: 

In [144]:
model

Agent(
  (fc1): Linear(in_features=17, out_features=256, bias=True)
  (policy): Linear(in_features=256, out_features=4, bias=True)
  (fc2): Linear(in_features=17, out_features=256, bias=True)
  (value): Linear(in_features=256, out_features=5, bias=True)
  (fc2_target): Linear(in_features=17, out_features=256, bias=True)
  (value_target): Linear(in_features=256, out_features=5, bias=True)
)

In [136]:
torch.tensor([1,2,3]).repeat(2,1)

tensor([[1, 2, 3],
        [1, 2, 3]])

In [57]:
a.shape

torch.Size([20, 5])

In [79]:
b.reshape(-1,5,1).repeat(1,1,5)[0]

tensor([[-0.0026, -0.0026, -0.0026, -0.0026, -0.0026],
        [ 0.0585,  0.0585,  0.0585,  0.0585,  0.0585],
        [ 0.0302,  0.0302,  0.0302,  0.0302,  0.0302],
        [-0.0322, -0.0322, -0.0322, -0.0322, -0.0322],
        [ 0.0053,  0.0053,  0.0053,  0.0053,  0.0053]],
       grad_fn=<SelectBackward>)

In [85]:
torch.sum(b.reshape(-1,5,1).repeat(1,1,5)[0],dim=-1)

tensor([-0.0132,  0.2926,  0.1511, -0.1609,  0.0267], grad_fn=<SumBackward2>)

In [84]:
torch.mean(torch.sum(b.reshape(-1,5,1).repeat(1,1,5)[0],dim=-1))

tensor(0.0592, grad_fn=<MeanBackward1>)

In [105]:
test = b.reshape(-1,5,1).repeat(1,1,5)[0] + a.reshape(-1,1,5).repeat(1,5,1)[0]

In [113]:
test_1 = torch.where(test>=0,torch.tensor([0]),torch.tensor([1])).float() * torch.tensor([0.1-1,0.3-1,0.5-1,0.7-1,0.9-1])

In [117]:
test_1 + test_2

tensor([[-0.9000,  0.3000,  0.5000, -0.3000,  0.9000],
        [ 0.1000,  0.3000,  0.5000,  0.7000,  0.9000],
        [ 0.1000,  0.3000,  0.5000, -0.3000,  0.9000],
        [-0.9000,  0.3000, -0.5000, -0.3000, -0.1000],
        [ 0.1000,  0.3000,  0.5000, -0.3000,  0.9000]])

In [114]:

test_2 = torch.where(test<0,torch.tensor([0]),torch.tensor([1])).float() * torch.tensor([0.1,0.3,0.5,0.7,0.9]).float()

In [122]:
test_1+test_2

tensor([[-0.9000,  0.3000,  0.5000, -0.3000,  0.9000],
        [ 0.1000,  0.3000,  0.5000,  0.7000,  0.9000],
        [ 0.1000,  0.3000,  0.5000, -0.3000,  0.9000],
        [-0.9000,  0.3000, -0.5000, -0.3000, -0.1000],
        [ 0.1000,  0.3000,  0.5000, -0.3000,  0.9000]])

In [120]:
test_1

tensor([[-0.9000, -0.0000, -0.0000, -0.3000, -0.0000],
        [-0.0000, -0.0000, -0.0000, -0.0000, -0.0000],
        [-0.0000, -0.0000, -0.0000, -0.3000, -0.0000],
        [-0.9000, -0.0000, -0.5000, -0.3000, -0.1000],
        [-0.0000, -0.0000, -0.0000, -0.3000, -0.0000]])

In [119]:
test_2

tensor([[0.0000, 0.3000, 0.5000, 0.0000, 0.9000],
        [0.1000, 0.3000, 0.5000, 0.7000, 0.9000],
        [0.1000, 0.3000, 0.5000, 0.0000, 0.9000],
        [0.0000, 0.3000, 0.0000, 0.0000, 0.0000],
        [0.1000, 0.3000, 0.5000, 0.0000, 0.9000]])

In [70]:
b.reshape(-1,5,1).repeat(1,1,5)[0] * torch.tensor([1,2,3,4,5]).float()

tensor([[-0.0026, -0.0053, -0.0079, -0.0106, -0.0132],
        [ 0.0585,  0.1170,  0.1755,  0.2341,  0.2926],
        [ 0.0302,  0.0605,  0.0907,  0.1209,  0.1511],
        [-0.0322, -0.0644, -0.0965, -0.1287, -0.1609],
        [ 0.0053,  0.0107,  0.0160,  0.0213,  0.0267]], grad_fn=<MulBackward0>)

In [103]:
a.reshape(-1,1,5).repeat(1,5,1)[0]

tensor([[-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [-0.0026,  0.0585,  0.0302, -0.0322,  0.0053]],
       grad_fn=<SelectBackward>)

In [51]:
a.repeat()

tensor([[[-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0445, -0.0872, -0.0112, -0.0145,  0.0065],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
         [ 0.0445, -0.0872, -0.0112, -0.0145,  0.0065],
         [ 0.0584, -0.0220,  0.0260, -0.0273, -0.0270],
         [ 0.0584, -0.0220,  0.0260, -0.0273, -0.0270],
         [ 0.0473,  0.0051,  0.0144, -0.0305, -0

tensor([[-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0445, -0.0872, -0.0112, -0.0145,  0.0065],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010]],
       grad_fn=<IndexBackward>)

In [44]:
b

tensor([[-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [-0.0026,  0.0585,  0.0302, -0.0322,  0.0053],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0445, -0.0872, -0.0112, -0.0145,  0.0065],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0435, -0.0604, -0.0112, -0.0123, -0.0010],
        [ 0.0445, -0.0872, -0.0112, -0.0145,  0.0065],
        [ 0.0584, -0.0220,  0.0260, -0.0273, -0.0270],
        [ 0.0584, -0.0220,  0.0260, -0.0273, -0.0270],
        [ 