In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
import numpy as np
import os
from Building import Building
#from Agent import Agent
import time
#====================================================================================


#====================================================================================
#Building Setting
lift_num = 2
buliding_height = 5
max_people_in_floor = 8
max_people_in_elevator = 10

add_people_at_step = 25
add_people_prob = 0.8

#Create building with 4 elevators, height 10, max people 30 in each floor
building = Building(lift_num, buliding_height, max_people_in_floor,max_people_in_elevator)

#Agent controls each elevator
#agent = Agent(buliding_height, lift_num, 4)
#agent.reload(280)
#The goal is to bring down all the people in the building to the ground floor

epochs = 1000
max_steps = 100
global_step = 0
T_horizon = 20
reward_list = []
print_interval = 20

In [3]:
#Hyperparameters
learning_rate = 0.0001
gamma         = 0.99
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20
class Agent(nn.Module):
    def __init__(self, state_dim,elevator_num,action_dim,learning_rate):
        self.state_dim = state_dim
        self.elevator_num = elevator_num
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        self.memory = []

        self.fc1 = nn.Linear(self.state_dim,256)
        self.fc2 = nn.Linear(256,256)
        self.policy = nn.ModuleList([nn.Linear(256, self.action_dim) for _ in range(self.elevator_num)])
        #self.policy = [nn.Linear(256, self.action_dim) for x in range(elevator_num)]
        self.value = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
        
    def get_action(self,x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        xs = [layer(x) for layer in self.policy]
        x = [F.softmax(x,dim = softmax_dim) for x in xs]
        return x

    
    def get_value(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.value(x)
        return x
    
    def put_data(self,data):
        self.memory.append(data)
        
    def make_batch(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],[],[],[],[],[]
        for data in self.memory:
            state,action,reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            done_mask = 0 if done else 1
            done_list.append([done_mask])
        self.memory = []
        
        s,a,r,next_s,done_mask,prob = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),torch.tensor(reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,r,next_s,done_mask,prob
    
    def train(self):
        state,action,reward, next_state,done_mask,action_prob = self.make_batch()
        
        for i in range(K_epoch):
            td_error = reward + gamma * self.get_value(next_state) * done_mask
            delta = td_error - self.get_value(state)
            delta = delta.detach().numpy()
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            
            now_action = self.get_action(state,softmax_dim = 1)

            now_action = torch.stack(now_action)

            action_select = np.array([[x[0][idx] for x in action] for idx in range(self.elevator_num)])
            action_select = torch.from_numpy(action_select).reshape(self.elevator_num,-1,1)

            now_action = now_action.gather(2,action_select)
            
            action_prob = action_prob.reshape(-1,2,1)
            action_prob = action_prob.permute(1,0,2)
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
        

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            
            loss = - torch.min(surr1,surr2) + F.smooth_l1_loss(self.get_value(state),td_error.detach())
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
    

In [4]:
model = Agent((buliding_height)+ (max_people_in_elevator +lift_num) * lift_num,2,4,learning_rate)
print_interval = 20
ave_reward = 0 

In [5]:
epochs = 500000

In [6]:
for epoch in range(epochs):
    building.empty_building()
    while building.target == 0 :
        building.generate_people(add_people_prob)
    state = building.get_state()
    done = False
    global_step = 0
    while not done:
        for t in range(T_horizon):
            global_step += 1
            if (global_step % 25 == 0) & global_step > 0 :
                #building.generate_people(add_people_prob/2)
                pass
            action_prob = model.get_action(torch.from_numpy(np.array(state)).float())
            m = [Categorical(x) for x in action_prob]
            action = [x.sample().item() for x in m]
            building.perform_action(action)
            reward = building.get_reward() 
            
            next_state = building.get_state()
            finished = next_state.copy()
            del finished[-4:]
            if (sum(finished) == 0.0) :
                reward = 100. #* building.target
                done = True
            #print(sum(finished))
            #print('global_step : ',global_step,'state : ',state, 'action : ', action, 'reward : ',reward/float(first_state), 'done : ',done)
            #print('global_step : ',global_step,'state : ',state, 'action : ', action, 'reward : ',reward/10., 'done : ',done)
            print([x[action[idx]].item() for idx,x in enumerate (action_prob)])
            model.put_data((state, action, reward/100.0, next_state,\
                            [x[action[idx]].item() for idx,x in enumerate (action_prob)], done)) #reward / 100.0
            state = next_state
            
            if done or (global_step > 300):
                done = True
                break

        model.train()
    ave_reward += global_step 
    #print("Epoch: %d Step: %d Average Reward: %.4f"%(epoch, global_step, ave_reward/global_step))
    if epoch%print_interval==0 and epoch!=0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch, ave_reward/print_interval))
        ave_reward = 0
    if (epoch % 10000 == 0 )& (epoch != 0):
        #torch.save(model.state_dict(), './model_weights/multi_model_'+str(epoch))
        pass
    reward_list.append(global_step)

[0.24627582728862762, 0.25326475501060486]
[0.24213524162769318, 0.23320461809635162]
[0.2597779333591461, 0.25579261779785156]
[0.2517663240432739, 0.23698033392429352]
[0.2597779333591461, 0.2489701211452484]
[0.23525916039943695, 0.24893061816692352]
[0.2517663240432739, 0.23698033392429352]
[0.23527735471725464, 0.23708291351795197]
[0.2539939880371094, 0.25815433263778687]
[0.253924161195755, 0.2573404014110565]
[0.2522863447666168, 0.2583127021789551]
[0.25868380069732666, 0.23648607730865479]
[0.25807562470436096, 0.25229305028915405]
[0.2635153532028198, 0.24517987668514252]
[0.24721835553646088, 0.24753810465335846]
[0.24759717285633087, 0.24465014040470123]
[0.26330992579460144, 0.2536088228225708]
[0.229863241314888, 0.250853568315506]
[0.2591233551502228, 0.24429084360599518]
[0.26367223262786865, 0.2520831227302551]
[0.23181234300136566, 0.24994230270385742]
[0.23253586888313293, 0.25542330741882324]
[0.2599222958087921, 0.23251859843730927]
[0.2534935772418976, 0.25799185

[0.23677431046962738, 0.2544460892677307]
[0.24193398654460907, 0.22964459657669067]
[0.2417912483215332, 0.23257873952388763]
[0.2506312131881714, 0.26421234011650085]
[0.251284658908844, 0.23270848393440247]
[0.2700044810771942, 0.25246018171310425]
[0.27190855145454407, 0.2382899969816208]
[0.24288268387317657, 0.2635200619697571]
[0.26920682191848755, 0.252442330121994]
[0.24202610552310944, 0.24731378257274628]
[0.23978830873966217, 0.23504206538200378]
[0.24890568852424622, 0.26299887895584106]
[0.2422059029340744, 0.23517128825187683]
[0.24063080549240112, 0.2544127404689789]
[0.24127383530139923, 0.23504206538200378]
[0.24127383530139923, 0.23504206538200378]
[0.27003213763237, 0.23504206538200378]
[0.24890568852424622, 0.24762843549251556]
[0.23908284306526184, 0.2641361355781555]
[0.2391725480556488, 0.26411372423171997]
[0.2402455061674118, 0.2634736895561218]
[0.238753542304039, 0.25365036725997925]
[0.23941993713378906, 0.26231086254119873]
[0.2678585648536682, 0.262392759

[0.2591947615146637, 0.2405354231595993]
[0.24604099988937378, 0.24070964753627777]
[0.245768740773201, 0.23229889571666718]
[0.2673284411430359, 0.24094995856285095]
[0.26805463433265686, 0.24111106991767883]
[0.22949492931365967, 0.27232858538627625]
[0.22918711602687836, 0.24131649732589722]
[0.22883516550064087, 0.2701178789138794]
[0.25688064098358154, 0.2348385602235794]
[0.26735183596611023, 0.24166198074817657]
[0.2685093879699707, 0.2701178789138794]
[0.2292691469192505, 0.2697685658931732]
[0.24608083069324493, 0.25502490997314453]
[0.2578088939189911, 0.25502490997314453]
[0.2664104700088501, 0.25455179810523987]
[0.2664104700088501, 0.23472805321216583]
[0.2664104700088501, 0.23472805321216583]
[0.2582164704799652, 0.269260048866272]
[0.22886304557323456, 0.270098477602005]
[0.26025262475013733, 0.2355469912290573]
[0.26016950607299805, 0.2349804788827896]
[0.22919383645057678, 0.2565418481826782]
[0.26016950607299805, 0.2349804788827896]
[0.22919383645057678, 0.26944860816

KeyboardInterrupt: 