In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [8]:
# PPO hyperparameters -- read as notebook-level globals by Agent.train and the training loop.
learning_rate = 5e-4  # step size handed to the Adam optimizer
gamma = 0.98          # discount factor used in the TD target and the advantage recursion
lmbda = 0.95          # decay factor multiplied with gamma in the advantage recursion
eps_clip = 0.1        # probability ratio is clamped to [1 - eps_clip, 1 + eps_clip]
K_epoch = 3           # number of optimisation passes over each collected batch
T_horizon = 20        # maximum number of environment steps collected before each update
class Agent(nn.Module):
    """PPO actor-critic with a shared hidden layer for a discrete action space.

    ``fc1`` feeds two linear heads: ``policy`` (action logits, turned into
    probabilities by ``get_action``) and ``value`` (one scalar per state).
    Transitions are buffered with ``put_data`` and consumed by ``train()``.

    The update reads the notebook-level globals ``gamma``, ``lmbda``,
    ``eps_clip`` and ``K_epoch`` at call time.
    """

    def __init__(self, state_dim, action_dim, learning_rate):
        """Build the network and its Adam optimizer.

        Args:
            state_dim: length of one state vector (input size of ``fc1``).
            action_dim: number of discrete actions (output size of ``policy``).
            learning_rate: learning rate passed to ``optim.Adam``.
        """
        # nn.Module has to be initialised before anything is attached to self.
        super(Agent, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.memory = []

        self.fc1 = nn.Linear(self.state_dim, 256)
        self.policy = nn.Linear(256, self.action_dim)
        self.value = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def get_action(self, x, softmax_dim=0):
        """Return action probabilities for ``x``.

        ``softmax_dim`` is the dimension holding the actions: 0 for a single
        state vector, 1 for a batch of states.
        """
        hidden = F.relu(self.fc1(x))
        logits = self.policy(hidden)
        return F.softmax(logits, dim=softmax_dim)

    def get_value(self, x):
        """Return the value-head output for ``x`` (last dimension has size 1)."""
        hidden = F.relu(self.fc1(x))
        return self.value(hidden)

    def put_data(self, data):
        """Buffer one transition.

        ``data`` must be the tuple
        ``(state, action, reward, next_state, action_prob, done)``.
        """
        self.memory.append(data)

    def make_batch(self):
        """Convert the buffered transitions into tensors and clear the buffer.

        Returns:
            ``(state, action, reward, next_state, done_mask, action_prob)``.
            ``action`` is int64 (it is used as a ``gather`` index); everything
            else is float32.  ``done_mask`` is 0 where the transition was
            terminal and 1 otherwise.  All per-step scalars have shape (N, 1).
        """
        states, actions, rewards, next_states, probs, masks = [], [], [], [], [], []
        for state, action, reward, next_state, prob, done in self.memory:
            states.append(state)
            actions.append([action])
            rewards.append([reward])
            next_states.append(next_state)
            probs.append([prob])
            masks.append([0 if done else 1])
        self.memory = []

        # Explicit dtypes: otherwise integer-valued rewards would produce an
        # int64 tensor and break the float arithmetic in the update.
        return (torch.tensor(states, dtype=torch.float),
                torch.tensor(actions, dtype=torch.long),
                torch.tensor(rewards, dtype=torch.float),
                torch.tensor(next_states, dtype=torch.float),
                torch.tensor(masks, dtype=torch.float),
                torch.tensor(probs, dtype=torch.float))

    def train(self, mode=None):
        """Run a PPO update, or switch train/eval mode when ``mode`` is given.

        This method shadows ``nn.Module.train(mode)``, which ``nn.Module.eval()``
        calls as ``self.train(False)``.  To keep both usages working:

        * ``train()`` (no argument) performs ``K_epoch`` clipped-surrogate PPO
          updates on the buffered transitions and returns ``None``.  With an
          empty buffer it does nothing.
        * ``train(True)`` / ``train(False)`` behave exactly like
          ``nn.Module.train`` and return ``self``.

        The advantage recursion runs over the whole batch without resetting at
        terminal transitions, so the buffer is assumed to hold one contiguous
        trajectory segment in which only the last step may be terminal.
        """
        if mode is not None:
            return super(Agent, self).train(mode)
        if not self.memory:
            return None

        state, action, reward, next_state, done_mask, old_prob = self.make_batch()

        for _ in range(K_epoch):
            # One-step TD target; done_mask removes the bootstrap term on terminal steps.
            td_target = reward + gamma * self.get_value(next_state) * done_mask
            deltas = (td_target - self.get_value(state)).detach().reshape(-1).tolist()

            # Discounted backward sum of the TD residuals (GAE-style advantage).
            advantages = []
            running = 0.0
            for delta_t in reversed(deltas):
                running = gamma * lmbda * running + delta_t
                advantages.append([running])
            advantages.reverse()
            advantage = torch.tensor(advantages, dtype=torch.float)

            # Probability of the taken action under the current policy.
            new_prob = self.get_action(state, softmax_dim=1).gather(1, action)
            ratio = torch.exp(torch.log(new_prob) - torch.log(old_prob))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            # Clipped policy objective plus value regression towards the fixed TD target.
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.get_value(state), td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        return None
    

In [9]:
import numpy as np
import os
from Building import Building
#from Agent import Agent
import time
#====================================================================================


#====================================================================================
# --- Building configuration ---------------------------------------------
lift_num, buliding_height = 1, 5
max_people_in_floor, max_people_in_elevator = 8, 10

# --- Passenger generation -----------------------------------------------
add_people_prob = 0.8
add_people_at_step = 25

# Build the simulation from the settings above: 1 lift, height 5,
# at most 8 people per floor and 10 per elevator.
# The goal is to bring down all the people in the building to the ground floor.
building = Building(
    lift_num,
    buliding_height,
    max_people_in_floor,
    max_people_in_elevator,
)

# --- Training-loop bookkeeping ------------------------------------------
epochs, max_steps = 1000, 100
global_step = 0
T_horizon = 20
print_interval = 20
reward_list = list()

In [10]:
# Overrides the smaller default set in the setup cell: number of training episodes.
epochs = 10_000

In [14]:
# State length is the sum of the building height, the elevator capacity and
# two entries per lift; the agent chooses among 4 discrete actions.
model = Agent(
    state_dim=buliding_height + max_people_in_elevator + 2 * lift_num,
    action_dim=4,
    learning_rate=learning_rate,
)
ave_reward = 0      # running sum of episode lengths between two progress reports
print_interval = 20  # episodes between progress reports

In [15]:
#model.load_state_dict(torch.load("./model_weights/model_4000"))

In [16]:
# PPO training loop.
#
# Each episode starts from an emptied building that is refilled until
# building.target is non-zero.  The agent then acts in chunks of at most
# T_horizon steps; after every chunk the buffered transitions are used for one
# call to model.train().  An episode ends when every state entry except the
# last two sums to zero, or when the step limit is exceeded.
#
# "avg score" / reward_list track the number of steps per episode, not the
# reward, so lower is better.
checkpoint_dir = './model_weights'
os.makedirs(checkpoint_dir, exist_ok=True)  # torch.save does not create missing directories
step_limit = 300                            # an episode is cut off once global_step exceeds this

for epoch in range(epochs):
    building.empty_building()
    while building.target == 0:
        building.generate_people(add_people_prob)
    first_state = building.target
    state = building.get_state()
    done = False
    global_step = 0
    while not done:
        for t in range(T_horizon):
            global_step += 1
            # Disabled hook for adding passengers mid-episode.  The previous guard
            # `(global_step % 25 == 0) & global_step > 0` parsed as
            # `((global_step % 25 == 0) & global_step) > 0`; use this form if re-enabled:
            #if global_step % add_people_at_step == 0:
            #    building.generate_people(add_people_prob/2)

            # Only the sampled probability value is stored, so no autograd graph is needed.
            with torch.no_grad():
                action_prob = model.get_action(torch.tensor(state, dtype=torch.float))
            action = Categorical(action_prob).sample().item()
            building.perform_action([action])
            reward = building.get_reward()

            next_state = building.get_state()
            # The last two state entries are excluded from the "everything is zero" check.
            # NOTE(review): presumably they describe the lift itself -- confirm in Building.get_state.
            if sum(next_state[:-2]) == 0.0:
                reward = 100.  # completion bonus replaces the step reward
                done = True
            # Rewards are scaled by 1/10 before being stored.
            model.put_data((state, action, reward/10.0, next_state, action_prob[action].item(), done))
            state = next_state

            if done or global_step > step_limit:
                # A cut-off episode is stored with done=False (the flag is set after put_data).
                done = True
                break

        model.train()
    ave_reward += global_step
    # Report after every full window of print_interval episodes.  The previous
    # `epoch % print_interval == 0 and epoch != 0` test made the first window
    # cover print_interval + 1 episodes while still dividing by print_interval.
    if (epoch + 1) % print_interval == 0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch + 1, ave_reward/print_interval))
        ave_reward = 0
    if epoch % 100 == 0 and epoch != 0:
        torch.save(model.state_dict(), checkpoint_dir + '/model_' + str(epoch))
    reward_list.append(global_step)

# of episode :20, avg score : 261.3
# of episode :40, avg score : 281.4
# of episode :60, avg score : 293.9
# of episode :80, avg score : 288.2
# of episode :100, avg score : 268.0
# of episode :120, avg score : 251.4
# of episode :140, avg score : 229.8
# of episode :160, avg score : 161.2
# of episode :180, avg score : 119.5
# of episode :200, avg score : 121.7
# of episode :220, avg score : 111.0
# of episode :240, avg score : 119.8
# of episode :260, avg score : 103.1
# of episode :280, avg score : 101.9


KeyboardInterrupt: 

In [None]:
# Disabled variant of the training loop, kept as a bare string so none of it runs.
# Visible differences from the active loop: it passes the previously arrived people
# to building.get_reward(), drops a single state entry (`del finished[-2]`) instead
# of the last two before the termination check, and saves weights every 1000 epochs.
# NOTE(review): the guard `(global_step % 25 == 0) & global_step > 0` inside parses as
# `((global_step % 25 == 0) & global_step) > 0` -- fix before re-enabling.
'''
for epoch in range(epochs):
    building.empty_building()
    while building.target == 0 :
        building.generate_people(add_people_prob)
    state = building.get_state()
    done = False
    global_step = 0
    while not done:
        for t in range(T_horizon):
            global_step += 1
            if (global_step % 25 == 0) & global_step > 0 :
                #building.generate_people(add_people_prob/2)
                pass
            prev_people = building.get_arrived_people()
            action_prob = model.get_action(torch.from_numpy(np.array(state)).float())
            m = Categorical(action_prob)
            action = m.sample().item()
            building.perform_action([action])
            reward = building.get_reward(prev_people) 
            
            next_state = building.get_state()
            finished = next_state.copy()
            del finished[-2]
            if (sum(finished) == 0.0):
                reward = 100
                done = True
            model.put_data((state, action, reward/10., next_state, action_prob[action].item(), done))
            state = next_state

            if done or global_step > 300:
                done = True
                break
        
        model.train()
        #raise Exception()
    ave_reward += global_step 
    #print("Epoch: %d Step: %d Average Reward: %.4f"%(epoch, global_step, ave_reward/global_step))
    if epoch%print_interval==0 and epoch != 0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch, ave_reward/print_interval))
        ave_reward = 0
    if (epoch % 1000 == 0 )& (epoch != 0):
        torch.save(model.state_dict(), './model_weights/model_'+str(epoch))
    reward_list.append(global_step)
'''

In [14]:
# Manual inspection: reset the building and generate passengers with the same
# probability the training loop uses (add_people_prob) instead of a repeated literal.
building.empty_building()
building.generate_people(add_people_prob)

In [22]:
# Manual inspection: apply action index 3 (one entry in the list, matching lift_num = 1),
# then print the building as text.
# NOTE(review): the meaning of action 3 is defined in Building.perform_action -- not visible here.
building.perform_action([3])
# The recorded output shows "Step: 0"; presumably that is this argument -- verify.
building.print_building(0)

= Floor #04 =            
=  Waiting  =             
=    003    =
= Floor #03 =            
=  Waiting  =             
=    002    =
= Floor #02 =            
=  Waiting  =             
=    002    =
= Floor #01 =            
=  Waiting  =             
=    000    =
= Floor #00 =   Lift #0  
=  Arrived  =     00     
=    000    =

People to move: 10 
Total # of people: 10
Step: 0
state :  [0.0, 0.0, 0.25, 0.25, 0.375, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
now reward :  -4


In [9]:
import matplotlib.pyplot as plt


[<matplotlib.lines.Line2D at 0x1bc485ccb70>]

In [10]:
%matplotlib tk
plt.plot(reward_list)

[<matplotlib.lines.Line2D at 0x1bc4a41ae80>]

In [18]:
# Manual inspection: the same reset call the training loop issues at the start of every episode.
# NOTE(review): presumably clears all passengers -- confirm in Building.empty_building.
building.empty_building()

In [19]:
# Generate passengers with the configured probability rather than a repeated literal.
building.generate_people(add_people_prob)

In [20]:
# Print the building as text, including the state vector and the current reward.
# The recorded output shows "Step: 0"; presumably that is this argument -- verify.
building.print_building(0)

= Floor #04 =            
=  Waiting  =             
=    001    =
= Floor #03 =            
=  Waiting  =             
=    004    =
= Floor #02 =            
=  Waiting  =             
=    001    =
= Floor #01 =            
=  Waiting  =             
=    005    =
= Floor #00 =   Lift #0  
=  Arrived  =     00     
=    005    =

People to move: 11 
Total # of people: 16
Step: 0
state :  [0.625, 0.625, 0.125, 0.5, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
now reward :  -16


= Floor #04 =   Lift #0  
=  Waiting  =     00     
=    000    =
= Floor #03 =            
=  Waiting  =             
=    000    =
= Floor #02 =            
=  Waiting  =             
=    000    =
= Floor #01 =            
=  Waiting  =             
=    000    =
= Floor #00 =            
=  Arrived  =             
=    000    =

People to move: 16 
Total # of people: 16
Step: 0
state :  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0]
now reward :  1
