In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
import numpy as np
import os
from Building import Building
#from Agent import Agent
import time
#====================================================================================


#====================================================================================
#Building Setting
lift_num = 2
buliding_height = 5
max_people_in_floor = 8
max_people_in_elevator = 10

add_people_at_step = 25
add_people_prob = 0.8

#Create building with 4 elevators, height 10, max people 30 in each floor
env_num = 4

#Agent controls each elevator
#agent = Agent(buliding_height, lift_num, 4)
#agent.reload(280)
#The goal is to bring down all the people in the building to the ground floor

epochs = 1000
max_steps = 100
global_step = 0
T_horizon = 20
reward_list = []
print_interval = 20



In [110]:
#Hyperparameters
learning_rate = 0.0001
gamma         = 0.99
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20
class Agent(nn.Module):
    def __init__(self, state_dim,elevator_num,action_dim,learning_rate,env_num):
        self.state_dim = state_dim
        self.elevator_num = elevator_num
        self.action_dim = action_dim
        self.env_num = env_num
        super(Agent,self).__init__()
        self.memory = [[] for _ in range(self.env_num)]

        self.fc1 = nn.Linear(self.state_dim,256)
        self.fc2 = nn.Linear(256,256)
        self.policy = nn.ModuleList([nn.Linear(256, self.action_dim) for _ in range(self.elevator_num)])
        #self.policy = [nn.Linear(256, self.action_dim) for x in range(elevator_num)]
        self.value = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
        
    def get_action(self,x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        xs = [layer(x) for layer in self.policy]
        x = [F.softmax(x,dim = softmax_dim) for x in xs]
        return x

    
    def get_value(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.value(x)
        return x
    
    def put_data(self,i,data):
        self.memory[i].append(data)
        
    def make_batch(self,i):
        #print(i)
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],[],[],[],[],[]
        for data in self.memory[i]:
            state,action,reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            done_mask = 0 if done else 1
            done_list.append([done_mask])
        #print(prob_list)
        s,a,r,next_s,done_mask,prob = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),torch.tensor(reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,r,next_s,done_mask,prob
    
    def train(self):
        for i in range(K_epoch):
            stop = 0
            for env in range(self.env_num):
                state,action,reward, next_state,done_mask,action_prob = self.make_batch(env)
                if state.shape[0] == 0 :
                    stop += 1
                    continue
                td_error = reward + gamma * self.get_value(next_state) * done_mask
                delta = td_error - self.get_value(state)
                delta = delta.detach().numpy()
                advantage_list = []
                advantage = 0.0
                for delta_t in delta[::-1]:
                    advantage = gamma * lmbda * advantage + delta_t[0]
                    advantage_list.append([advantage])
                advantage_list.reverse()
                if env == stop:
                    all_advantage = torch.tensor(advantage_list,dtype = torch.float)
                    all_action = action
                    all_state = state
                    all_action_prob = action_prob
                    all_td_error = td_error
                else :
                    all_advantage = torch.cat((all_advantage,torch.tensor(advantage_list,dtype= torch.float)))
                    all_action = torch.cat((all_action,action))
                    all_state = torch.cat((all_state,state))
                    all_action_prob = torch.cat((all_action_prob,action_prob))
                    all_td_error = torch.cat((all_td_error,td_error))
            

            now_action = self.get_action(all_state,softmax_dim = 1)
            #print('now_action : ', now_action.shape)
            now_action = torch.stack(now_action)
            #print('stacked now_action.shape : ', now_action.shape)
            action_select = np.array([[x[0][idx] for x in all_action] for idx in range(self.elevator_num)])
            #print('action_select.shape : ',action_select.shape)
            action_select = torch.from_numpy(action_select).reshape(self.elevator_num,-1,1)
            #print('action_select.shape : ',action_select.shape)
            now_action = now_action.gather(2,action_select)
            #print('now_action.shape : ', now_action.shape)
            action_prob = all_action_prob.reshape(-1,2,1) ##problem
            #print('action_prob.shape : ',action_prob.shape)
            action_prob = action_prob.permute(1,0,2)
            #print(now_action)
            #print(action_prob)
            #raise Exception()
            #print('now_action.shape : ',now_action.shape)
            #print('action_prob.shape : ',action_prob.shape)
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
        

            surr1 = ratio * all_advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * all_advantage
            
            loss = - torch.min(surr1,surr2) + F.smooth_l1_loss(self.get_value(all_state),all_td_error.detach())
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        self.memory = [[] for _ in range(self.env_num)]

In [111]:
def finish_checker(x):
    return x[:-2* env_num]

In [112]:
model = Agent((buliding_height)+ (max_people_in_elevator +lift_num) * lift_num,2,4,learning_rate,env_num)
print_interval = 20
ave_reward = 0 
epochs = 500000
building_list = [Building(lift_num, buliding_height, max_people_in_floor,max_people_in_elevator) for _ in range(env_num)]

In [None]:
for epoch in range(epochs):
    [building.empty_building() for building in building_list]
    [building.generate_people(add_people_prob) for building in building_list]
    state_list = [building.get_state() for building in building_list]
    
    done_list = [False for _ in range(env_num)]
    global_step = 0
    while not all(done_list):
        for t in range(T_horizon):
            global_step += 1
            #print('global_step : ',global_step, ' done_list : ',done_list)
            if (global_step % 25 == 0) & global_step > 0 :
                #building.generate_people(add_people_prob/2)
                pass
            action_prob_list = [model.get_action(torch.from_numpy(np.array(state)).float()) for state in state_list]
            m_list = [[Categorical(x) for x in action_prob] for action_prob in action_prob_list]
            action_list = [[x.sample().item() for x in m] for m in m_list]
            [building.perform_action([action_list[idx]][0]) for idx,building in enumerate(building_list)]
            reward_list = [building.get_reward() for building in building_list]
            
            next_state_list = [building.get_state() for building in building_list]
            finished_list = [next_state.copy() for next_state in next_state_list]
            finished_list = list(map(finish_checker, finished_list))
            for i in range(env_num):
                if (sum(finished_list[i]) == 0.0) : #and (done_list[i] != True):
                    reward_list[i] = 100. #* building.target
                    done_list[i] = True
            for i in range(env_num):
                #if reward_list[i] == 0 and done_list[i] == True:
                #    continue
                #([[x[action_list[idx][in_idx]].item() for in_idx,x in enumerate(prob)] \
                #for idx,prob in enumerate(action_prob_list)])
                #print([x[action_list[i][idx]].item() for idx, x in enumerate(action_prob_list[i])])
                model.put_data(i,(state_list[i], action_list[i], reward_list[i]/100.0,\
                                next_state_list[i], \
                                  [x[action_list[i][idx]].item() for idx, x in enumerate(action_prob_list[i])]
                                  #[[x[action_list[idx][in_idx]].item() for in_idx,x in enumerate(prob)]\
                                  #for idx,prob in enumerate(action_prob_list)]\
                                  
                                  , done_list[i]))
            state_list = next_state_list
            for i in range(env_num):
                if (global_step > 300):
                    done_list[i] = True
            if all(done_list) == True:
                break
        model.train()
    ave_reward += global_step 
    #print("Epoch: %d Step: %d Average Reward: %.4f"%(epoch, global_step, ave_reward/global_step))
    if epoch%print_interval==0 and epoch!=0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch, ave_reward/print_interval))
        ave_reward = 0
    if (epoch % 10000 == 0 )& (epoch != 0):
        #torch.save(model.state_dict(), './model_weights/multi_model_'+str(epoch))
        pass
    reward_list.append(global_step)

# of episode :20, avg score : 183.2
# of episode :40, avg score : 239.7
# of episode :60, avg score : 238.3
# of episode :80, avg score : 171.7
# of episode :100, avg score : 152.0
# of episode :120, avg score : 134.1
# of episode :140, avg score : 132.3
# of episode :160, avg score : 113.3
# of episode :180, avg score : 103.8
# of episode :200, avg score : 109.0
# of episode :220, avg score : 103.2
# of episode :240, avg score : 81.2
# of episode :260, avg score : 79.5
# of episode :280, avg score : 92.3
# of episode :300, avg score : 77.5
# of episode :320, avg score : 84.8
# of episode :340, avg score : 85.2
# of episode :360, avg score : 73.0
# of episode :380, avg score : 77.2
# of episode :400, avg score : 80.5
# of episode :420, avg score : 67.2
# of episode :440, avg score : 68.2
# of episode :460, avg score : 76.8
# of episode :480, avg score : 66.2
# of episode :500, avg score : 67.5
# of episode :520, avg score : 57.9
# of episode :540, avg score : 60.8
# of episode :560, av

In [105]:
sum([0.2487, 0.2566, 0.2559, 0.2330])

0.9942

In [84]:
action_list

[[3, 1], [1, 1], [1, 1], [0, 1]]

In [93]:
test = 0

[0.25725942850112915, 0.2661406099796295]

In [None]:
[x[action[idx]].item() for idx,x in enumerate (action_prob)]

In [86]:
[[x[action_list[idx][in_idx]].item() for in_idx,x in enumerate(prob)]\
                                  for idx,prob in enumerate(action_prob_list)]

[[0.25725942850112915, 0.2661406099796295],
 [0.2514480948448181, 0.26537156105041504],
 [0.25304436683654785, 0.26812371611595154],
 [0.22482609748840332, 0.2689354717731476]]

In [None]:
a

In [90]:
[x[action_list[idx]].item() for idx,x in enumerate (action_prob_list)]

TypeError: list indices must be integers or slices, not list

In [87]:
action_prob_list[0]

[tensor([0.2307, 0.2534, 0.2586, 0.2573], grad_fn=<SoftmaxBackward>),
 tensor([0.2448, 0.2661, 0.2365, 0.2526], grad_fn=<SoftmaxBackward>)]