In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [12]:
#Hyperparameters
# Module-level PPO settings. Agent.train() reads gamma, lmbda, eps_clip and
# K_epoch as globals at call time; the rollout loop reads T_horizon.
# Step size handed to the Adam optimizer when the Agent is constructed.
learning_rate = 0.0001
# Discount factor applied to the bootstrapped next-state value.
gamma         = 0.98
# Decay used together with gamma in the backward advantage recursion.
lmbda         = 0.95
# Probability ratios are clamped to [1 - eps_clip, 1 + eps_clip].
eps_clip      = 0.1
# Number of optimisation passes performed over each collected batch.
K_epoch       = 3
# Maximum number of environment steps collected between two train() calls.
T_horizon     = 20
class Agent(nn.Module):
    """PPO actor-critic agent with a three-branch state encoder.

    The flat state vector is cut along its last dimension into three
    consecutive groups (``floor_dim``, ``elevator_dim`` and ``extra_dim``
    features, in that order). Each group goes through its own small MLP, the
    results are concatenated and fed to a shared hidden layer, and that hidden
    layer feeds both the policy head and the value head.

    The PPO update reads the module-level hyperparameters ``gamma``,
    ``lmbda``, ``eps_clip`` and ``K_epoch`` when it runs.

    Args:
        state_dim: Length of the flat state vector (stored for reference; the
            encoder only reads the first
            ``floor_dim + elevator_dim + extra_dim`` features).
        action_dim: Number of discrete actions produced by the policy head.
        learning_rate: Step size for the Adam optimizer.
        floor_dim: Number of leading features routed to the first branch.
        elevator_dim: Number of features routed to the second branch.
        extra_dim: Number of features routed to the third branch.
    """

    def __init__(self, state_dim, action_dim, learning_rate,
                 floor_dim=5, elevator_dim=10, extra_dim=2):
        super(Agent, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.floor_dim = floor_dim
        self.elevator_dim = elevator_dim
        self.extra_dim = extra_dim
        # Transitions collected since the last update; emptied by make_batch().
        self.memory = []

        # Layer names and creation order are kept stable so that previously
        # saved state_dicts still load and seeded initialisation is unchanged.
        self.layer_1_1 = nn.Linear(floor_dim, 32)
        self.layer_1_2 = nn.Linear(32, 32)

        self.layer_2_1 = nn.Linear(elevator_dim, 32)
        self.layer_2_2 = nn.Linear(32, 32)

        self.layer_1 = nn.Linear(extra_dim, 8)

        self.result_layer = nn.Linear(32 + 32 + 8, 256)
        self.result_layer_2 = nn.Linear(256, self.action_dim)
        self.value = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def _features(self, x):
        """Encode a state (or batch of states) into the shared hidden vector.

        Slicing and concatenation happen along the last dimension, so the
        same code serves a single 1-D state and a 2-D batch of states.
        """
        floor_state = x.narrow(-1, 0, self.floor_dim)
        elevator_state = x.narrow(-1, self.floor_dim, self.elevator_dim)
        elevator_additional_state = x.narrow(
            -1, self.floor_dim + self.elevator_dim, self.extra_dim)

        x_1 = F.relu(self.layer_1_1(floor_state))
        x_1 = F.relu(self.layer_1_2(x_1))

        x_2 = F.relu(self.layer_2_1(elevator_state))
        x_2 = F.relu(self.layer_2_2(x_2))

        x_3 = F.relu(self.layer_1(elevator_additional_state))

        merged = torch.cat((x_1, x_2, x_3), -1)
        return F.relu(self.result_layer(merged))

    def get_action(self, x, softmax_dim=0):
        """Return action probabilities for ``x``.

        Args:
            x: A single state (1-D tensor) or a batch of states (2-D tensor).
            softmax_dim: Dimension the softmax is taken over; use 0 for a
                single state and 1 for a batch.
        """
        logits = self.result_layer_2(self._features(x))
        return F.softmax(logits, dim=softmax_dim)

    def get_value(self, x):
        """Return the value estimate for a state or a batch of states."""
        return self.value(self._features(x))

    def put_data(self, data):
        """Store one transition.

        ``data`` is the tuple
        ``(state, action, reward, next_state, action_prob, done)``.
        """
        self.memory.append(data)

    def make_batch(self):
        """Turn the stored transitions into tensors and clear the buffer.

        Returns:
            ``(s, a, r, next_s, done_mask, prob)``. Every per-step scalar is
            wrapped so the tensors have one row per transition and one
            column; ``done_mask`` is 0.0 where the transition was terminal and
            1.0 otherwise.
        """
        state_list, action_list, reward_list = [], [], []
        next_state_list, prob_list, done_list = [], [], []
        for state, action, reward, next_state, prob, done in self.memory:
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            next_state_list.append(next_state)
            prob_list.append([prob])
            done_list.append([0 if done else 1])
        self.memory = []

        s = torch.tensor(state_list, dtype=torch.float)
        a = torch.tensor(action_list, dtype=torch.long)
        r = torch.tensor(reward_list, dtype=torch.float)
        next_s = torch.tensor(next_state_list, dtype=torch.float)
        done_mask = torch.tensor(done_list, dtype=torch.float)
        prob = torch.tensor(prob_list, dtype=torch.float)
        return s, a, r, next_s, done_mask, prob

    @staticmethod
    def _advantages(delta, done_mask):
        """Accumulate TD residuals backwards into GAE advantages.

        The running sum is multiplied by the step's done mask so that an
        advantage never leaks across a terminal transition.
        """
        running = 0.0
        collected = []
        steps = zip(reversed(delta[:, 0].tolist()),
                    reversed(done_mask[:, 0].tolist()))
        for delta_t, mask_t in steps:
            running = delta_t + gamma * lmbda * mask_t * running
            collected.append([running])
        collected.reverse()
        return torch.tensor(collected, dtype=torch.float)

    def update(self):
        """Run ``K_epoch`` clipped-PPO passes over the stored transitions.

        Does nothing when no transitions have been stored.
        """
        if not self.memory:
            return
        state, action, reward, next_state, done_mask, action_prob = self.make_batch()

        for _ in range(K_epoch):
            values = self.get_value(state)
            with torch.no_grad():
                # Bootstrapped one-step target; the mask zeroes the bootstrap
                # term on terminal transitions.
                td_target = reward + gamma * self.get_value(next_state) * done_mask
            delta = td_target - values.detach()
            advantage = self._advantages(delta, done_mask)

            now_action = self.get_action(state, softmax_dim=1).gather(1, action)
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            # The scalar value loss is broadcast onto the per-sample policy
            # loss before averaging.
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(values, td_target)

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

    def train(self, mode=None):
        """Run a PPO update, or switch training mode when ``mode`` is given.

        Called without arguments this performs the PPO update, which is how
        existing callers use it. ``nn.Module.eval()`` calls
        ``self.train(False)``, so an explicit ``mode`` is forwarded to
        ``nn.Module.train`` to keep ``eval()`` and ``train(True)`` working.
        """
        if mode is None:
            self.update()
            return None
        return super(Agent, self).train(mode)

In [13]:
import numpy as np
import os
from Building import Building
#from Agent import Agent
import time
#====================================================================================


#====================================================================================
#Building Setting
# Dimensions handed to the Building constructor below. Together they also
# determine the state length passed to Agent:
# buliding_height + max_people_in_elevator + lift_num * 2.
lift_num = 1
buliding_height = 5
max_people_in_floor = 8
max_people_in_elevator = 10

# add_people_prob is the argument given to building.generate_people() at the
# start of each episode. add_people_at_step is not read by the visible
# training loop, which hard-codes 25 in a branch that does nothing.
add_people_at_step = 25
add_people_prob = 0.8

# Create the building from the settings above
# (lift_num=1, buliding_height=5, max_people_in_floor=8, max_people_in_elevator=10).
building = Building(lift_num, buliding_height, max_people_in_floor,max_people_in_elevator)

# Earlier agent setup, kept for reference; the Agent used for training is
# constructed in a later cell.
#agent = Agent(buliding_height, lift_num, 4)
#agent.reload(280)
#The goal is to bring down all the people in the building to the ground floor

# Run settings. epochs is overridden in a later cell; max_steps is not read by
# the visible training loop; T_horizon repeats the value from the
# hyperparameter cell.
epochs = 1000
max_steps = 100
global_step = 0
T_horizon = 20
# Collects the number of steps each episode took.
reward_list = []
# Number of episodes between progress reports.
print_interval = 20

In [14]:
# Overrides the epochs = 1000 assigned in the setup cell; this is the number of
# episodes the training loop iterates over.
epochs = 10000

In [15]:
# Build the agent. The state length is buliding_height + max_people_in_elevator
# + lift_num * 2 (5 + 10 + 2 = 17 with the current settings), and the policy
# has 4 discrete actions.
model = Agent((buliding_height)+ max_people_in_elevator + (lift_num *2),4,learning_rate)
# Number of episodes between progress reports.
print_interval = 20
# Running sum of per-episode step counts; reset after every report.
ave_reward = 0 

In [16]:



# Episodes are cut off once global_step exceeds this many environment steps.
episode_step_limit = 300
checkpoint_dir = './model_weights'
# Number of episodes accumulated into ave_reward since the last report, so the
# printed average is exact even for the first window (which spans epochs
# 0..print_interval, i.e. one more episode than later windows).
episodes_since_report = 0

for epoch in range(epochs):
    building.empty_building()
    # Keep generating people until building.target is non-zero
    # (presumably: until at least one passenger is waiting - confirm in Building).
    while building.target == 0:
        building.generate_people(add_people_prob)
    state = building.get_state()
    done = False
    global_step = 0
    # NOTE: periodic passenger arrivals during an episode are disabled; the
    # previous check for them had an operator-precedence bug and guarded a no-op.
    while not done:
        # Collect up to T_horizon transitions, then run one PPO update.
        for _ in range(T_horizon):
            global_step += 1
            prev_people = building.get_arrived_people()
            # Only the sampled probability is stored, so no autograd graph is
            # needed while acting.
            with torch.no_grad():
                action_prob = model.get_action(torch.from_numpy(np.array(state)).float())
            action = Categorical(action_prob).sample().item()
            building.perform_action([action])
            reward = building.get_reward(prev_people)

            next_state = building.get_state()
            # The episode is finished when every state entry except the
            # second-to-last one sums to zero.
            finished = next_state.copy()
            del finished[-2]
            if sum(finished) == 0.0:
                reward = 100.
                done = True

            # The stored `done` is True only for a real finish, so truncated
            # episodes still bootstrap from the next state's value.
            model.put_data((state, action, reward/10., next_state, action_prob[action].item(), done))
            state = next_state
            if done or global_step > episode_step_limit:
                done = True
                break
        model.train()

    ave_reward += global_step
    episodes_since_report += 1
    if epoch % print_interval == 0 and epoch != 0:
        print("# of episode :{}, avg score : {:.1f}".format(epoch, ave_reward/episodes_since_report))
        ave_reward = 0
        episodes_since_report = 0
    if epoch % 100 == 0 and epoch != 0:
        # Create the checkpoint directory on demand so torch.save cannot fail
        # on a fresh checkout.
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), './model_weights/seperate_model_'+str(epoch))
    reward_list.append(global_step)

# of episode :20, avg score : 174.2
# of episode :40, avg score : 301.0
# of episode :60, avg score : 301.0
# of episode :80, avg score : 301.0
# of episode :100, avg score : 301.0
# of episode :120, avg score : 301.0
# of episode :140, avg score : 301.0
# of episode :160, avg score : 301.0
# of episode :180, avg score : 301.0
# of episode :200, avg score : 301.0
# of episode :220, avg score : 301.0
# of episode :240, avg score : 301.0
# of episode :260, avg score : 301.0
# of episode :280, avg score : 301.0
# of episode :300, avg score : 301.0
# of episode :320, avg score : 301.0
# of episode :340, avg score : 301.0
# of episode :360, avg score : 301.0
# of episode :380, avg score : 301.0
# of episode :400, avg score : 301.0
# of episode :420, avg score : 301.0
# of episode :440, avg score : 301.0
# of episode :460, avg score : 301.0
# of episode :480, avg score : 301.0
# of episode :500, avg score : 301.0


KeyboardInterrupt: 

In [None]:
'''
Working version (known-good configuration)
#Hyperparameters
learning_rate = 0.0001
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20
class Agent(nn.Module):
    def __init__(self, state_dim,action_dim,learning_rate):
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        self.memory = []

        '''
        self.fc1 = nn.Linear(self.state_dim,256)
        self.policy = nn.Linear(256, self.action_dim)
        self.value = nn.Linear(256, 1)
        '''


        #self.conv_layer_1_1 = nn.Conv1d(in_channels = 1,out_channels = 8, kernel_size = 1, stride = 1,padding = 0,bias = False)
        #self.conv_layer_pool_1_1 = nn.MaxPool1d(8)
        #self.conv_layer_1_2 = nn.Conv1d(in_channels = 8,out_channels = 8, kernel_size = 1, stride = 1,padding = 0,bias = False)
        
        #self.conv_layer_2_1 = nn.Conv1d(in_channels = 1,out_channels = 8, kernel_size = 1, stride = 1,padding = 0,bias = False)
        #self.conv_layer_pool_2_1 = nn.MaxPool1d(8)
        #self.conv_layer_2_2 = nn.Conv1d(in_channels = 8,out_channels = 8, kernel_size = 1, stride = 1,padding = 0,bias = False)
        self.layer_1_1 = nn.Linear(5,128)
        self.layer_1_2 = nn.Linear(128,16)
        
        self.layer_2_1 = nn.Linear(10,128)
        self.layer_2_2 = nn.Linear(128,16)
        
        self.layer_1 = nn.Linear(2,4)
        
        self.result_layer = nn.Linear(16+16+4,256)
        self.result_layer_2 = nn.Linear(256,self.action_dim)
        self.value = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)

    def get_action(self,x, softmax_dim = 0):
        if softmax_dim == 1:
            floor_state = x.narrow(1,0,5) #.reshape(-1,5,1).permute(0,2,1)
            elevator_state = x.narrow(1,5,10) #.reshape(-1,10,1).permute(0,2,1)
            elevator_additional_state = x.narrow(1,15,2)
        else:
            
            floor_state = x.narrow(0,0,5)
            elevator_state = x.narrow(0,5,10)
            elevator_additional_state = x.narrow(0,15,2)
            #print(x)
            #print(floor_state)
            #print(elevator_state)
            #print(elevator_additional_state)
            
        x_1 = F.relu(self.layer_1_1(floor_state)) #F.relu
        x_1 = F.relu(self.layer_1_2(x_1))
        #x_1 = torch.flatten(x_1,start_dim=1)

        x_2 = F.relu(self.layer_2_1(elevator_state))
        x_2 = F.relu(self.layer_2_2(x_2))
        #x_2 = torch.flatten(x_2,start_dim=1)

        x_3 = F.relu(self.layer_1(elevator_additional_state))
        
        x = torch.cat((x_1,x_2,x_3),softmax_dim)
        
        #print(softmax_dim, " : ", x.shape)
        x = F.relu(self.result_layer(x))
        x = self.result_layer_2(x)
        
        x = F.softmax(x, dim = softmax_dim) # 하나만할때 0

        return x
    
    def get_value(self,x):
        floor_state = x.narrow(1,0,5) #.reshape(-1,5,1).permute(0,2,1)
        elevator_state = x.narrow(1,5,10) #.reshape(-1,10,1).permute(0,2,1)
        elevator_additional_state = x.narrow(1,15,2)
        
        x_1 = F.relu(self.layer_1_1(floor_state)) #F.relu
        x_1 = F.relu(self.layer_1_2(x_1))
        #x_1 = torch.flatten(x_1,start_dim=1)

        x_2 = F.relu(self.layer_2_1(elevator_state))
        x_2 = F.relu(self.layer_2_2(x_2))
        #x_2 = torch.flatten(x_2,start_dim=1)

        x_3 = F.relu(self.layer_1(elevator_additional_state))

        x = torch.cat((x_1,x_2,x_3), 1)
        
        x = F.relu(self.result_layer(x))
        x = self.value(x)
        return x
    
    def put_data(self,data):
        self.memory.append(data)
        
    def make_batch(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],[],[],[],[],[]
        for data in self.memory:
            state,action,reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            done_mask = 0 if done else 1
            done_list.append([done_mask])
        self.memory = []
        
        s,a,r,next_s,done_mask,prob = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),torch.tensor(reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,r,next_s,done_mask,prob
    
    def train(self):
        state,action,reward, next_state,done_mask,action_prob = self.make_batch()

        for i in range(K_epoch):
            td_error = reward + gamma * self.get_value(next_state) * done_mask
            delta = td_error - self.get_value(state)
            delta = delta.detach().numpy()
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            
            now_action = self.get_action(state,softmax_dim = 1)
            now_action = now_action.gather(1,action)
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + F.smooth_l1_loss(self.get_value(state),td_error.detach())
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
'''