In [9]:
from keras.models import Sequential
from keras.optimizers import sgd
import numpy as np
import keras
import math
import json
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, LSTM, Masking, MaxPooling1D, concatenate

In [14]:
class Env(object):
    """Toy single-vehicle capacitated routing environment.

    Every customer is one row ``[x, y, demand]`` of ``input_data``. An action
    is a customer index: the vehicle drives there, delivers as much of the
    demand as its remaining capacity allows, and drives back to the depot to
    refill whenever it runs empty while demand is still outstanding. The
    episode is over once every demand is zero.
    """

    def __init__(self):
        self.n_cust = 3
        self.depot = []
        self.max_capacity = 15             # full vehicle load, restored at the depot
        self.capacity = self.max_capacity  # remaining vehicle capacity
        self.input_dim = 3                 # x, y, demand
        self.input_data = np.zeros((self.n_cust, self.input_dim))
        self.demand = self.input_data[:, -1]
        self.path = [self.depot]
        self.reward = 0
        self.was_zero = False              # set when the last action hit a served customer
        self.total_reward = 0

    def create_dataset(self):
        """Draw a random instance: customer coordinates, integer demands, depot."""
        rnd = np.random
        x = rnd.uniform(0, 1, size=(self.n_cust, 2))
        d = rnd.randint(1, 10, [self.n_cust, 1])
        self.depot = [rnd.uniform(0, 1), rnd.uniform(0, 1)]
        self.input_data = np.concatenate([x, d], 1)
        # Re-bind the view; otherwise it keeps pointing at the previous array.
        self.demand = self.input_data[:, -1]
        print(self.depot)
        print(self.input_data)

    def reset(self):
        """Start a new episode on a fresh instance with a full vehicle at the depot."""
        self.create_dataset()
        self.path = [self.depot]
        self.reward = 0
        self.capacity = self.max_capacity
        self.was_zero = False

    def update_state(self, cust_id):
        """Deliver to ``cust_id``, reducing its demand and the vehicle capacity.

        Visiting a customer whose demand is already zero changes nothing and
        only raises ``was_zero`` so that ``get_reward`` can penalise the move.
        """
        demand = self.input_data[cust_id, -1]
        if demand == 0:
            self.was_zero = True
            return
        # Partial delivery when the demand exceeds what is left on the vehicle.
        served = min(self.capacity, demand)
        self.input_data[cust_id, -1] = demand - served
        self.capacity = self.capacity - served

    def is_over(self):
        """Return True when no customer has outstanding demand."""
        return not self.input_data[:, -1].any()

    def observe(self):
        """Return copies of the customer table and of the path driven so far."""
        return self.input_data.copy(), self.path.copy()

    def act(self, cust_id):
        """Visit ``cust_id`` and return ``(input_data, path, reward, game_over)``.

        The reward is the negative distance driven for this step, including the
        trip back to the depot when the vehicle had to refill.
        """
        self.update_state(cust_id)
        self.path.append(self.input_data[cust_id, :-1].tolist())
        reward = self.get_reward(cust_id)
        if self.capacity == 0 and self.input_data[:, -1].any():
            # Empty vehicle but demand remains: go back to the depot and refill.
            self.path.append(list(self.depot))
            reward = reward + self.get_reward(cust_id)
            self.capacity = self.max_capacity
        game_over = self.is_over()
        return self.input_data.copy(), self.path.copy(), reward, game_over

    @staticmethod
    def _distance(a, b):
        """Euclidean distance between two ``[x, y]`` points."""
        return float(math.hypot(a[0] - b[0], a[1] - b[1]))

    def get_reward(self, cust_id):
        """Return the reward for the most recent leg appended to ``path``.

        ``-100`` if the last action targeted a customer without demand;
        otherwise the negative length of the last leg, plus the closing leg
        back to the depot when that move finished the episode.
        """
        if self.was_zero:
            self.was_zero = False
            return -100

        dist = self._distance(self.path[-2], self.path[-1])
        if self.is_over():
            # The tour has to end at the depot, so charge the return trip too.
            dist = dist + self._distance(self.path[-1], self.depot)

        return -dist

In [15]:
class Experience(object):
    """Replay memory that turns stored transitions into Q-learning batches."""

    def __init__(self, discount=.9):
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        """Store one transition.

        ``states`` is ``[input_t, path_t, action, reward, input_tp1, path_tp1]``
        and ``game_over`` tells whether the transition ended the episode.
        """
        self.memory.append([states, game_over])

    def get_batch(self, model, batch_size=10):
        """Sample transitions (with replacement) and build training targets.

        ``model`` must expose ``output_shape`` and ``predict([state, path])``.
        Returns ``(inputs, paths, targets)``: the stacked customer tables, the
        list of (unpadded) paths, and the Q-value targets where only the entry
        of the action actually taken is replaced by the Bellman estimate.
        """
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        state_shape = tuple(np.shape(self.memory[0][0][0]))
        n_samples = min(len_memory, batch_size)
        inputs = np.zeros((n_samples,) + state_shape)
        paths = []
        targets = np.zeros((n_samples, num_actions))
        for i, idx in enumerate(np.random.randint(0, len_memory, size=n_samples)):
            (input_state_t, path_state_t, action_t, reward_t,
             input_state_tp1, path_state_tp1) = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = input_state_t
            paths.append(path_state_t)
            # Predict on the *sampled* transition; both inputs get a leading
            # batch axis of size one so their sample counts agree.
            targets[i] = model.predict([np.expand_dims(input_state_t, axis=0),
                                        np.expand_dims(path_state_t, axis=0)])[0]

            if game_over:  # terminal transition: no bootstrapping
                targets[i, action_t] = reward_t
                print("reward_when_over:", reward_t)
            else:
                Q_sa = np.max(model.predict([np.expand_dims(input_state_tp1, axis=0),
                                             np.expand_dims(path_state_tp1, axis=0)]))
                targets[i, action_t] = reward_t + self.discount * Q_sa

        return inputs, paths, targets

In [16]:
def padded(paths):
    """Pad every path with trailing ``[0, 0]`` rows up to a common length.

    The common length is the longest path in ``paths`` (never less than one).
    ``paths`` is modified in place and also returned; entries that already
    have the target length are left untouched.
    """
    longest = max([1] + [route.shape[0] for route in paths])

    for pos, route in enumerate(paths):
        missing = longest - route.shape[0]
        if missing > 0:
            filler = np.repeat([[0, 0]], missing, axis=0)
            paths[pos] = np.concatenate((route, filler), axis=0)

    return paths

In [None]:
if __name__ == "__main__":
    # parameters
    epsilon = .1  # probability of taking a random (exploratory) action
    epoch = 5000
    batch_size = 50
    env = Env()
    num_actions = env.n_cust  # one action per customer index
    total_reward = 0

    # Encoder over the customer table; sized from the environment instead of
    # a hard-coded (3, 3) so changing n_cust does not silently break the model.
    actor_in = Input(shape=(env.n_cust, env.input_dim))
    actor = Conv1D(128, 1)(actor_in)
    actor = Flatten()(actor)
    actor_c = Dense(128)(actor)

    # Recurrent encoder over the (x, y) path driven so far. Padded timesteps
    # are all-zero rows, which Masking skips; mask_value is a scalar.
    decoder_in = Input(shape=(None, 2))
    decoder = Masking(mask_value=0.)(decoder_in)
    decoder_out = LSTM(128, dropout=0.1)(decoder)

    actor = concatenate([actor_c, decoder_out])
    actor = Activation('tanh')(actor)
    actor_out = Dense(num_actions, activation='softmax')(actor)

    critic_in = concatenate([actor_c, actor_out])
    critic = concatenate([actor_c, critic_in])
    critic = Activation('tanh')(critic)
    # The head is trained with MSE against Q-value targets that are negative
    # (rewards are negative distances), so it must be unbounded: a softmax
    # would confine the outputs to (0, 1).
    critic_out = Dense(num_actions, activation='linear')(critic)

    # Keras 2 spells these keyword arguments `inputs` / `outputs`.
    model = keras.models.Model(inputs=[actor_in, decoder_in], outputs=critic_out)
    model.compile(sgd(lr=0.0001), loss="mse")

    exp = Experience()

    for e in range(epoch):
        loss = 0.
        env.reset()
        game_over = False
        # get initial input
        input_t, path_t = env.observe()

        while not game_over:
            input_tm1 = np.array(input_t)
            path_tm1 = np.array(path_t)
            # get next action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions)
            else:
                # Both inputs need a leading batch axis of size one.
                q = model.predict([np.expand_dims(input_tm1, axis=0),
                                   np.expand_dims(path_tm1, axis=0)])
                action = np.argmax(q[0])

            # apply action, get rewards and new state
            input_t, path_t, reward, game_over = env.act(action)
            total_reward = total_reward + reward

            # store experience
            exp.remember([input_tm1, path_tm1, action, reward, input_t, path_t], game_over)

            # adapt model
            inputs, paths, targets = exp.get_batch(model, batch_size=batch_size)

            # Paths have different lengths; pad so they stack into one array.
            paths = padded(paths)

            loss += model.train_on_batch([inputs, np.array(paths)], targets)

        print("Epoch {:03d}/{:03d} | Loss {:.4f} | Path_length {}".format(e, epoch - 1, loss, total_reward))
        total_reward = 0

    # Save trained model weights and architecture, this will be used by the visualization code
    model.save_weights("model.h5", overwrite=True)
    with open("model.json", "w") as outfile:
        # NOTE(review): to_json() already returns a JSON string, so this file
        # holds a JSON-encoded string. Presumably the reader does
        # model_from_json(json.load(f)); keep both sides in sync if changed.
        json.dump(model.to_json(), outfile)

