In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import gym
import ray.rllib.agents.ppo as ppo
import random
# import jdc
import pandas as pd

from gym import Env, spaces
import time

#get a matrix of distances between locations
df = pd.read_csv('dist_vologda_matrix.csv', sep=',')
df = df.iloc[:, 1:]

#some parameters
working_hours = 12.0
truck_speed = 60.0

df = df / 1000.0

In [2]:
#remove locations, which cannot be served in 12 hours (driver can't get there and come back in 12 hours)
to_remove = ((2 * (df.values[-1,:] / truck_speed) < working_hours) == False).nonzero()[0]
df = df.drop(df.index[to_remove])
df = df.drop(df.columns[to_remove], axis=1)
df = df.reset_index(drop=True)
N = len(df.values[0])

In [3]:
df = df.iloc[[0,1,2,3,4,5,55],[0,1,2,3,4,5,55]]

In [4]:
N = len(df.values[0])
N

7

In [5]:
#custom gym environment for our problem
class TransportScape(Env):
    def __init__(self, env_config):
        self.num_of_trucks = N - 1
        self.num_of_locations = N
        self.h = working_hours #working hours of each truck driver
        self.speed = truck_speed
        self.distances = df.values #an array, containing distances between locations
    
        self.action_space = spaces.MultiDiscrete([self.num_of_trucks, self.num_of_locations - 1])
    
        self.observation_space = spaces.Dict(
        {
            #x_i - shows each truck's location
            'truck location': spaces.MultiDiscrete(np.full(self.num_of_trucks, self.num_of_locations)),
            #p_j - shows whether a truck is assigned to this particular location or not
            'assignment': spaces.MultiBinary(self.num_of_locations - 1), 
            #u_i - shows whether a particular truck was used before
            'truck usage': spaces.MultiBinary(self.num_of_trucks),
            #delta_i - shows how much time each truck has left until the end of the work day
            'time left': spaces.Box(low=np.zeros(self.num_of_trucks), high=np.full(self.num_of_trucks, self.h), dtype=np.float64)
        })
        self.reset()
        
    def nice_print(self):
        print("Truck Location")
        print(self.state['truck location'])
        print("Assignment")
        print(self.state['assignment'])
        print("Truck Usage")
        print(self.state['truck usage'])
        print("Time Left")
        print(self.state['time left'])
        
    def reset(self):
        self.state = {'truck location': np.full(self.num_of_trucks, self.num_of_locations - 1),
                      'assignment': np.zeros(self.num_of_locations - 1),
                      'truck usage': np.zeros(self.num_of_trucks),
                      'time left': np.full(self.num_of_trucks, self.h)}
        self.done = False
        return self.state 
    
    def step(self, action):
        i, j = action[0], action[1]
        #print(i, j)
        truck_loc = self.state['truck location']
        assignment = self.state['assignment']
        truck_use = self.state['truck usage']
        time_left = self.state['time left']
        
        #if a truck is already assigned to this location - penalty
        if assignment[j] == 1:
            #print("if location was visited")
            self.reward = -5000
        #if no time left - penalty
        elif (time_left[i] - self.distances[truck_loc[i], j] / self.speed - self.distances[j, -1] / self.speed) < 0:
            #print("if no time left")
            self.reward = -5000
        #else - assign truck to a new location
        else:
            self.reward = 0
            #print("else")
            #if needs to use an unused before truck - penalty
            if truck_use[i] == 0:
                #print("new truck requested")
                self.reward = -2000
                truck_use[i] = 1
            assignment[j] = 1
            self.reward -= self.distances[truck_loc[i], j]
            time_left[i] -= self.distances[truck_loc[i], j] / self.speed
            truck_loc[i] = j
        #if all locations have trucks assigned to them - end episode
        if np.all(assignment == 1):
            #print("if all locations are visited")
            self.done = True
            
        else:
            self.done = False
        
        self.state['truck location'] = truck_loc
        self.state['assignment'] = assignment
        self.state['truck usage'] = truck_use
        self.state['time left'] = time_left
        
        #self.nice_print()
        
        # print(self.state, self.reward, self.done)
        
        return self.state, self.reward, self.done, {}

In [6]:
import ray
import ray.rllib.agents.ppo as ppo

In [7]:
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["framework"] = "torch"
config["env_config"] = {}
config['kl_coeff'] = 0.0
config["log_level"] = "ERROR"

In [8]:
ray.shutdown()
ray.init()

RayContext(dashboard_url='', python_version='3.9.11', ray_version='1.12.0', ray_commit='f18fc31c7562990955556899090f8e8656b48d2d', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-05-04_20-38-29_888144_33722/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-05-04_20-38-29_888144_33722/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-05-04_20-38-29_888144_33722', 'metrics_export_port': 63209, 'gcs_address': '127.0.0.1:59400', 'address': '127.0.0.1:59400', 'node_id': '4944a2d940a2a8c388bb89e2df067be3bf9fd708b750de12f29302b0'})

In [9]:
agent = ppo.PPOTrainer(config=config, env=TransportScape)

2022-05-04 20:38:35,046	INFO ppo.py:268 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-05-04 20:38:35,047	INFO trainer.py:864 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [10]:
for i in range(21):
    # Perform one iteration of training the policy with PPO
    result = agent.train()
    if i % 10 == 0:
        print('i: ', i)
        print('mean episode length:', result['episode_len_mean'])
        print('max episode reward:', result['episode_reward_max'])
        print('mean episode reward:', result['episode_reward_mean'])
        print('min episode reward:', result['episode_reward_min'])
        print('total episodes:', result['episodes_total'])
        checkpoint = agent.save()
        #print("checkpoint saved at", checkpoint)

i:  0
mean episode length: 17.932735426008968
max episode reward: -9447.045
mean episode reward: -69712.29721569506
min episode reward: -376280.8533
total episodes: 223
i:  10
mean episode length: 6.777966101694915
max episode reward: -7021.0893
mean episode reward: -14650.28132677966
min episode reward: -54593.1216
total episodes: 3975
i:  20
mean episode length: 6.072948328267477
max episode reward: -6938.378699999999
mean episode reward: -10295.9296056231
min episode reward: -39239.7306
total episodes: 10445


In [11]:
env = TransportScape(config)
state = env.reset()
g = 0
done = False
reward = 0
while not done:
  action = agent.compute_action(state, explore = False)
  print(f"state = {state} action = {action} reward = {reward}")
  state, reward, done, info = env.step(action)
  g += reward
print(g) 

state = {'truck location': array([6, 6, 6, 6, 6, 6]), 'assignment': array([0., 0., 0., 0., 0., 0.]), 'truck usage': array([0., 0., 0., 0., 0., 0.]), 'time left': array([12., 12., 12., 12., 12., 12.])} action = [3 4] reward = 0
state = {'truck location': array([6, 6, 6, 4, 6, 6]), 'assignment': array([0., 0., 0., 0., 1., 0.]), 'truck usage': array([0., 0., 0., 1., 0., 0.]), 'time left': array([12.        , 12.        , 12.        , 11.26389667, 12.        ,
       12.        ])} action = [2 3] reward = -2044.1662
state = {'truck location': array([6, 6, 3, 4, 6, 6]), 'assignment': array([0., 0., 0., 1., 1., 0.]), 'truck usage': array([0., 0., 1., 1., 0., 0.]), 'time left': array([12.        , 12.        , 11.22030333, 11.26389667, 12.        ,
       12.        ])} action = [2 2] reward = -2046.7818
state = {'truck location': array([6, 6, 2, 4, 6, 6]), 'assignment': array([0., 0., 1., 1., 1., 0.]), 'truck usage': array([0., 0., 1., 1., 0., 0.]), 'time left': array([12.        , 12.      