In [1]:
import numpy as np
from ortools.linear_solver import pywraplp
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
import gym

In [2]:
m = 7 # num vars
n = 3  # num constraints
ubound = 1 # upper bound of constraints

In [3]:
# data simpling
rand = np.random.RandomState(3)
p = np.round(rand.random_sample(m)*5,1) # goal koef
c = np.round(rand.random_sample((n,m))*10 * (rand.random_sample(m)*(p/5)*0.3 + 1),1) # constrants
b = np.round(c.sum(axis = 1) * (rand.random_sample(n) * 0.5 + 0.3),0)

In [4]:
solver = pywraplp.Solver.CreateSolver('SCIP')

In [5]:
x = {}
for j in range(m):
    x[j] = solver.IntVar(0, ubound, f"x[{j}]")

In [6]:
for i in range(n):
    constraint_expr = [c[i,j] * x[j] for j in range(m)]
    solver.Add(sum(constraint_expr) <= b[i])

In [7]:
obj_expr = [p[j] * x[j] for j in range(m)]
solver.Maximize(solver.Sum(obj_expr))

In [8]:
#solver.EnableOutput()

In [9]:
status = solver.Solve()

In [10]:
if status == pywraplp.Solver.OPTIMAL:
    print('Objective value =', solver.Objective().Value())
    for j in range(m):
        print(x[j].name(), ' = ', x[j].solution_value())
    print()
    print(f"Problem solved in {solver.wall_time()} milliseconds")
    print(f"Problem solved in {solver.iterations()} iterations")
    print(f"Problem solved in {solver.nodes()} branch-and-bound nodes")
else:
    print("The problem does not have an optimal solution.")

Objective value = 10.600000000000001
x[0]  =  0.0
x[1]  =  1.0
x[2]  =  0.0
x[3]  =  1.0
x[4]  =  1.0
x[5]  =  0.0
x[6]  =  0.0

Problem solved in 119 milliseconds
Problem solved in 7 iterations
Problem solved in 1 branch-and-bound nodes


In [11]:
ray.shutdown()
ray.init()

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:31357',
 'object_store_address': '/tmp/ray/session_2022-03-08_23-50-58_472144_7006/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-03-08_23-50-58_472144_7006/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-03-08_23-50-58_472144_7006',
 'metrics_export_port': 63347,
 'gcs_address': '127.0.0.1:56774',
 'node_id': '4999ca31ed47f53247f8f6e2e18a26b41300238b0e7bac4de08f9491'}

In [12]:
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["framework"] = "torch"
config["env_config"] = {}
#config['kl_coeff'] = 0.0

In [24]:
class MyEnv(gym.Env):
    def __init__(self, env_config):
        self.action_space = gym.spaces.Discrete(ubound + 1)
        self.observation_space = gym.spaces.Dict({
            'rem': gym.spaces.Box(low=np.zeros(n), high=b, dtype=np.float64), 
            'j': gym.spaces.Discrete(m + 1)})
        self.state = {'rem': np.array(b), 'j': 0}
        self.done = False

    def reset(self):
        self.state = {'rem': np.array(b), 'j': 0}
        self.done = False
        return self.state

    def step(self, action):
        # print('current state:', self.state)   
        # print('action taken:', action)
        j = self.state['j']
        rem = self.state['rem'] - c[:,j] * action
        if np.any(rem < 0):
            self.reward = -1
        else:
            self.reward = action * p[j]
            j += 1
            self.state = {'rem': rem, 'j': j}
            
        # print('reward:', self.reward)
        # print('next state:', self.state)
        
        if j == m: 
            self.done = True
        else:
            self.done = False
            
        return self.state, self.reward, self.done, {}

In [14]:
# from stable_baselines3.common.env_checker import check_env
# env = MyEnv(config)
# check_env(env)

In [26]:
from stable_baselines3 import PPO

env = MyEnv(config)

model = PPO("MultiInputPolicy", env, verbose=0)
model.learn(total_timesteps=50000)

<stable_baselines3.ppo.ppo.PPO at 0x1bf578e80>

In [27]:
obs = env.reset()
g = 0
for i in range(m):
    action, _states = model.predict(obs, deterministic=True)
    print('action: ', action)
    obs, reward, done, info = env.step(action)
    print('obs: ', obs, 'reward: ', reward)
    g += reward
    #env.render()
    if done:
      print('done. g = ', g)
      obs = env.reset()

env.close()

action:  0
obs:  {'rem': array([ 8., 20., 21.]), 'j': 1} reward:  0.0
action:  1
obs:  {'rem': array([ 7.4, 12.9, 15.7]), 'j': 2} reward:  3.5
action:  0
obs:  {'rem': array([ 7.4, 12.9, 15.7]), 'j': 3} reward:  0.0
action:  1
obs:  {'rem': array([7.1, 6.7, 9.7]), 'j': 4} reward:  2.6
action:  1
obs:  {'rem': array([1.4, 3.5, 0. ]), 'j': 5} reward:  4.5
action:  0
obs:  {'rem': array([1.4, 3.5, 0. ]), 'j': 6} reward:  0.0
action:  0
obs:  {'rem': array([1.4, 3.5, 0. ]), 'j': 7} reward:  0.0
done. g =  10.6


In [28]:
agent = ppo.PPOTrainer(config=config, env=MyEnv)



In [30]:
for i in range(51):
   # Perform one iteration of training the policy with PPO
   result = agent.train()
   if i % 10 == 0:
       #print(pretty_print(result))
       print('i: ', i)
       print('mean episode length:', result['episode_len_mean'])
       print('max episode reward:', result['episode_reward_max'])
       print('mean episode reward:', result['episode_reward_mean'])
       print('min episode reward:', result['episode_reward_min'])
       print('total episodes:', result['episodes_total'])
       print()

       checkpoint = agent.save()
       #print("checkpoint saved at", checkpoint)

i:  0
mean episode length: 9.41745283018868
max episode reward: 10.6
mean episode reward: 3.394339622641509
min episode reward: -8.7
total episodes: 424

i:  10
mean episode length: 7.007005253940456
max episode reward: 10.6
mean episode reward: 10.592994746059544
min episode reward: 9.6
total episodes: 5939

i:  20
mean episode length: 47.71
max episode reward: 8.6
mean episode reward: -30.11
min episode reward: -185.4
total episodes: 9523

i:  30
mean episode length: 7.060070671378092
max episode reward: 10.6
mean episode reward: 10.512190812720847
min episode reward: -3.3000000000000003
total episodes: 13711

i:  40
mean episode length: 7.0
max episode reward: 10.6
mean episode reward: 10.58951048951049
min episode reward: 7.6
total episodes: 19374

i:  50
mean episode length: 7.0
max episode reward: 10.6
mean episode reward: 10.593169877408053
min episode reward: 6.699999999999999
total episodes: 24211



In [32]:
env = MyEnv(config)
state = env.reset()
g = 0
done = False
reward = 0
while not done:
  action = agent.compute_action(state, explore = False)
  print(f"j = {state['j']} action = {action} reward = {reward}")
  state, reward, done, info = env.step(action)
  g += reward
print(g)

j = 0 action = 0 reward = 0
j = 1 action = 1 reward = 0.0
j = 2 action = 0 reward = 3.5
j = 3 action = 1 reward = 0.0
j = 4 action = 1 reward = 2.6
j = 5 action = 0 reward = 4.5
j = 6 action = 0 reward = 0.0
10.6


In [31]:
print("checkpoint saved at", checkpoint)

checkpoint saved at /Users/vladimirsudakov/ray_results/PPOTrainer_MyEnv_2022-03-09_00-06-46dqgq572a/checkpoint_000051/checkpoint-51
