In [1]:
import numpy as np
from ortools.linear_solver import pywraplp
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
import gym
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation import Episode, RolloutWorker
from ray.rllib.utils.typing import AgentID, PolicyID
from typing import Dict, Optional, TYPE_CHECKING
from ray.rllib.policy import Policy

In [2]:
m = 7 # number of decision variables
n = 3  # number of constraints
ubound = 1 # upper bound of every decision variable (each x[j] ranges over 0..ubound)

In [3]:
# Data sampling: draw a reproducible random problem instance.
rand = np.random.RandomState(3)

# Objective coefficients, rounded to one decimal place.
p = np.round(rand.random_sample(m) * 5, 1)

# Constraint coefficients: a random base weight per (constraint, variable),
# scaled up per variable in proportion to that variable's objective coefficient.
base_weights = rand.random_sample((n, m)) * 10
value_markup = rand.random_sample(m) * (p / 5) * 0.3 + 1
c = np.round(base_weights * value_markup, 1)

# Right-hand sides: a random fraction (between 0.3 and 0.8) of each row total.
row_totals = c.sum(axis=1)
b = np.round(row_totals * (rand.random_sample(n) * 0.5 + 0.3), 0)

In [4]:
# Create the MIP solver with the SCIP backend.
solver = pywraplp.Solver.CreateSolver('SCIP')
# CreateSolver returns None when the requested backend is not available;
# fail here with a clear message instead of an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("Could not create the 'SCIP' solver: backend not available.")

In [5]:
# One integer decision variable per item, each ranging over 0..ubound.
x = {j: solver.IntVar(0, ubound, f"x[{j}]") for j in range(m)}

In [6]:
# One capacity constraint per row i: sum_j c[i, j] * x[j] <= b[i].
for i in range(n):
    constraint_expr = solver.Sum([c[i, j] * x[j] for j in range(m)])
    solver.Add(constraint_expr <= b[i])

In [7]:
# Objective: maximize the total value sum_j p[j] * x[j].
obj_expr = []
for j in range(m):
    obj_expr.append(p[j] * x[j])
solver.Maximize(solver.Sum(obj_expr))

In [8]:
# Uncomment the next line to print the solver's log while solving.
#solver.EnableOutput()

In [9]:
# Run the solver; the returned status code is compared against
# pywraplp.Solver.OPTIMAL in the reporting cell.
status = solver.Solve()

In [10]:
# Report the exact solution, or say so when no optimum was found.
if status != pywraplp.Solver.OPTIMAL:
    print("The problem does not have an optimal solution.")
else:
    print('Objective value =', solver.Objective().Value())
    for j in range(m):
        var = x[j]
        print(var.name(), ' = ', var.solution_value())
    print()
    # Solver statistics for the run.
    print(f"Problem solved in {solver.wall_time()} milliseconds")
    print(f"Problem solved in {solver.iterations()} iterations")
    print(f"Problem solved in {solver.nodes()} branch-and-bound nodes")

Objective value = 10.600000000000001
x[0]  =  0.0
x[1]  =  1.0
x[2]  =  0.0
x[3]  =  1.0
x[4]  =  1.0
x[5]  =  0.0
x[6]  =  0.0

Problem solved in 77 milliseconds
Problem solved in 7 iterations
Problem solved in 1 branch-and-bound nodes


In [11]:
class MyEnv(gym.Env):
    """Sequential-decision view of the integer program defined in this notebook.

    The agent walks through the variables j = 0..m-1 and picks a value
    0..ubound for each one. The problem data is read from the notebook-level
    names ``m``, ``n``, ``ubound``, ``p``, ``c`` and ``b``.

    Observation (a dict):
        'rem': remaining right-hand side of each of the n constraints.
        'j':   index of the variable the next action applies to.

    Reward:
        ``action * p[j]`` for a feasible choice; ``-1`` when the choice would
        drive any remaining capacity below zero. An infeasible choice leaves
        the state unchanged, so the agent has to decide on the same variable
        again.
    """

    def __init__(self, env_config):
        """Build the spaces and the initial state; ``env_config`` is not used."""
        self.action_space = gym.spaces.Discrete(ubound + 1)
        self.observation_space = gym.spaces.Dict({
            'rem': gym.spaces.Box(low=np.zeros(n), high=b, dtype=np.float64),
            'j': gym.spaces.Discrete(m)
        })
        self.state = {'rem': np.array(b), 'j': 0}
        self.done = False

    def _observation(self):
        """Return a copy of the current state.

        Handing out the internal dict itself would let later steps silently
        rewrite observations the caller already holds.
        """
        return {'rem': np.array(self.state['rem']), 'j': self.state['j']}

    def reset(self):
        """Restore the full capacities, go back to variable 0, return the observation."""
        self.state = {'rem': np.array(b), 'j': 0}
        self.done = False
        return self._observation()

    def step(self, action):
        """Apply ``action`` to the current variable.

        Returns:
            (observation, reward, done, info) with an empty info dict.
        """
        j = self.state['j']
        rem = self.state['rem'] - c[:, j] * action
        if np.any(rem < 0):
            # Infeasible choice: penalize, keep the state (and `done`) as they were.
            self.reward = -1
        else:
            self.reward = action * p[j]
            self.state['rem'] = rem
            j += 1
            if j == m:
                # Last variable decided. 'j' stays at m - 1 so the observation
                # remains inside Discrete(m).
                self.done = True
            else:
                self.state['j'] = j
                self.done = False
        return self._observation(), self.reward, self.done, {}

In [12]:
# Shut down any Ray runtime left over from a previous run of this cell,
# then start a fresh one.
ray.shutdown()
ray.init()

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:45322',
 'object_store_address': '/tmp/ray/session_2022-04-02_17-54-38_713903_7998/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-04-02_17-54-38_713903_7998/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-04-02_17-54-38_713903_7998',
 'metrics_export_port': 65364,
 'gcs_address': '127.0.0.1:63152',
 'node_id': '8358b9ab4d5adf88d9a9fc3d37597897ca71818d5795e41c980a6417'}

In [13]:
class SampleCallback(DefaultCallbacks):
    """Tracks the best all-feasible trajectory seen by this callback instance.

    A trajectory counts as feasible when none of its rewards is negative.
    Whenever a feasible trajectory beats the best total reward so far, its
    total reward and action sequence are published through
    ``episode.hist_data`` under the keys "best_reward" and "best_actions".
    """

    def __init__(self, legacy_callbacks_dict: Optional[Dict[str, callable]] = None):
        # Let the base class do its own setup instead of copying its body here.
        super().__init__(legacy_callbacks_dict)
        self.best_reward = -np.inf
        self.best_actions = []

    def on_postprocess_trajectory(
            self, *, worker: "RolloutWorker", episode: Episode,
            agent_id: AgentID, policy_id: PolicyID,
            policies: Dict[PolicyID, Policy], postprocessed_batch: SampleBatch,
            original_batches: Dict[AgentID, SampleBatch], **kwargs) -> None:
        """Record the trajectory if it is feasible and improves on the best so far."""
        # The per-agent entry is indexed with [1] to reach the sample batch.
        sample_obj = original_batches[agent_id][1]
        rewards, actions = sample_obj.columns(['rewards', 'actions'])
        total_reward = np.sum(rewards)
        # NOTE(review): presumably a trajectory can be a fragment of an episode
        # depending on the trainer's batch mode; if so, `actions` may not cover
        # every variable -- confirm against the RLlib version in use.

        if total_reward > self.best_reward and np.all(rewards >= 0):
            self.best_reward = total_reward
            # Copy so the stored solution does not alias the batch's own array.
            self.best_actions = np.array(actions)
            episode.hist_data["best_reward"] = [total_reward]
            episode.hist_data["best_actions"] = [self.best_actions]

In [24]:
# PPO settings: start from the library defaults and override what this notebook needs.
config = ppo.DEFAULT_CONFIG.copy()
config.update({
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "torch",
    "env_config": {},
    # "kl_coeff": 0.0,
    "callbacks": SampleCallback,
    "log_level": "ERROR",
})

In [15]:
from stable_baselines3.common.env_checker import check_env

# MyEnv's constructor takes the env-specific settings, i.e. config["env_config"],
# not the whole trainer configuration.
env = MyEnv(config["env_config"])
# Validate that the environment follows the gym interface expected by SB3.
check_env(env)

In [16]:
from stable_baselines3 import PPO

# MyEnv's constructor takes the env-specific settings, i.e. config["env_config"],
# not the whole trainer configuration.
env = MyEnv(config["env_config"])

# "MultiInputPolicy" is the SB3 policy type for dict observations.
model = PPO("MultiInputPolicy", env, verbose=0)
model.learn(total_timesteps=10)

<stable_baselines3.ppo.ppo.PPO at 0x130a4fe50>

In [17]:
# Roll the SB3 model out deterministically for m steps, printing every
# transition and the accumulated reward once the episode finishes.
obs = env.reset()
g = 0
for step_idx in range(m):
    action, _states = model.predict(obs, deterministic=True)
    print('action: ', action)
    obs, reward, done, info = env.step(action)
    print('obs: ', obs, 'reward: ', reward)
    g = g + reward
    if not done:
        continue
    print('done. g = ', g)
    obs = env.reset()

env.close()

action:  1
obs:  {'rem': array([ 5.8, 12.8, 13.6]), 'j': 1} reward:  2.8
action:  1
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 2} reward:  3.5
action:  0
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 3} reward:  0.0
action:  0
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 4} reward:  0.0
action:  0
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 5} reward:  0.0
action:  0
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 6} reward:  0.0
action:  0
obs:  {'rem': array([5.2, 5.7, 8.3]), 'j': 6} reward:  0.0
done. g =  6.3


In [28]:
# Build the RLlib PPO trainer for MyEnv from the `config` dict.
agent = ppo.PPOTrainer(config=config, env=MyEnv)



In [29]:
# Train PPO and keep the best feasible solution published by the callback
# through result['hist_stats'].
best_g = 0
best_actions = []
for i in range(21):
    # Perform one iteration of training the policy with PPO
    result = agent.train()
    hist_stats = result['hist_stats']
    reported_rewards = hist_stats.get('best_reward', [])
    # Once a solution has been stored, best_actions is a NumPy array (see the
    # printed output), so emptiness is tested with len(): `array == []` is a
    # deprecated elementwise comparison that raises on recent NumPy releases.
    if len(reported_rewards) > 0 and (
            len(best_actions) == 0 or best_g < reported_rewards[-1]):
        best_g = reported_rewards[-1]
        best_actions = hist_stats['best_actions'][-1]
    if i % 10 == 0:
        # Periodic progress report and checkpoint.
        print('i: ', i)
        print('mean episode length:', result['episode_len_mean'])
        print('max episode reward:', result['episode_reward_max'])
        print('mean episode reward:', result['episode_reward_mean'])
        print('min episode reward:', result['episode_reward_min'])
        print('total episodes:', result['episodes_total'])
        print('solution:', best_g, best_actions)
        checkpoint = agent.save()

i:  0
mean episode length: 9.516666666666667
max episode reward: 10.6
mean episode reward: 3.3371428571428567
min episode reward: -6.7
total episodes: 420
solution: 10.6 [0 1 0 1 1 0 0]
i:  10
mean episode length: 7.019298245614035
max episode reward: 10.6
mean episode reward: 10.305438596491227
min episode reward: 3.2
total episodes: 5902
solution: 10.6 [0 1 0 1 1 0 0]
i:  20
mean episode length: 7.0
max episode reward: 10.6
mean episode reward: 10.589842381786339
min episode reward: 7.6
total episodes: 11614
solution: 10.6 [0 1 0 1 1 0 0]


In [20]:
""" no need agent
env = MyEnv(config)
state = env.reset()
g = 0
done = False
reward = 0
while not done:
  action = agent.compute_action(state, explore = False)
  print(f"j = {state['j']} action = {action} reward = {reward}")
  state, reward, done, info = env.step(action)
  g += reward
print(g) """

' no need agent\nenv = MyEnv(config)\nstate = env.reset()\ng = 0\ndone = False\nreward = 0\nwhile not done:\n  action = agent.compute_action(state, explore = False)\n  print(f"j = {state[\'j\']} action = {action} reward = {reward}")\n  state, reward, done, info = env.step(action)\n  g += reward\nprint(g) '

In [21]:
# Print the best solution found during training, one decision per line.
# The loop variable is deliberately not called `x`: that name holds the dict of
# solver variables, and overwriting it breaks re-running the solver cells.
for j, best_action in enumerate(best_actions):
    print(f"j = {j} action = {best_action} reward = {best_action*p[j]}")
print(best_g)

j = 0 action = 0 reward = 0.0
j = 1 action = 1 reward = 3.5
j = 2 action = 0 reward = 0.0
j = 3 action = 1 reward = 2.6
j = 4 action = 1 reward = 4.5
j = 5 action = 0 reward = 0.0
j = 6 action = 0 reward = 0.0
10.6
