# Random testing the trained model

### Importing the necessary libraries

In [None]:
import numpy as np
import gym
from gym import spaces, logger
from gym.utils import seeding
from or_gym.utils import assign_env_config
import copy
from stable_baselines3 import PPO
import numpy as np
from train import NormalizingWrapper
from stable_baselines3.common.evaluation import evaluate_policy

## Unbounded Knapsack Problem

    The Knapsack Problem (KP) is a combinatorial optimization problem which
    requires the user to select from a range of goods of different values and
    weights in order to maximize the value of the selected items within a 
    given weight limit. This version is unbounded meaning that we can select
    items without limit. 

    The episodes proceed by selecting items and placing them into the
    knapsack one at a time until the weight limit is reached or exceeded, at
    which point the episode ends.

    Observation:
        Type: Tuple, Discrete
        0: list of item weights
        1: list of item values
        2: maximum weight of the knapsack
        3: current weight in knapsack

    Actions:
        Type: Discrete
        0: Place item 0 into knapsack
        1: Place item 1 into knapsack
        2: ...

    Reward:
        Value of item successfully placed into knapsack or 0 if the item
        doesn't fit, at which point the episode ends.

    Starting State:
        Lists of available items and empty knapsack.

    Episode Termination:
        Full knapsack or selection that puts the knapsack over the limit.

### Creating an environment class for this problem

In [None]:
class KnapsackEnv(gym.Env):
    """Base class for the knapsack environments defined in this notebook.

    Holds the problem data (item weights and values, weight limit), builds
    the action and observation spaces, and forwards ``reset``/``step`` to the
    ``_RESET``/``_STEP`` hooks. Those hooks are not defined in this class, so
    a subclass has to supply them; because ``__init__`` finishes with
    ``self.reset()``, construction fails without a ``_RESET`` implementation.

    Keyword arguments are handed to ``assign_env_config`` after the defaults
    have been set (presumably overriding attributes of the same name --
    see ``or_gym.utils``).
    """

    def __init__(self, *args, **kwargs):
        # Default problem data. The arrays are drawn from NumPy's *global*
        # generator, so they are reproducible only if the caller seeds
        # ``np.random`` beforehand.
        self.N = 200
        self.max_weight = 200
        self.current_weight = 0
        self._max_reward = 10000
        self.mask = True
        # NOTE(review): this attribute shadows the ``seed()`` method that some
        # gym versions define on ``gym.Env`` -- confirm nothing calls it.
        self.seed = 0
        self.item_numbers = np.arange(self.N)
        self.item_weights = np.random.randint(1, 100, size=self.N)
        self.item_values = np.random.randint(0, 100, size=self.N)
        self.over_packed_penalty = 0
        self.randomize_params_on_reset = False
        # Indices of placed items, kept only for rendering. This is created
        # per instance: a class-level list would be shared by every
        # environment and wiped whenever another one is constructed.
        self.collected_items = []
        # Add env_config, if any
        assign_env_config(self, kwargs)
        self.set_seed()

        obs_space = spaces.Box(
            0, self.max_weight, shape=(2*self.N + 1,), dtype=np.int32)
        self.action_space = spaces.Discrete(self.N)
        if self.mask:
            self.observation_space = spaces.Dict({
                "action_mask": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8),
                "avail_actions": spaces.Box(0, 1, shape=(self.N,), dtype=np.uint8),
                "state": obs_space
                })
        else:
            self.observation_space = spaces.Box(
                0, self.max_weight, shape=(2, self.N + 1), dtype=np.int32)

        self.reset()

    def sample_action(self):
        """Return one item index chosen uniformly from ``item_numbers``."""
        return np.random.choice(self.item_numbers)

    def set_seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``.

        When ``seed`` is ``None`` a seed is drawn from NumPy's global
        generator first.
        """
        if seed is None:
            seed = np.random.randint(0, np.iinfo(np.int32).max)
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        """Delegate to the subclass hook ``_RESET`` and return its result."""
        return self._RESET()

    def step(self, action):
        """Delegate to the subclass hook ``_STEP`` and return its result."""
        return self._STEP(action)

    def render(self):
        """Print the placed items with their total value and total weight.

        Returns:
            ``True``, signalling that rendering was handled by printing.
        """
        # Sum over the placements themselves so that an item placed several
        # times contributes once per placement, not once overall.
        total_value = sum(self.item_values[i] for i in self.collected_items)
        total_weight = sum(self.item_weights[i] for i in self.collected_items)
        print(self.collected_items, total_value, total_weight)

        # RlLib requirement: Make sure you either return a uint8/w x h x 3 (RGB) image or handle rendering in a window and then return `True`.
        return True

## Bounded Knapsack Problem

    The Knapsack Problem (KP) is a combinatorial optimization problem which
    requires the user to select from a range of goods of different values and
    weights in order to maximize the value of the selected items within a 
    given weight limit. This version is bounded meaning each item can be
    selected a limited number of times.

    The episodes proceed by selecting items and placing them into the
    knapsack one at a time until the weight limit is reached or exceeded, at
    which point the episode ends.

    Observation:
        Type: Tuple, Discrete
        0: list of item weights
        1: list of item values
        2: list of item limits
        3: maximum weight of the knapsack
        4: current weight in knapsack

    Actions:
        Type: Discrete
        0: Place item 0 into knapsack
        1: Place item 1 into knapsack
        2: ...

    Reward:
        Value of item successfully placed into knapsack or 0 if the item
        doesn't fit or its limit is exhausted, at which point the episode ends.

    Starting State:
        Lists of available items and empty knapsack.

    Episode Termination:
        Full knapsack, selection that puts the knapsack over the limit, or
        selection of an item whose limit is exhausted.

### Creating an environment class for the Bounded Knapsack problem

In [None]:
class BoundedKnapsackEnv(KnapsackEnv):
    """Knapsack environment in which each item has a limited number of copies.

    The raw state is an ``int32`` array of shape ``(3, N + 1)``: the rows hold
    item weights, item values and remaining item limits, and the last column
    holds ``max_weight``, ``current_weight`` and a ``0`` placeholder. With
    ``mask`` enabled the state is a dict that wraps this array together with
    an ``action_mask`` and an all-ones ``avail_actions`` vector.
    """

    def __init__(self, *args, **kwargs):
        # The limits must exist before the base constructor runs, because it
        # ends with ``self.reset()``, which reads ``item_limits_init``.
        self.N = 200
        self.item_limits_init = np.random.randint(1, 10, size=self.N, dtype=np.int32)
        self.item_limits = self.item_limits_init.copy()
        super().__init__()
        self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32)
        self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32)

        # kwargs are applied only here; the base constructor was called
        # without them, so ``action_space`` was built from the default ``N``.
        assign_env_config(self, kwargs)

        obs_space = spaces.Box(
            0, self.max_weight, shape=(3, self.N + 1), dtype=np.int32)
        if self.mask:
            self.observation_space = spaces.Dict({
                "action_mask": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8),
                "avail_actions": spaces.Box(0, 1, shape=(len(self.item_limits),), dtype=np.uint8),
                "state": obs_space
            })
        else:
            self.observation_space = obs_space

        # The state built inside ``super().__init__()`` predates the re-drawn
        # item arrays and the kwargs (e.g. ``max_weight``, ``mask``). Rebuild
        # it so that ``self.state`` matches the final configuration.
        self.reset()

    def _STEP(self, item):
        """Try to place ``item`` and return ``(state, reward, done, info)``.

        Choosing an exhausted item, or one that would exceed ``max_weight``,
        ends the episode with reward 0 and leaves the state untouched.
        Otherwise the item's value is the reward, and the episode ends only
        when the knapsack is exactly full.
        """
        # End if item is unavailable
        if not self.item_limits[item] > 0:
            return self.state, 0, True, {}

        # End if over weight
        if self.item_weights[item] + self.current_weight > self.max_weight:
            return self.state, 0, True, {}

        self.current_weight += self.item_weights[item]
        reward = self.item_values[item]
        # bool() keeps ``done`` a plain Python bool instead of a NumPy one.
        done = bool(self.current_weight == self.max_weight)
        self._update_state(item)
        return self.state, reward, done, {}

    def _update_state(self, item=None):
        """Rebuild ``self.state``; if ``item`` is given, use up one copy of it."""
        if item is not None:
            self.item_limits[item] -= 1
        state_items = np.vstack([
            self.item_weights,
            self.item_values,
            self.item_limits
        ], dtype=np.int32)
        state = np.hstack([
            state_items,
            np.array([[self.max_weight],
                      [self.current_weight],
                      [0] # Serves as place holder
                ], dtype=np.int32)
        ])
        if self.mask:
            # An action is allowed only if the item still fits and has copies left.
            mask = np.where(self.current_weight + self.item_weights > self.max_weight, 0, 1).astype(np.uint8)
            mask = np.where(self.item_limits > 0, mask, 0)
            self.state = {
                "action_mask": mask,
                "avail_actions": np.ones(self.N, dtype=np.uint8),
                "state": state
            }
        else:
            self.state = state.copy()

    def sample_action(self):
        """Return a random item index among the items with copies remaining."""
        return np.random.choice(
            self.item_numbers[np.where(self.item_limits != 0)])

    def _RESET(self):
        """Start a new episode and return the initial state.

        With ``randomize_params_on_reset`` set, weights, values and limits are
        re-drawn; otherwise only the limits are restored from
        ``item_limits_init``.
        """
        if self.randomize_params_on_reset:
            self.item_weights = np.random.randint(1, 100, size=self.N, dtype=np.int32)
            self.item_values = np.random.randint(0, 100, size=self.N, dtype=np.int32)
            self.item_limits = np.random.randint(1, 10, size=self.N, dtype=np.int32)
        else:
            self.item_limits = self.item_limits_init.copy()

        self.current_weight = 0
        self._update_state()
        return self.state

##### Defining the update_state, STEP, RESET, and render functions

In [2]:
def update_state(env, item=None):
    if item is not None:
        env.item_limits[item] -= 1
        env.collected_items.append(item)
    state_items = np.vstack([
        env.item_weights,
        env.item_values,
        env.item_limits
    ], dtype=np.int32)
    state = np.hstack([
        state_items, 
        np.array([[env.max_weight],
                    [env.current_weight], 
                    [0] # Serves as place holder
            ], dtype=np.int32)
    ])
    if env.mask:
        mask = np.where(env.current_weight + env.item_weights > env.max_weight, 0, 1).astype(np.uint8)
        mask = np.where(env.item_limits > 0, mask, 0)
        env.state = {
            "action_mask": mask,
            "avail_actions": np.ones(env.N, dtype=np.uint8),
            "state": state
        }
    else:
        env.state = state.copy()

def STEP(env, item):
    """Try to place ``item`` into ``env``'s knapsack.

    Returns:
        ``(state, reward, done, info)``. Picking an exhausted item, or one
        that would push the load past ``env.max_weight``, ends the episode
        with reward 0 and leaves ``env`` unchanged. A successful placement
        yields the item's value and ends the episode only when the knapsack
        is exactly full.
    """
    # End if item is unavailable
    if not env.item_limits[item] > 0:
        return env.state, 0, True, {}

    # End if over weight
    weight_after = env.item_weights[item] + env.current_weight
    if weight_after > env.max_weight:
        return env.state, 0, True, {}

    env.current_weight = weight_after
    reward = env.item_values[item]
    done = bool(env.current_weight == env.max_weight)
    update_state(env, item)
    return env.state, reward, done, {}


def render(env):
    """Print ``env``'s collected items with their total value and weight.

    The totals are summed over ``env.collected_items`` itself, so an item
    that was placed several times is counted once per placement.
    """
    total_value = sum(env.item_values[i] for i in env.collected_items)
    total_weight = sum(env.item_weights[i] for i in env.collected_items)
    print(env.collected_items, total_value, total_weight)


def RESET(env):
    """Start a new episode on ``env`` and return the initial state.

    With ``env.randomize_params_on_reset`` set, weights, values and limits
    are re-drawn; otherwise only the limits are restored from
    ``env.item_limits_init``. The load and the list of collected items are
    cleared in both cases.
    """
    if env.randomize_params_on_reset:
        env.item_weights = np.random.randint(1, 100, size=env.N, dtype=np.int32)
        env.item_values = np.random.randint(0, 100, size=env.N, dtype=np.int32)
        env.item_limits = np.random.randint(1, 10, size=env.N, dtype=np.int32)
    else:
        env.item_limits = env.item_limits_init.copy()

    env.current_weight = 0
    # Forget the previous episode's picks; otherwise rendering after a second
    # rollout would add the earlier episode's items into the totals.
    env.collected_items.clear()
    update_state(env)
    return env.state

#### Load the model and create the environment instances

In [None]:
# Restore the trained PPO agent saved under the name "ppo_model".
model = PPO.load("ppo_model")

# Bounded knapsack with a weight limit of 300 and plain (unmasked) array
# observations, wrapped in the NormalizingWrapper imported from train.
env = NormalizingWrapper(BoundedKnapsackEnv(max_weight=300, mask=False))

#### Evaluating the model's policy over 100 and 1000 episodes

In [None]:
# Score the loaded policy with non-deterministic action selection, first on
# a short run and then on a longer one.
for n_episodes in (100, 1000):
    mean_reward, std_reward = evaluate_policy(
        model=model,
        env=env,
        n_eval_episodes=n_episodes,
        deterministic=False,
    )
    print(f"Mean reward over {n_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")


#### Render the environment

In [None]:
# Print the list in env.collected_items together with its total value and weight.
render(env)

In [None]:
# Start a fresh episode through the notebook-level RESET helper and flatten
# the (3, N + 1) state into the 1-D vector passed to model.predict. Using -1
# lets NumPy infer the length (603 for N = 200) instead of hard-coding it.
obs = RESET(env)
obs = np.reshape(obs, (-1,))

In [None]:
# Roll the trained policy forward for one episode, capped at 300 steps.
# NOTE(review): STEP/RESET operate on the env attributes directly, so whatever
# NormalizingWrapper does inside its own step()/reset() is bypassed here --
# confirm that the model expects these raw observations.
states = None
for i in range(300):
    action, states = model.predict(obs, states)
    obs, rewards, dones, info = STEP(env, action)
    print(rewards)
    # Flatten to 1-D; -1 infers the length instead of hard-coding 603.
    obs = np.reshape(obs, (-1,))
    # Stop once the environment reports the end of the episode rather than
    # continuing to step a finished episode.
    if dones:
        break
   

In [None]:
# Print the items collected during the manual rollout, with their total value and weight.
render(env)