In [2]:
# Install into the *kernel's* environment (%pip) rather than whatever `pip`
# the shell resolves (!pip), and pin the version so a re-run is reproducible.
%pip install stable-baselines3==2.5.0

Collecting stable-baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (f

In [4]:
# %pip targets the running kernel's environment; version pinned for reproducibility.
%pip install shimmy==2.0.0

Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [16]:
# Explicit %pip magic (does not rely on automagic being enabled); the extras
# spec is quoted so the shell never treats the brackets as a glob pattern.
%pip install "gymnasium[all]==1.0.0" stable-baselines3==2.5.0


Collecting box2d-py==2.3.5 (from gymnasium[all])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[all])
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting mujoco-py<2.2,>=2.1 (from gymnasium[all])
  Downloading mujoco_py-2.1.2.14-py3-none-any.whl.metadata (669 bytes)
Collecting cython<3 (from gymnasium[all])
  Downloading Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Collecting mujoco>=2.1.5 (from gymnasium[all])
  Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (fr

In [21]:
import gymnasium as gym  # ✅ Use Gymnasium instead of Gym
import numpy as np
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

In [22]:
# ✅ Custom Supply Chain Environment (Fixed Inheritance)
class SupplyChainEnv(gym.Env):  # ✅ Explicitly inherit from gym.Env
    """Single-item inventory control task.

    Observation: a length-1 float32 vector holding the inventory level,
        clipped to [0, 100].
    Actions: 0 = reduce stock by 10, 1 = maintain, 2 = increase stock by 10.
    Dynamics: after the action, a random integer demand in 5..14 is deducted.
    Reward: ``-abs(50 - inventory)``, i.e. highest (0) when inventory is 50.
    Episode end: truncated after ``max_steps`` steps; the task has no
        terminal state of its own.
    """

    def __init__(self):
        super().__init__()

        # ✅ Define Action Space: {0: Reduce Stock, 1: Maintain, 2: Increase Stock}
        self.action_space = spaces.Discrete(3)

        # ✅ Define Observation Space: Inventory level (bounded between 0 and 100)
        self.observation_space = spaces.Box(low=0, high=100, shape=(1,), dtype=np.float32)

        # ✅ Initial inventory level
        self.state = np.array([50], dtype=np.float32)

        # ✅ Maximum simulation steps
        self.max_steps = 50
        self.current_step = 0

    def step(self, action):
        """Apply ``action``, deduct random demand and score the new inventory.

        Args:
            action: 0 (reduce stock), 1 (maintain) or 2 (increase stock).

        Returns:
            ``(obs, reward, terminated, truncated, info)`` following the
            Gymnasium step API. ``terminated`` is always ``False``;
            ``truncated`` becomes ``True`` once ``max_steps`` is reached.
        """
        self.current_step += 1

        # Accept plain ints as well as NumPy scalars / single-element arrays.
        action = int(action)

        # Action Mapping:
        if action == 0:  # Reduce Stock
            self.state[0] -= 10
        elif action == 2:  # Increase Stock
            self.state[0] += 10

        # ✅ Demand Simulation: random integer demand in 5..14 (upper bound 15
        # is exclusive). Drawn from the env's own generator so that
        # reset(seed=...) makes episodes reproducible.
        demand = self.np_random.integers(5, 15)
        self.state[0] -= demand  # Deduct demand from stock
        self.state[0] = np.clip(self.state[0], 0, 100)  # Ensure inventory stays within bounds

        # ✅ Reward Function: Encourage inventory level close to 50
        reward = -abs(50.0 - float(self.state[0]))  # Best reward when inventory ≈ 50

        # ✅ Episode end: hitting the step budget is a time limit, not a
        # terminal state of the task, so report it as truncation.
        truncated = self.current_step >= self.max_steps

        # Return a copy: self.state is mutated in place on the next step and
        # must not alias observations the caller is still holding.
        return self.state.copy(), reward, False, truncated, {}  # ✅ Follow Gymnasium's API

    def reset(self, seed=None, options=None):
        """Reset inventory to 50 and the step counter to 0.

        Args:
            seed: optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: accepted for API compatibility; unused.

        Returns:
            ``(obs, info)`` with the initial observation and an empty info dict.
        """
        super().reset(seed=seed)  # seeds self.np_random when a seed is given
        self.state = np.array([50], dtype=np.float32)
        self.current_step = 0
        return self.state.copy(), {}  # ✅ Must return (obs, info) in Gymnasium

    def render(self):
        """Print the current step index and inventory level."""
        print(f"Step: {self.current_step}, Inventory Level: {self.state[0]}")


In [23]:
# Build one environment instance and expose it to Stable-Baselines3 as a
# single-env vectorized environment. Monitor records episode statistics in
# memory only (filename=None means nothing is written to disk).
env = SupplyChainEnv()


def _monitored_env():
    """Factory handed to DummyVecEnv; wraps the shared `env` in a Monitor."""
    return Monitor(env, filename=None)


vec_env = DummyVecEnv([_monitored_env])

In [24]:
# ✅ Train DQN Model
# Training is stochastic (exploration, replay sampling, random demand), so fix
# the seed to make a Restart & Run All produce comparable results.
SEED = 42
model = DQN(
    "MlpPolicy",
    vec_env,
    verbose=1,
    learning_rate=0.001,
    batch_size=32,
    seed=SEED,
)
model.learn(total_timesteps=5000)

Using cpu device
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 50        |
|    ep_rew_mean      | -2.38e+03 |
|    exploration_rate | 0.62      |
| time/               |           |
|    episodes         | 4         |
|    fps              | 1143      |
|    time_elapsed     | 0         |
|    total_timesteps  | 200       |
| train/              |           |
|    learning_rate    | 0.001     |
|    loss             | 46.3      |
|    n_updates        | 24        |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 50        |
|    ep_rew_mean      | -2.34e+03 |
|    exploration_rate | 0.24      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 1097      |
|    time_elapsed     | 0         |
|    total_timesteps  | 400       |
| train/              |           |
|    learning_rate    | 0.001     |
|    loss  

<stable_baselines3.dqn.dqn.DQN at 0x78fe4e00fdd0>

In [25]:
# Persist the trained DQN agent under a fixed checkpoint name.
checkpoint_name = "supply_chain_dqn"
model.save(checkpoint_name)

In [26]:
# Begin a fresh evaluation episode, then restore the agent from the
# "supply_chain_dqn" checkpoint written by the save cell.
obs = vec_env.reset()
model = DQN.load("supply_chain_dqn")

In [27]:
# Roll the trained policy forward for 20 steps and print the inventory level.
for _ in range(20):
    # Greedy actions for evaluation (no exploration noise).
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)

    # Call the wrapped env's own render(): it prints and needs no render_mode,
    # whereas VecEnv.render() is a no-op when no render_mode was configured.
    vec_env.env_method("render")

    # No manual reset here: a VecEnv resets a finished sub-environment by
    # itself, so when `done` is set `obs` is already the first observation of
    # the next episode (and the line printed above shows the reset state).
    # Note that VecEnv.reset() returns only the observation batch, so
    # unpacking it as `obs, _ = vec_env.reset()` would fail.


