<a href="https://colab.research.google.com/github/rani227/SmartChainRL/blob/main/SmartChainRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q stable-baselines3[extra] optuna sb3-contrib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.5/184.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
%%writefile supply_chain_discrete_env.py
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class SupplyChainDiscreteEnv(gym.Env):
    """Shipment-allocation environment with 3 warehouses and 5 stores.

    Action: a ``MultiDiscrete`` vector with one entry per (warehouse, store)
    pair, laid out warehouse-major. Each entry is a bucket in ``0..10`` and
    is converted to a requested quantity of ``bucket * 10`` units.

    Observation: the current warehouse stock followed by the current store
    demand, as a ``float32`` vector of length ``num_warehouses + num_stores``.

    All random draws use ``self.np_random`` (the generator that
    ``gym.Env.reset(seed=...)`` seeds), so a seeded reset yields a
    reproducible rollout.
    """

    def __init__(self):
        super().__init__()
        self.num_warehouses = 3
        self.num_stores = 5
        self.bucket_levels = 11
        self.max_stock = 150
        self.max_demand = 30

        self.action_space = spaces.MultiDiscrete(
            [self.bucket_levels] * (self.num_warehouses * self.num_stores)
        )
        self.observation_space = spaces.Box(
            low=0,
            high=200,
            shape=(self.num_warehouses + self.num_stores,),
            dtype=np.float32,
        )
        # Populate the state attributes so the env is usable right after
        # construction, as in the original implementation.
        self.reset()

    def _observation(self):
        """Return stock and demand concatenated as a float32 vector."""
        return np.concatenate(
            (self.warehouse_stock, self.store_demand)
        ).astype(np.float32)

    def reset(self, seed=None, options=None):
        """Refill every warehouse to ``max_stock`` and draw fresh demand.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = 0
        self.warehouse_stock = np.full(self.num_warehouses, self.max_stock)
        # Demand per store is drawn from [10, max_demand).
        self.store_demand = self.np_random.integers(
            10, self.max_demand, size=self.num_stores
        )
        return self._observation(), {}

    def step(self, action):
        """Ship goods, apply disruption and replenishment, and score the step.

        Args:
            action: Sequence of ``num_warehouses * num_stores`` bucket
                indices, warehouse-major.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` and ``truncated`` are always ``False`` and
            ``info`` is an empty dict.
        """
        self.current_step += 1
        fulfilled = np.zeros(self.num_stores)
        delivery_cost = 0

        # Requests are served in order, so earlier stores can exhaust a
        # warehouse before later stores are served from it.
        for w in range(self.num_warehouses):
            for s in range(self.num_stores):
                idx = w * self.num_stores + s
                qty_to_send = min(action[idx] * 10, self.warehouse_stock[w])
                fulfilled[s] += qty_to_send
                self.warehouse_stock[w] -= qty_to_send
                delivery_cost += 0.4 * qty_to_send

        shortfall = np.maximum(0, self.store_demand - fulfilled)
        surplus = np.maximum(0, fulfilled - self.store_demand)
        stockout_penalty = np.sum(shortfall) * 1.5
        oversupply_penalty = np.sum(surplus) * 0.3

        # With 5% probability one randomly chosen warehouse loses its stock.
        if self.np_random.random() < 0.05:
            disrupted = self.np_random.integers(0, self.num_warehouses)
            self.warehouse_stock[disrupted] = 0

        # Replenishment scales with the mean of the demand just served and
        # is capped so stock never exceeds max_stock.
        avg_demand = np.mean(self.store_demand)
        replenishment = self.np_random.integers(
            int(avg_demand / 2), int(avg_demand * 1.2), size=self.num_warehouses
        )
        self.warehouse_stock = np.minimum(
            self.warehouse_stock + replenishment, self.max_stock
        )

        # +5 for every warehouse holding at least 30% of max_stock.
        well_stocked = self.warehouse_stock >= 0.3 * self.max_stock
        buffer_bonus = 5 * int(np.count_nonzero(well_stocked))

        # Draw the demand that the next action will have to serve.
        self.store_demand = self.np_random.integers(
            10, self.max_demand, size=self.num_stores
        )
        reward = -(delivery_cost + stockout_penalty + oversupply_penalty) + buffer_bonus

        return self._observation(), reward, False, False, {}

Writing supply_chain_discrete_env.py


In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from supply_chain_discrete_env import SupplyChainDiscreteEnv


# DummyVecEnv takes a list of zero-argument factories; the environment class
# itself is such a factory.
env = DummyVecEnv([SupplyChainDiscreteEnv])

# PPO hyperparameters, collected in one place.
ppo_settings = dict(
    learning_rate=3e-4,
    gamma=0.99,
    n_steps=1024,
    batch_size=64,
    policy_kwargs=dict(net_arch=[128, 128]),
    verbose=1,
)

model = PPO("MlpPolicy", env, **ppo_settings)

# Train, then write the model to disk under the name the dashboard loads.
model.learn(total_timesteps=100_000)
model.save("smartchain_ppo_default_model")

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 197  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 1024 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0058960877 |
|    clip_fraction        | 0.0257       |
|    clip_range           | 0.2          |
|    entropy_loss         | -36          |
|    explained_variance   | -0.000206    |
|    learning_rate        | 0.0003       |
|    loss                 | 2.46e+06     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0238      |
|    value_loss           | 4.97e+06     |
------------------------------------------

In [None]:
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m358.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%%writefile dashboard.py
import streamlit as st
import numpy as np
import pandas as pd
from supply_chain_discrete_env import SupplyChainDiscreteEnv
from stable_baselines3 import PPO

st.set_page_config(layout="wide")
st.title("📦 SmartChain RL Agent Simulation Dashboard")


@st.cache_resource
def load_model():
    """Load the saved PPO model, cached by Streamlit across script reruns.

    Without the cache, ``PPO.load`` would run from disk each time this
    script executes.
    """
    return PPO.load("smartchain_ppo_default_model")


model = load_model()

# Sidebar controls: rollout length and whether to run the random baseline.
st.sidebar.header("⚙️ Simulation Settings")
num_steps = st.sidebar.slider("Number of steps", min_value=20, max_value=200, value=100)
compare_with_random = st.sidebar.checkbox("Compare with Random Agent", value=True)

def _delivered_per_store(action, stock, num_stores, units_per_bucket=10):
    """Return the units each store receives when `action` is applied to `stock`.

    Mirrors the allocation rule in ``SupplyChainDiscreteEnv.step``: the flat
    action is warehouse-major, each bucket is worth ``units_per_bucket``
    units, and requests are served in order, capped by what is left in the
    warehouse.
    """
    remaining = np.array(stock)  # copy, so the env's own stock is untouched
    requested = np.asarray(action).reshape(len(remaining), num_stores) * units_per_bucket
    delivered = np.zeros(num_stores, dtype=remaining.dtype)
    for w, row in enumerate(requested):
        for s, qty in enumerate(row):
            sent = min(qty, remaining[w])
            delivered[s] += sent
            remaining[w] -= sent
    return delivered


def run_simulation(agent=None):
    """Roll out one agent for ``num_steps`` steps in a fresh environment.

    Args:
        agent: ``"ppo"`` to act with the loaded model (deterministically) or
            ``"random"`` to sample from the action space.

    Returns:
        Dict of per-step lists: ``"warehouse"`` and ``"demand"`` (state after
        each step), ``"reward"``, ``"shipment"`` (the raw action) and
        ``"stockout"`` (unmet units for the demand that step served).

    Raises:
        ValueError: If ``agent`` is neither ``"ppo"`` nor ``"random"``.
    """
    env = SupplyChainDiscreteEnv()
    obs, _ = env.reset()

    warehouse_stock, store_demand, reward_list, shipment_log, stockouts = [], [], [], [], []

    for _ in range(num_steps):
        if agent == "ppo":
            action, _ = model.predict(obs, deterministic=True)
        elif agent == "random":
            action = env.action_space.sample()
        else:
            raise ValueError("Unsupported agent")

        # Capture the demand and deliveries for THIS step before env.step()
        # overwrites store_demand with the next period's draw.
        demand_served = env.store_demand.copy()
        delivered = _delivered_per_store(action, env.warehouse_stock, env.num_stores)

        obs, reward, terminated, truncated, _ = env.step(action)
        warehouse_stock.append(env.warehouse_stock.copy())
        store_demand.append(env.store_demand.copy())
        reward_list.append(reward)
        shipment_log.append(action)
        stockouts.append(np.sum(np.maximum(demand_served - delivered, 0)))

    return {
        "warehouse": warehouse_stock,
        "demand": store_demand,
        "reward": reward_list,
        "shipment": shipment_log,
        "stockout": stockouts
    }

# --- PPO rollout -----------------------------------------------------------
st.subheader("🤖 PPO Agent Performance")
ppo_data = run_simulation(agent="ppo")

# Warehouse stock, one column per warehouse (W1..W3).
st.subheader("📊 Warehouse Stock Levels Over Time")
warehouse_columns = [f"W{n}" for n in range(1, 4)]
df_warehouse = pd.DataFrame(ppo_data["warehouse"], columns=warehouse_columns)
st.line_chart(df_warehouse)

# Store demand, one column per store (S1..S5).
st.subheader("🛒 Store Demands Over Time")
store_columns = [f"S{n}" for n in range(1, 6)]
df_demand = pd.DataFrame(ppo_data["demand"], columns=store_columns)
st.line_chart(df_demand)

# Per-step reward of the PPO agent.
st.subheader("💰 Reward Over Time")
ppo_rewards = ppo_data["reward"]
df_reward = pd.DataFrame({"Reward": ppo_rewards})
st.line_chart(df_reward)

# Summary figures for the PPO rollout.
avg_reward = np.mean(ppo_rewards)
total_stockouts = np.sum(ppo_data["stockout"])
st.success(f"**Average Reward:** {avg_reward:.2f}")
st.error(f"**Total Stockouts:** {total_stockouts}")

# --- Optional random baseline ----------------------------------------------
if compare_with_random:
    st.subheader("🎲 Random Agent Comparison")
    random_data = run_simulation(agent="random")
    random_rewards = random_data["reward"]
    reward_comparison = pd.DataFrame(
        {"PPO Reward": ppo_rewards, "Random Reward": random_rewards}
    )
    st.line_chart(reward_comparison)
    random_avg = np.mean(random_rewards)
    st.warning(f"📉 Random Agent Avg Reward: {random_avg:.2f}")

Writing dashboard.py


In [None]:
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

--2025-07-14 15:16:53--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.7.0/cloudflared-linux-amd64 [following]
--2025-07-14 15:16:53--  https://github.com/cloudflare/cloudflared/releases/download/2025.7.0/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/37d2bad8-a2ed-4b93-8139-cbb15162d81d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250714%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250714T151553Z&X-Amz-Expires=1800&X-Amz-Signature=557f3d0cd5bd737bfa251a7a402435f2fad8778ffbe7d03c5261405cbc8bc248&X-Am

In [None]:
!streamlit run dashboard.py &>/content/log.txt &
!./cloudflared tunnel --url http://localhost:8501

[90m2025-07-14T15:16:57Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-07-14T15:16:57Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-07-14T15:17:01Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-07-14T15:17:01Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025