# In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt


# Define Environment
class ProductAllocationEnv:
    """Simulated environment for allocating product inventory to shops.

    The observable state is a ``(num_products, num_shops)`` matrix holding the
    number of units allocated so far to each shop for each product.  An action
    is a ``(product, shop, allocation)`` triple.  Inventory and demand are
    re-drawn on every ``reset``; costs, margins, shipping costs and shelf
    lives are drawn once in ``__init__`` and kept for the environment's
    lifetime.
    """

    def __init__(self, num_products, num_shops, max_inventory):
        """Create the environment and draw its random parameters.

        Args:
            num_products: Number of distinct products.
            num_shops: Number of shops that can receive allocations.
            max_inventory: Exclusive upper bound for the initial per-product
                inventory; half of it (integer division) is the exclusive
                upper bound for per-shop demand.  Must be large enough that
                ``randint(10, max_inventory)`` and
                ``randint(1, max_inventory // 2)`` are valid ranges.
        """
        self.num_products = num_products
        self.num_shops = num_shops
        self.max_inventory = max_inventory
        self.state = np.zeros((num_products, num_shops))  # Allocation matrix
        self.inventory = np.random.randint(
            10, max_inventory, size=num_products
        )  # Different inventory for each product
        self.demand = np.random.randint(
            1, max_inventory // 2, size=(num_products, num_shops)
        )  # Different demand per shop per product
        self.safety_stock_ratio = 0.1  # 10% of inventory must be held
        self.product_cost = np.random.uniform(
            1, 5, size=num_products
        )  # Cost per product
        self.profit_margin = np.random.uniform(
            1.1, 2.0, size=(num_products, num_shops)
        )  # Profit per product
        self.shipping_cost = np.random.uniform(
            0.5, 2.0, size=(num_products, num_shops)
        )  # Shipping cost per unit
        self.shelf_life = np.random.randint(
            5, 30, size=self.num_products
        )  # Days left before expiry
        self.restock_cycle = 7  # Days before restocking happens

    def reset(self):
        """Start a new episode and return the flattened (all-zero) state.

        Clears the allocation matrix and draws fresh inventory and demand.
        Costs, margins, shipping costs and shelf lives are left unchanged.
        """
        self.state = np.zeros((self.num_products, self.num_shops))
        self.inventory = np.random.randint(
            10, self.max_inventory, size=self.num_products
        )  # Reset inventory per product
        self.demand = np.random.randint(
            1, self.max_inventory // 2, size=(self.num_products, self.num_shops)
        )  # Reset demand per product per shop
        return self.state.flatten()

    def step(self, action):
        """Apply one allocation and return ``(state, reward, done)``.

        Args:
            action: ``(product, shop, allocation)`` where ``allocation`` is
                the requested number of units.  The request is clamped to
                the stock available above the safety reserve and to the
                shop's unmet demand, and never goes below zero.

        Returns:
            A tuple of the flattened allocation matrix, the reward for this
            step, and a flag that is true once the episode is over.
        """
        product, shop, requested = action

        # The safety reserve is recomputed from the *current* inventory.
        safety_stock = int(
            np.ceil(self.inventory[product] * self.safety_stock_ratio)
        )
        available = self.inventory[product] - safety_stock
        unmet_demand = self.demand[product, shop] - self.state[product, shop]

        # Clamp to a whole, non-negative number of units.  Without the lower
        # bound a negative request would add stock back to the inventory and
        # drive the shop's allocation below zero; without the int() the value
        # could be a float because ``state`` is a float matrix.
        allocation = int(max(0, min(requested, available, unmet_demand)))

        self.inventory[product] -= allocation
        self.state[product, shop] += allocation

        # Reward Function: Profit Maximization & Expiry Consideration
        priority_factor = (
            1 / self.shelf_life[product]
        )  # Higher priority to soon-expiring products
        reward = (allocation * self.profit_margin[product, shop]) - (
            allocation * self.product_cost[product]
        )
        reward += (
            allocation * priority_factor
        )  # Encourage allocation of soon-expiring items
        reward -= (
            allocation * self.shipping_cost[product, shop]
        )  # Penalize high shipping cost

        # NOTE: the safety reserve is at least one unit for any positive
        # inventory, so allocations alone never bring a positive inventory
        # down to zero; in practice the episode ends when demand is met.
        done = np.all(self.state >= self.demand) or np.all(self.inventory == 0)
        return self.state.flatten(), reward, done

    def restock(self, day):
        """Add 10-49 units to every product when ``day`` is a multiple of the cycle.

        Day 0 counts as a restock day.  The added stock is not capped at
        ``max_inventory``.
        """
        if day % self.restock_cycle == 0:
            self.inventory += np.random.randint(
                10, 50, size=self.num_products
            )  # Simulate restocking

    def render(self):
        """Print the current inventory, demand and allocation matrix."""
        print("Inventory:", self.inventory)
        print("Demand:", self.demand)
        print("Current Allocation:", self.state)


# Q-Learning Agent
class QLearningAgent:
    """Epsilon-greedy agent with a Q-table indexed only by action.

    The table has shape ``(num_products, num_shops, max_inventory + 1)``,
    i.e. one value per ``(product, shop, allocation)`` action.  The
    environment state passed to the methods is accepted but not used.
    """

    def __init__(
        self, num_products, num_shops, max_inventory, alpha=0.1, gamma=0.9, epsilon=0.1
    ):
        """Create an agent with an all-zero Q-table.

        Args:
            num_products: Size of the product axis of the action space.
            num_shops: Size of the shop axis of the action space.
            max_inventory: Largest allocation that has a Q-table entry.
            alpha: Learning rate.
            gamma: Discount factor applied to the bootstrap value.
            epsilon: Probability of taking a random action.
        """
        self.num_products = num_products
        self.num_shops = num_shops
        self.q_table = np.zeros((num_products, num_shops, max_inventory + 1))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def choose_action(self, state):
        """Return a ``(product, shop, allocation)`` action.

        With probability ``epsilon`` a random action is drawn; otherwise the
        index of the largest Q-table entry is returned (the first one on
        ties).  ``state`` is not used.
        """
        if random.uniform(0, 1) < self.epsilon:
            product = np.random.randint(0, self.num_products)
            shop = np.random.randint(0, self.num_shops)
            # Explore allocations 1..9, but never beyond the last valid
            # index of the Q-table's allocation axis; otherwise a small
            # ``max_inventory`` makes update_q_table raise IndexError.
            upper = min(10, self.q_table.shape[2])
            allocation = np.random.randint(1, upper) if upper > 1 else 0
            return (product, shop, allocation)

        return np.unravel_index(
            np.argmax(self.q_table, axis=None), self.q_table.shape
        )

    def update_q_table(self, state, action, reward, next_state):
        """Move the Q-value of ``action`` towards ``reward + gamma * max(Q)``.

        The bootstrap term is the maximum over the whole table; ``state``
        and ``next_state`` are accepted but not used.
        """
        product, shop, allocation = action
        best_next_q = np.max(self.q_table)
        self.q_table[product, shop, allocation] += self.alpha * (
            reward + self.gamma * best_next_q - self.q_table[product, shop, allocation]
        )


# Train the RL Model
def train_agent(env, agent, episodes=1000, max_steps=None):
    """Run the training loop and return the total reward of every episode.

    Each step restocks the environment (the environment decides whether the
    current day is a restock day), asks the agent for an action, applies it
    and feeds the outcome back to the agent.  Progress is printed every 100
    episodes.

    Args:
        env: Environment exposing ``reset()``, ``restock(day)`` and
            ``step(action)`` returning ``(next_state, reward, done)``.
        agent: Agent exposing ``choose_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        episodes: Number of episodes to run.
        max_steps: Optional cap on the number of steps per episode.  ``None``
            (the default) keeps running until the environment reports
            ``done``, which never happens if the agent stops making
            progress; pass a limit to guarantee termination.

    Returns:
        A list with one total-reward entry per episode.
    """
    rewards = []
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        day = 0  # one step per simulated day

        while not done and (max_steps is None or day < max_steps):
            env.restock(day)
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            day += 1

        rewards.append(total_reward)
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

    return rewards


# Initialize environment & agent
# One set of dimensions shared by the environment and the agent's Q-table
problem_size = dict(num_products=10, num_shops=5, max_inventory=100)
env = ProductAllocationEnv(**problem_size)
agent = QLearningAgent(**problem_size)

# Run 500 training episodes and keep the per-episode total rewards
reward_history = train_agent(env, agent, episodes=500)

# Show how the total reward evolved over the episodes
plt.plot(reward_history)
plt.title("Q-Learning Training Progress")
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.show()

# Print the environment's final inventory, demand and allocation
env.render()

# In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt


# Define Environment
class ProductAllocationEnv:
    """Simulated environment for allocating product inventory to shops.

    The observable state is a ``(num_products, num_shops)`` matrix holding the
    number of units allocated so far to each shop for each product.  An action
    is a ``(product, shop, allocation)`` triple.  Inventory and demand are
    re-drawn on every ``reset``; costs, margins, shipping costs and shelf
    lives are drawn once in ``__init__`` and kept for the environment's
    lifetime.
    """

    def __init__(self, num_products, num_shops, max_inventory):
        """Create the environment and draw its random parameters.

        Args:
            num_products: Number of distinct products.
            num_shops: Number of shops that can receive allocations.
            max_inventory: Exclusive upper bound for the initial per-product
                inventory; half of it (integer division) is the exclusive
                upper bound for per-shop demand.  Must be large enough that
                ``randint(10, max_inventory)`` and
                ``randint(1, max_inventory // 2)`` are valid ranges.
        """
        self.num_products = num_products
        self.num_shops = num_shops
        self.max_inventory = max_inventory
        self.state = np.zeros((num_products, num_shops))  # Allocation matrix
        self.inventory = np.random.randint(
            10, max_inventory, size=num_products
        )  # Different inventory for each product
        self.demand = np.random.randint(
            1, max_inventory // 2, size=(num_products, num_shops)
        )  # Different demand per shop per product
        self.safety_stock_ratio = 0.1  # 10% of inventory must be held
        self.product_cost = np.random.uniform(
            1, 5, size=num_products
        )  # Cost per product
        self.profit_margin = np.random.uniform(
            1.1, 2.0, size=(num_products, num_shops)
        )  # Profit per product
        self.shipping_cost = np.random.uniform(
            0.5, 2.0, size=(num_products, num_shops)
        )  # Shipping cost per unit
        self.shelf_life = np.random.randint(
            5, 30, size=self.num_products
        )  # Days left before expiry
        self.restock_cycle = 7  # Days before restocking happens

    def reset(self):
        """Start a new episode and return the flattened (all-zero) state.

        Clears the allocation matrix and draws fresh inventory and demand.
        Costs, margins, shipping costs and shelf lives are left unchanged.
        """
        self.state = np.zeros((self.num_products, self.num_shops))
        self.inventory = np.random.randint(
            10, self.max_inventory, size=self.num_products
        )  # Reset inventory per product
        self.demand = np.random.randint(
            1, self.max_inventory // 2, size=(self.num_products, self.num_shops)
        )  # Reset demand per product per shop
        return self.state.flatten()

    def step(self, action):
        """Apply one allocation and return ``(state, reward, done)``.

        Args:
            action: ``(product, shop, allocation)`` where ``allocation`` is
                the requested number of units.  The request is clamped to
                the stock available above the safety reserve and to the
                shop's unmet demand, and never goes below zero.

        Returns:
            A tuple of the flattened allocation matrix, the reward for this
            step, and a flag that is true once the episode is over.
        """
        product, shop, requested = action

        # The safety reserve is recomputed from the *current* inventory.
        safety_stock = int(
            np.ceil(self.inventory[product] * self.safety_stock_ratio)
        )
        available = self.inventory[product] - safety_stock
        unmet_demand = self.demand[product, shop] - self.state[product, shop]

        # Clamp to a whole, non-negative number of units.  Without the lower
        # bound a negative request would add stock back to the inventory and
        # drive the shop's allocation below zero; without the int() the value
        # could be a float because ``state`` is a float matrix.
        allocation = int(max(0, min(requested, available, unmet_demand)))

        self.inventory[product] -= allocation
        self.state[product, shop] += allocation

        # Reward Function: Profit Maximization & Expiry Consideration
        priority_factor = (
            1 / self.shelf_life[product]
        )  # Higher priority to soon-expiring products
        reward = (allocation * self.profit_margin[product, shop]) - (
            allocation * self.product_cost[product]
        )
        reward += (
            allocation * priority_factor
        )  # Encourage allocation of soon-expiring items
        reward -= (
            allocation * self.shipping_cost[product, shop]
        )  # Penalize high shipping cost

        # NOTE: the safety reserve is at least one unit for any positive
        # inventory, so allocations alone never bring a positive inventory
        # down to zero; in practice the episode ends when demand is met.
        done = np.all(self.state >= self.demand) or np.all(self.inventory == 0)
        return self.state.flatten(), reward, done

    def restock(self, day):
        """Add 10-49 units to every product when ``day`` is a multiple of the cycle.

        Day 0 counts as a restock day.  The added stock is not capped at
        ``max_inventory``.
        """
        if day % self.restock_cycle == 0:
            self.inventory += np.random.randint(
                10, 50, size=self.num_products
            )  # Simulate restocking

    def render(self):
        """Print the current inventory, demand and allocation matrix."""
        print("Inventory:", self.inventory)
        print("Demand:", self.demand)
        print("Current Allocation:", self.state)


# Q-Learning Agent
class QLearningAgent:
    """Epsilon-greedy agent with a Q-table indexed only by action.

    The table has shape ``(num_products, num_shops, max_inventory + 1)``,
    i.e. one value per ``(product, shop, allocation)`` action.  The
    environment state passed to the methods is accepted but not used.
    """

    def __init__(
        self, num_products, num_shops, max_inventory, alpha=0.1, gamma=0.9, epsilon=0.1
    ):
        """Create an agent with an all-zero Q-table.

        Args:
            num_products: Size of the product axis of the action space.
            num_shops: Size of the shop axis of the action space.
            max_inventory: Largest allocation that has a Q-table entry.
            alpha: Learning rate.
            gamma: Discount factor applied to the bootstrap value.
            epsilon: Probability of taking a random action.
        """
        self.num_products = num_products
        self.num_shops = num_shops
        self.q_table = np.zeros((num_products, num_shops, max_inventory + 1))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def choose_action(self, state):
        """Return a ``(product, shop, allocation)`` action.

        With probability ``epsilon`` a random action is drawn; otherwise the
        index of the largest Q-table entry is returned (the first one on
        ties).  ``state`` is not used.
        """
        if random.uniform(0, 1) < self.epsilon:
            product = np.random.randint(0, self.num_products)
            shop = np.random.randint(0, self.num_shops)
            # Explore allocations 1..9, but never beyond the last valid
            # index of the Q-table's allocation axis; otherwise a small
            # ``max_inventory`` makes update_q_table raise IndexError.
            upper = min(10, self.q_table.shape[2])
            allocation = np.random.randint(1, upper) if upper > 1 else 0
            return (product, shop, allocation)

        return np.unravel_index(
            np.argmax(self.q_table, axis=None), self.q_table.shape
        )

    def update_q_table(self, state, action, reward, next_state):
        """Move the Q-value of ``action`` towards ``reward + gamma * max(Q)``.

        The bootstrap term is the maximum over the whole table; ``state``
        and ``next_state`` are accepted but not used.
        """
        product, shop, allocation = action
        best_next_q = np.max(self.q_table)
        self.q_table[product, shop, allocation] += self.alpha * (
            reward + self.gamma * best_next_q - self.q_table[product, shop, allocation]
        )


# Train the RL Model
def train_agent(env, agent, episodes=1000, max_steps=None):
    """Run the training loop and return the total reward of every episode.

    Each step restocks the environment (the environment decides whether the
    current day is a restock day), asks the agent for an action, applies it
    and feeds the outcome back to the agent.  Progress is printed every 100
    episodes.

    Args:
        env: Environment exposing ``reset()``, ``restock(day)`` and
            ``step(action)`` returning ``(next_state, reward, done)``.
        agent: Agent exposing ``choose_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        episodes: Number of episodes to run.
        max_steps: Optional cap on the number of steps per episode.  ``None``
            (the default) keeps running until the environment reports
            ``done``, which never happens if the agent stops making
            progress; pass a limit to guarantee termination.

    Returns:
        A list with one total-reward entry per episode.
    """
    rewards = []
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        day = 0  # one step per simulated day

        while not done and (max_steps is None or day < max_steps):
            env.restock(day)
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            day += 1

        rewards.append(total_reward)
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

    return rewards


# Initialize environment & agent
# One set of dimensions shared by the environment and the agent's Q-table
problem_size = dict(num_products=10, num_shops=5, max_inventory=100)
env = ProductAllocationEnv(**problem_size)
agent = QLearningAgent(**problem_size)

# Run 500 training episodes and keep the per-episode total rewards
reward_history = train_agent(env, agent, episodes=500)

# Show how the total reward evolved over the episodes
plt.plot(reward_history)
plt.title("Q-Learning Training Progress")
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.show()

# Print the environment's final inventory, demand and allocation
env.render()