<a href="https://colab.research.google.com/github/sjpritchard2001/MSBA-Team-14/blob/main/PerishableGoodsReinforcementLearningEx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

# Define the environment for perishable goods pricing
class PerishableGoodsEnvironment:
    """Toy pricing simulation for a product with a fixed shelf life.

    One episode covers the life of a single product: every call to
    ``step`` sets a price, earns that day's revenue and uses up one day of
    shelf life.  The observation handed back to the caller is simply the
    number of days remaining.
    """

    def __init__(self, max_shelf_life, max_price, min_price):
        """Store the shelf life and price bounds, then start a first episode.

        Args:
            max_shelf_life: Number of days (steps) a product lasts.
            max_price: Upper bound applied to every chosen price.
            min_price: Lower bound applied to every chosen price.
        """
        self.max_shelf_life = max_shelf_life
        self.max_price = max_price
        self.min_price = min_price
        self.reset()

    def reset(self):
        """Begin a new episode and return the initial remaining shelf life.

        The price is seeded with a uniform random value between the two
        bounds; it is replaced by the clipped action on the first ``step``.
        """
        self.shelf_life = self.max_shelf_life
        self.price = random.uniform(self.min_price, self.max_price)
        return self.shelf_life

    def step(self, action):
        """Apply a price for one day.

        Args:
            action: Requested price; clipped into ``[min_price, max_price]``.

        Returns:
            Tuple ``(shelf_life, reward, done)`` where ``shelf_life`` is the
            number of days left after this step, ``reward`` is the day's
            revenue (minus 50 on the step where the product expires) and
            ``done`` is True once no shelf life remains.
        """
        # Keep the price inside the allowed band before using it.
        self.price = np.clip(action, self.min_price, self.max_price)

        # Revenue for the day: units demanded at this price times the price.
        revenue = self.calculate_demand(self.price) * self.price

        # One day passes regardless of the price chosen.
        self.shelf_life -= 1
        expired = self.shelf_life <= 0

        if expired:
            # Flat penalty charged on the step at which the product expires.
            revenue -= 50

        return self.shelf_life, revenue, expired

    def calculate_demand(self, price):
        """Return the linear demand at ``price``, never below zero.

        Demand falls in a straight line from 100 units at ``min_price`` to
        0 units at ``max_price``; prices above ``max_price`` yield 0.
        """
        max_demand = 100  # Units demanded when the price equals min_price
        price_span = self.max_price - self.min_price
        demand = max_demand * (self.max_price - price) / price_span
        return max(0, demand)

In [5]:
class QLearningAgent:
    """Epsilon-greedy value learner over a fixed grid of candidate prices.

    The Q-table holds one value per action and is NOT indexed by state, so
    the ``state``/``next_state`` arguments of ``learn`` are accepted for
    interface compatibility but do not influence the update.
    """

    def __init__(
        self,
        action_space,
        learning_rate=0.1,
        discount_factor=0.9,
        exploration_rate=1.0,
        exploration_decay=0.995,
    ):
        """Create an agent with an all-zero Q-table.

        Args:
            action_space: Sequence (list, tuple or ndarray) of selectable
                prices.
            learning_rate: Step size of each Q-value update.
            discount_factor: Weight of the bootstrapped future value.
            exploration_rate: Initial probability of picking a random action.
            exploration_decay: Factor the exploration rate is multiplied by
                after every ``learn`` call.
        """
        # np.asarray returns an ndarray input unchanged and converts lists
        # or tuples, so the element-wise lookup in learn() works for both.
        self.action_space = np.asarray(action_space)
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = np.zeros(len(self.action_space))  # One Q-value per action

    def choose_action(self):
        """Return a price: random with probability ``exploration_rate``,
        otherwise the one whose Q-value is currently highest."""
        if random.uniform(0, 1) < self.exploration_rate:
            # Exploration: choose random action (price)
            return random.choice(self.action_space)
        # Exploitation: choose action with the highest Q-value
        return self.action_space[np.argmax(self.q_table)]

    def _action_index(self, action):
        """Map a price back to its position in ``action_space``."""
        # Nearest-match is robust to floating-point noise; for an exact match
        # it yields the first matching index.
        nearest = int(np.argmin(np.abs(self.action_space - action)))
        if not np.isclose(self.action_space[nearest], action):
            raise ValueError(
                f"action {action!r} is not one of the prices in action_space"
            )
        return nearest

    def learn(self, state, action, reward, next_state, done=False):
        """Update the Q-value of ``action`` and decay the exploration rate.

        Args:
            state: Unused (the Q-table is not indexed by state).
            action: Price that was played; must be a value from
                ``action_space``.
            reward: Reward observed for playing ``action``.
            next_state: Unused (the Q-table is not indexed by state).
            done: When True the transition is terminal and no future value is
                bootstrapped.  Defaults to False, which always bootstraps.

        Raises:
            ValueError: If ``action`` does not match any price in
                ``action_space``.
        """
        action_index = self._action_index(action)

        # A terminal transition has no future return to bootstrap from.
        future_value = 0.0 if done else self.q_table[np.argmax(self.q_table)]

        # Update Q-value using the Q-learning update rule
        td_target = reward + self.discount_factor * future_value
        self.q_table[action_index] += self.learning_rate * (
            td_target - self.q_table[action_index]
        )

        # Decay happens once per learn() call (i.e. per step, not per episode).
        self.exploration_rate *= self.exploration_decay

In [6]:
# Set up the environment and agent.
# These module-level settings define the pricing experiment: the agent picks
# prices from `action_space`, and the environment clips every price it is
# given into [min_price, max_price].
max_shelf_life = 10  # Product shelf life in days; each episode runs this many steps
max_price = 20  # Maximum price
min_price = 5  # Minimum price
action_space = np.linspace(min_price, max_price, 20)  # 20 evenly spaced candidate prices, endpoints included

# Initialize environment and agent
env = PerishableGoodsEnvironment(max_shelf_life, max_price, min_price)
agent = QLearningAgent(action_space)

# Training loop: play `num_episodes` episodes, updating the agent after every step.
num_episodes = 5000
for episode in range(num_episodes):
    state = env.reset()  # Start a new episode; state is the remaining shelf life
    done = False
    total_reward = 0  # Sum of the per-step rewards of the current episode

    # Keep pricing until the environment reports that the product has expired.
    while not done:
        action = agent.choose_action()  # Choose an action (price)
        next_state, reward, done = env.step(action)  # Take a step in the environment
        agent.learn(state, action, reward, next_state)  # Update the agent's Q-table

        state = next_state  # Move to the next state
        total_reward += reward  # Accumulate reward for this episode

    # Report the total reward of every 1000th episode (0, 1000, 2000, ...)
    if episode % 1000 == 0:
        print(f"Episode {episode}/{num_episodes} - Total Reward: {total_reward}")

print("Training complete.")

Episode 0/5000 - Total Reward: 4443.074792243768
Episode 1000/5000 - Total Reward: 6390.4432132964
Episode 2000/5000 - Total Reward: 6390.4432132964
Episode 3000/5000 - Total Reward: 6390.4432132964
Episode 4000/5000 - Total Reward: 6390.4432132964
Training complete.
