# Step 8: Reinforcement Learning for Pricing

## 8.1 Define the State-Space

In [None]:
import numpy as np
import pandas as pd
import random

# Demand categories and the integer codes that represent them inside a state
demand_levels = {label: code for code, label in enumerate(("Low", "Medium", "High"))}

# Ten evenly spaced price points from 0.10 to 0.50, rounded to two decimals.
# The competitor price uses the same grid values but lives in its own array.
price_bins = np.linspace(0.10, 0.50, num=10).round(2)
competitor_bins = price_bins.copy()

# Available price adjustments, in column order: decrease, keep, increase
actions = [-0.05, 0.00, 0.05]

# Every (demand, price, competitor price) combination; demand varies slowest
state_space = [
    (demand_code, own_price, rival_price)
    for demand_code in demand_levels.values()
    for own_price in price_bins
    for rival_price in competitor_bins
]

# Q-table: one row per state (keyed by the numeric state tuple, i.e. the
# prices are floats, not strings), one column per action, all values 0.0
state_index = pd.MultiIndex.from_tuples(
    state_space,
    names=["Demand Level", "Current Price", "Competitor Price"],
)
Q_table = pd.DataFrame(0.0, index=state_index, columns=actions)


## 8.2 Implement Q-Learning for Pricing

In [None]:
def calculate_reward(price, demand, competitor_price):
    """
    Score a price choice for one step of the pricing simulation.

    The score is the revenue (price times an estimated demand volume, plus a
    flat bonus in the high-demand case) minus a penalty for pricing above the
    competitor.

    Parameters
    ----------
    price : float
        Our price after the chosen adjustment.
    demand : int
        Demand code: 0 (low), 1 (medium) or 2 (high). Any other value raises
        ``KeyError``.
    competitor_price : float
        The competitor's price.

    Returns
    -------
    float
        Revenue minus the overpricing penalty.
    """
    price_gap = price - competitor_price

    # Baseline volume per demand code, shrinking exponentially as we price
    # above the competitor (and growing when we undercut them)
    volume_by_demand = {0: 5, 1: 15, 2: 30}
    expected_volume = volume_by_demand[demand] * np.exp(-1.5 * price_gap)

    # Flat bonus only when demand is high and we are not undercutting
    if demand == 2 and price >= competitor_price:
        bonus = 5
    else:
        bonus = 0

    # Linear penalty that applies only when priced above the competitor
    overpricing_cost = price_gap * 3
    if not overpricing_cost > 0:
        overpricing_cost = 0

    return price * expected_volume + bonus - overpricing_cost


# Q-Learning Hyperparameters
alpha = 0.2  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_decay = 0.997  # Slower decay to keep exploring longer
epsilon_min = 0.1  # Exploration rate never drops below this value
episodes = 1000  # 1000 for better learning
steps_per_episode = 10  # Price adjustments simulated in each episode


def _nearest_price_bin(value):
    """Return the entry of ``price_bins`` that is closest to ``value``.

    The action step (0.05) is not a multiple of the grid spacing, so a raw
    ``price + action`` would leave the discretised state space. Snapping to
    the nearest bin keeps every state inside the rows Q_table already has,
    and also keeps the price within the lowest and highest bin.
    """
    return price_bins[np.abs(price_bins - value).argmin()]


# Training the Q-Learning model
for episode in range(episodes):
    # Random starting state. Prices stay numeric (values taken straight from
    # the bins) so the state tuple matches Q_table's numeric MultiIndex;
    # string keys would never match and would only append duplicate rows.
    demand = random.choice(list(demand_levels.values()))
    price = random.choice(price_bins)
    competitor_price = random.choice(competitor_bins)

    state = (demand, price, competitor_price)

    for step in range(steps_per_episode):
        # Choose action (epsilon-greedy policy)
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)  # Exploring
        else:
            # Exploiting best action; idxmax returns the first maximal
            # column, so ties resolve to the earliest entry in `actions`
            action = Q_table.loc[state, :].idxmax()

        # Apply the action to the CURRENT price and snap back onto the grid
        new_price = _nearest_price_bin(price + action)
        new_state = (demand, new_price, competitor_price)

        # Reward is evaluated at the price we moved to
        reward = calculate_reward(float(new_price), demand, float(competitor_price))

        # Q-Learning update rule
        best_next_value = Q_table.loc[new_state, :].max()
        Q_table.loc[state, action] = (
            (1 - alpha) * Q_table.loc[state, action]
            + alpha * (reward + gamma * best_next_value)
        )

        # Advance both the state and the running price, so the next step
        # starts from where this one ended
        state = new_state
        price = new_price

    # Reduce exploration over time without undershooting the floor
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Greedy policy: for each state, the action with the highest learned Q-value
optimized_pricing_policy = Q_table.idxmax(axis=1).reset_index()
optimized_pricing_policy.columns = ["Demand Level", "Current Price", "Competitor Price", "Optimal Price Adjustment"]
optimized_pricing_policy


Unnamed: 0,Demand Level,Current Price,Competitor Price,Optimal Price Adjustment
0,0,0.1,0.1,-0.05
1,0,0.1,0.14,-0.05
2,0,0.1,0.19,-0.05
3,0,0.1,0.23,-0.05
4,0,0.1,0.28,-0.05
...,...,...,...,...
817,2,0.15,0.1,-0.05
818,1,0.42,0.19,-0.05
819,0,0.27,0.19,-0.05
820,2,0.15,0.37,-0.05
