In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell runs, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np

from env import MeatBuyingDiscreteEnv
from model import MeatBuyingQLAgent

In [None]:
# Fixed-seed generator that is handed to the environment.
rng = np.random.default_rng(seed=44)

# Environment with N=10, driven by the generator above.
env = MeatBuyingDiscreteEnv(N=10, rng=rng)

# Bare trailing expression: the notebook displays the environment's repr.
env

<env.MeatBuyingDiscreteEnv at 0x111acb700>

In [57]:
# Show the halal flags, discount flags and days-to-expiry values, one per line.
print(env.is_halal, env.is_discounted, env.days_to_expiry, sep="\n")

[0 0 0 1 0 1 0 0 1 1]
[0 0 0 0 0 0 0 0 0 0]
[9 2 7 8 6 1 7 5 9 2]


In [76]:
# Build the agent from the environment's option count and per-option action count.
agent = MeatBuyingQLAgent(env.N, env.num_actions)

# Number of entries in the Q-table: the product of its dimensions.
q_table_size = np.prod(agent.q_table.shape)

print("Meat options =", env.N)
print("Num actions =", env.num_actions)
print(f"Q-table size (num actions ^ N options) = {q_table_size:,}")

Meat options = 10
Num actions = 5
Q-table size (num actions ^ N options) = 9,765,625


In [12]:
# Training schedule: number of episodes to run and how often to print a report.
MAX_EPISODE = 10_000
EVAL_EVERY = 200

# Shopping constraints forwarded to the environment constructor.
BUDGET = 50
REQUIRED_GRAMS = 2500

# Fresh fixed-seed generator, environment and agent for the training run.
rng = np.random.default_rng(seed=44)
env = MeatBuyingDiscreteEnv(
    rng=rng,
    N=10,
    budget=BUDGET,
    required_grams=REQUIRED_GRAMS,
)
agent = MeatBuyingQLAgent(env.N, env.num_actions)

In [13]:
def _print_decision_summary(env, best_action):
    """Print a per-meat BUY/SKIP report for one joint action.

    Args:
        env: Environment exposing ``weight_options``, ``is_halal``,
            ``is_discounted``, ``days_to_expiry`` and ``total_cost``.
        best_action: One weight-option index per meat. An index of 0 is
            reported as SKIP; any larger index is reported as a purchase of
            ``env.weight_options[index]`` grams.
    """
    print(f"Best actions per meat (weight indices): {best_action}")
    print(f"Best weights per meat: {[env.weight_options[a] for a in best_action]}")

    print("DECISION SUMMARY")
    print("=" * 50)
    for idx, weight_idx in enumerate(best_action):
        if weight_idx <= 0:
            print(f"MEAT {idx + 1}: SKIP")
            print("-" * 30)
            continue

        halal_status = "✅ Halal" if env.is_halal[idx] == 1 else "❌ Haram"
        discounted_status = "💰 Discounted" if env.is_discounted[idx] == 1 else "No discount"
        weight_str = f"{env.weight_options[weight_idx]} grams"

        print(f"MEAT {idx + 1}")
        print(f"  Action: BUY {weight_str}")
        print(f"  Halal Status: {halal_status}")
        print(f"  Discount Status: {discounted_status}")
        print(f"  Days to expiry: {env.days_to_expiry[idx]} day(s)")
        print("-" * 50)

    # NOTE(review): total_cost is read from the environment as the training loop
    # left it; best_action is never applied to the environment before this read,
    # so the figure may not correspond to the action printed above -- confirm.
    print(f"TOTAL COST SPENT: £{env.total_cost:.2f}")
    print("=" * 50 + "\n")


for episode in range(MAX_EPISODE):
    # The state is tracked here, but neither select_action nor update receives it.
    state = env.reset(rng=rng)
    done = False
    while not done:
        action = agent.select_action(rng=rng)
        next_state, reward, done, _ = env.step(action)
        agent.update(action, reward)
        state = next_state

    # Report every EVAL_EVERY episodes, and also after the very last episode so
    # the fully trained Q-table is always summarised.
    is_last_episode = episode == MAX_EPISODE - 1
    if episode % EVAL_EVERY == 0 or is_last_episode:
        print(f"Episode {episode}")
        # Greedy joint action: convert the flat argmax of the Q-table back into
        # one index per Q-table dimension, as plain Python ints.
        flat_best = np.argmax(agent.q_table, axis=None)
        best_action = [i.item() for i in np.unravel_index(flat_best, agent.q_table.shape)]
        _print_decision_summary(env, best_action)

Episode 0
Best actions per meat (weight indices): [0, 0, 0, 0, 0, 0, 0, 0, 0, 3]
Best weights per meat: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1000]
DECISION SUMMARY
MEAT 1: SKIP
------------------------------
MEAT 2: SKIP
------------------------------
MEAT 3: SKIP
------------------------------
MEAT 4: SKIP
------------------------------
MEAT 5: SKIP
------------------------------
MEAT 6: SKIP
------------------------------
MEAT 7: SKIP
------------------------------
MEAT 8: SKIP
------------------------------
MEAT 9: SKIP
------------------------------
MEAT 10
  Action: BUY 1000 grams
  Halal Status: ❌ Haram
  Discount Status: No discount
  Days to expiry: 3 day(s)
--------------------------------------------------
TOTAL COST SPENT: £200.00

Episode 200
Best actions per meat (weight indices): [0, 0, 0, 0, 0, 0, 1, 2, 3, 4]
Best weights per meat: [0, 0, 0, 0, 0, 0, 250, 500, 1000, 2000]
DECISION SUMMARY
MEAT 1: SKIP
------------------------------
MEAT 2: SKIP
------------------------------
MEAT