In [1]:
#Cellule 1 — Imports
#But : importer les librairies nécessaires (NumPy pour les calculs, random pour le hasard).

import numpy as np
import random


In [2]:
#Cellule 2 — Nouvelle classe d’environnement avec durée des actions

#Les règles deviendront :

#P1 : récompense +2, consomme 1 matière, prend 1 unité de temps

#P2 : récompense +20, consomme 2 matières, prend 3 unités de temps

#Commander : récompense −5, ajoute 5 matières, prend 1 unité de temps

#Attendre : récompense 0 (ou −1 si le stock de matières est vide), prend 1 unité de temps


class WorkshopEnvV2:
    """
    Workshop environment in which actions take a variable amount of time.

    State: the tuple ``(stock_raw, stock_sell)``, both clamped to
    ``[0, MAX_STOCK]`` after every step.

    Actions:
        0 -- wait: reward 0, or -1 when the raw stock is empty.
        1 -- produce product 1: uses 1 raw unit, reward +2.
        2 -- produce product 2: uses 2 raw units, reward +20.
        3 -- order raw material: adds 5 raw units, reward -5.

    A production action attempted without enough raw material changes no
    stock and earns 0, but still consumes its full duration.
    """

    # Upper bound applied to both stocks; anything above it is discarded.
    MAX_STOCK = 10

    # Time units consumed by each action.
    DURATION_WAIT = 1
    DURATION_P1 = 1
    DURATION_P2 = 3
    DURATION_ORDER = 1

    def __init__(self, max_steps=50):
        """Create the environment and immediately reset it.

        Args:
            max_steps: time budget of an episode, in time units.
        """
        self.max_steps = max_steps
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state.

        The raw stock is drawn uniformly from 0..MAX_STOCK with the global
        NumPy RNG; the sellable stock and the elapsed time start at 0.

        Returns:
            The tuple ``(stock_raw, stock_sell)``.
        """
        self.stock_raw = np.random.randint(0, self.MAX_STOCK + 1)
        self.stock_sell = 0

        # Elapsed time, in time units (not in number of decisions).
        self.steps = 0

        return (self.stock_raw, self.stock_sell)

    def step(self, action):
        """Apply one action and advance time by that action's duration.

        Args:
            action: one of 0 (wait), 1 (product 1), 2 (product 2), 3 (order).

        Returns:
            ``(next_state, reward, done, info)`` where ``next_state`` is
            ``(stock_raw, stock_sell)``, ``done`` is True once the elapsed
            time has reached ``max_steps``, and ``info`` is an empty dict.

        Raises:
            ValueError: if ``action`` is not one of the four known actions.
                Nothing is modified in that case. Silently ignoring the
                action would leave the clock untouched, so a caller looping
                until ``done`` could spin forever.
        """
        reward = 0

        if action == 1:
            # Product 1: needs one raw unit; the time is spent either way.
            if self.stock_raw >= 1:
                self.stock_raw -= 1
                self.stock_sell += 1
                reward = 2
            self.steps += self.DURATION_P1

        elif action == 2:
            # Product 2: needs two raw units; the time is spent either way.
            if self.stock_raw >= 2:
                self.stock_raw -= 2
                self.stock_sell += 1
                reward = 20
            self.steps += self.DURATION_P2

        elif action == 3:
            # Ordering always costs 5, even when the clamp below discards
            # part (or all) of the delivery.
            self.stock_raw += 5
            reward = -5
            self.steps += self.DURATION_ORDER

        elif action == 0:
            # Waiting is penalised only while the raw stock is empty.
            reward = -1 if self.stock_raw == 0 else 0
            self.steps += self.DURATION_WAIT

        else:
            raise ValueError(
                f"Unknown action {action!r}; expected 0, 1, 2 or 3."
            )

        # Keep both stocks inside [0, MAX_STOCK].
        self.stock_raw = max(0, min(self.MAX_STOCK, self.stock_raw))
        self.stock_sell = max(0, min(self.MAX_STOCK, self.stock_sell))

        # The last action may overshoot the budget (e.g. a 3-unit action
        # started one unit before the end); the episode simply ends then.
        done = self.steps >= self.max_steps

        next_state = (self.stock_raw, self.stock_sell)
        return next_state, reward, done, {}


In [3]:
# Cell 3 — Seeding and environment instantiation
# Seed both random generators used by this notebook before the first random
# draw: WorkshopEnvV2.reset() uses np.random and choose_action() uses the
# stdlib random module. Without this, a fresh "Restart & Run All" produces a
# different training run every time.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

env = WorkshopEnvV2(max_steps=50)


In [4]:
# Cell 4 — Q-table and hyperparameters

# Size of each state dimension and number of available actions.
n_states_raw, n_states_sell = 11, 11
n_actions = 4

# One Q-value per (raw stock, sellable stock, action), all starting at zero.
Q = np.zeros(shape=(n_states_raw, n_states_sell, n_actions))

# Learning rate and discount factor used in the Q-learning update.
alpha, gamma = 0.1, 0.95

# Exploration rate: initial value, floor, and per-episode decay factor.
epsilon, epsilon_min, epsilon_decay = 1.0, 0.1, 0.995


In [5]:
# Cellule 5 — Choix d'action ε-greedy

def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the action with the largest Q-value for this state is returned
    (the lowest index wins ties, as with ``argmax``).
    """
    raw_level, sell_level = state

    explore = random.random() < epsilon
    if not explore:
        return int(Q[raw_level, sell_level].argmax())

    return random.randint(0, n_actions - 1)


In [6]:
# Cell 6 — Training loop (tabular Q-learning)

n_episodes = 5000
rewards_per_episode = []  # undiscounted return of each episode, in order

for episode in range(n_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = choose_action(state, epsilon)

        next_state, reward, done, info = env.step(action)

        sr, ss = state
        nsr, nss = next_state

        # Q-learning update. The target uses the best Q-value of next_state
        # on every step, including the one that ends the episode.
        best_next_Q = np.max(Q[nsr, nss, :])
        td_target = reward + gamma * best_next_Q
        Q[sr, ss, action] += alpha * (td_target - Q[sr, ss, action])

        state = next_state
        total_reward += reward

    rewards_per_episode.append(total_reward)

    # Decay exploration once per episode, clamped so that epsilon never drops
    # below epsilon_min (a bare "multiply while above the floor" overshoots
    # the floor on its last multiplication).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode+1} : Reward total = {total_reward:.1f}, epsilon = {epsilon:.3f}")


Episode 100 : Reward total = 196.0, epsilon = 0.606
Episode 200 : Reward total = 237.0, epsilon = 0.367
Episode 300 : Reward total = 237.0, epsilon = 0.222
Episode 400 : Reward total = 254.0, epsilon = 0.135
Episode 500 : Reward total = 254.0, epsilon = 0.100
Episode 600 : Reward total = 238.0, epsilon = 0.100
Episode 700 : Reward total = 261.0, epsilon = 0.100
Episode 800 : Reward total = 256.0, epsilon = 0.100
Episode 900 : Reward total = 277.0, epsilon = 0.100
Episode 1000 : Reward total = 259.0, epsilon = 0.100
Episode 1100 : Reward total = 238.0, epsilon = 0.100
Episode 1200 : Reward total = 254.0, epsilon = 0.100
Episode 1300 : Reward total = 253.0, epsilon = 0.100
Episode 1400 : Reward total = 272.0, epsilon = 0.100
Episode 1500 : Reward total = 253.0, epsilon = 0.100
Episode 1600 : Reward total = 280.0, epsilon = 0.100
Episode 1700 : Reward total = 261.0, epsilon = 0.100
Episode 1800 : Reward total = 258.0, epsilon = 0.100
Episode 1900 : Reward total = 275.0, epsilon = 0.100
Ep

In [7]:
# Cellule 7 — Inspection de la politique

def best_action_for_state(stock_raw, stock_sell):
    """Return ``(greedy_action, q_values)`` for the given state.

    ``q_values`` is the row of the Q-table for ``(stock_raw, stock_sell)``
    and ``greedy_action`` is the index of its largest entry.
    """
    action_values = Q[stock_raw, stock_sell, :]
    return int(action_values.argmax()), action_values

# Report the greedy action and Q-values for every raw-stock level 0..10,
# with the sellable stock fixed at 0.
for sr in range(11):
    action, qv = best_action_for_state(stock_raw=sr, stock_sell=0)
    print(f"État ({sr},0) → meilleure action = {action}, Q-values = {qv}")


État (0,0) → meilleure action = 3, Q-values = [127.59260482 137.7296523  142.95576942 223.70320792]
État (1,0) → meilleure action = 1, Q-values = [119.85153479 235.76827929 121.26607758 122.64238634]
État (2,0) → meilleure action = 2, Q-values = [131.42953518  72.51343674 253.76827928 132.13549047]
État (3,0) → meilleure action = 2, Q-values = [135.14609144 141.71911636 233.25812505 153.67407052]
État (4,0) → meilleure action = 2, Q-values = [142.60385798 124.47999572 250.35812512 153.76076923]
État (5,0) → meilleure action = 2, Q-values = [196.09473249 182.66384224 240.74021893 205.65638345]
État (6,0) → meilleure action = 2, Q-values = [123.7857802  190.7755053  257.84021874 139.70088679]
État (7,0) → meilleure action = 2, Q-values = [217.78324882 205.22542495 271.07187296 159.59015883]
État (8,0) → meilleure action = 2, Q-values = [152.12881229 122.18678779 256.50051889 144.98685224]
État (9,0) → meilleure action = 2, Q-values = [168.58618217 186.20146713 266.23223699 109.3403735 ]
