In [8]:
import random
import numpy as np
from collections import defaultdict

#### Reinforcement learning components

In [9]:
class SimpleEnvironment:
    """A 4x4 grid world with a start cell, a goal cell and two obstacles.

    Positions are ``[row, col]`` lists; row 0 is the top row printed by
    ``print_grid``. The agent starts at ``[0, 0]`` and the episode is done
    once it stands on ``goal_pos``.
    """

    def __init__(self):
        """Create the grid, then print the configuration and the grid."""
        self.grid_size = 4
        self.agent_pos = [0, 0]
        self.goal_pos = [3, 3]   # Goal position
        self.obstacles = [[1, 1], [2, 1]]  # Obstacle positions

        print("=== ENVIRONMENT SETUP ===")
        print(f"Grid size: {self.grid_size}x{self.grid_size}")
        print(f"Agent starts at: {self.agent_pos}")
        print(f"Goal is at: {self.goal_pos}")
        print(f"Obstacles at: {self.obstacles}")
        self.print_grid()
        print()

    def reset(self):
        """Move the agent back to ``[0, 0]`` and return the resulting state."""
        self.agent_pos = [0, 0]
        return self.get_state()

    def get_state(self):
        """Return the agent position as a hashable ``(row, col)`` tuple."""
        return tuple(self.agent_pos)

    def get_possible_actions(self):
        """Return the list of action names understood by ``step``."""
        return ['up', 'down', 'left', 'right']

    def step(self, action):
        """Apply ``action`` and return ``(new_state, reward, done)``.

        A move that would leave the grid, or an unrecognised action, leaves
        the agent where it is. A move into an obstacle also leaves the agent
        in place and is penalised with the obstacle reward.
        """
        print(f"Environment processing action: {action}")

        # Calculate new position based on action
        new_pos = self.agent_pos.copy()
        if action == 'up' and new_pos[0] > 0:
            new_pos[0] -= 1
        elif action == 'down' and new_pos[0] < self.grid_size - 1:
            new_pos[0] += 1
        elif action == 'left' and new_pos[1] > 0:
            new_pos[1] -= 1
        elif action == 'right' and new_pos[1] < self.grid_size - 1:
            new_pos[1] += 1

        # The agent never enters an obstacle cell, so the collision has to be
        # remembered here; otherwise the obstacle penalty could never apply.
        hit_obstacle = new_pos in self.obstacles
        if not hit_obstacle:
            self.agent_pos = new_pos

        # Calculate reward
        reward = self.calculate_reward(hit_obstacle)

        # Check if episode is done
        done = (self.agent_pos == self.goal_pos)

        new_state = self.get_state()
        print(f"New state: {new_state}, Reward: {reward}, Done: {done}")

        return new_state, reward, done

    def calculate_reward(self, hit_obstacle=False):
        """Return the reward for the agent's current situation.

        Args:
            hit_obstacle: True when the last move was blocked by an obstacle.

        Returns:
            100 on the goal cell, -10 for an obstacle collision, -1 otherwise.
        """
        if self.agent_pos == self.goal_pos:
            return 100  # Big positive reward for reaching goal
        elif hit_obstacle or self.agent_pos in self.obstacles:
            return -10  # Negative reward for hitting obstacle
        else:
            return -1   # Small negative reward for each step (encourages efficiency)

    def print_grid(self):
        """Print the grid: A = agent, G = goal, X = obstacle, . = empty."""
        for i in range(self.grid_size):
            row = ""
            for j in range(self.grid_size):
                if [i, j] == self.agent_pos:
                    row += "A "  # Agent
                elif [i, j] == self.goal_pos:
                    row += "G "  # Goal
                elif [i, j] in self.obstacles:
                    row += "X "  # Obstacle
                else:
                    row += ". "  # Empty space
            print(row)
        print()

In [10]:
class SimpleAgent:
    """Baseline agent: acts at random and only keeps a running reward total."""

    def __init__(self, name="Simple Agent"):
        """Store the display name, zero the reward total and announce the agent."""
        self.total_reward = 0
        self.name = name
        print(f"=== AGENT: {self.name} ===")
        print("Agent initialized and ready to learn!")

    def choose_action(self, state, possible_actions):
        """Return one entry of ``possible_actions`` picked by ``random.choice``.

        ``state`` is accepted but not consulted.
        """
        picked = random.choice(possible_actions)
        print(f"Agent choosing action: {picked}")
        return picked

    def learn(self, state, action, reward, next_state):
        """Add ``reward`` to the running total and print the transition.

        ``next_state`` is accepted but not used.
        """
        self.total_reward = self.total_reward + reward
        print(f"Agent learned: State {state} → Action {action} → Reward {reward}")
        print(f"Total reward so far: {self.total_reward}")

In [11]:
# Walk-through of the basic RL vocabulary (state, action, reward, return)
# using the grid world and the random agent.
# Create environment and agent (both constructors print a banner)
env = SimpleEnvironment()
agent = SimpleAgent("Explorer")

print("=== STATES ===")
print("States represent the current situation/configuration")
# get_state() returns the agent position as a (row, col) tuple
current_state = env.get_state()
print(f"Current state: {current_state}")
print("In our grid world, state = agent's position [row, col]")
print()

print("=== ACTIONS ===")
print("Actions are choices the agent can make")
# The action list is fixed, so it is fetched once and reused in the loop below
possible_actions = env.get_possible_actions()
print(f"Possible actions: {possible_actions}")
print("Each action moves the agent in a direction")
print()

# Simulate a few steps
# NOTE(review): no random seed is set in the visible cells, so the actions
# chosen here (and the printed output) can differ from run to run.
print("=== AGENT-ENVIRONMENT INTERACTION ===")
for step in range(3):
    print(f"\n--- Step {step + 1} ---")
    # Observe the state before acting so learn() receives the pre-move state
    state = env.get_state()
    action = agent.choose_action(state, possible_actions)
    next_state, reward, done = env.step(action)
    agent.learn(state, action, reward, next_state)
    env.print_grid()

    # Stop early if the goal was reached within the 3 steps
    if done:
        print("Episode finished!")
        break

print("\n=== RETURNS ===")
print("Returns = Total accumulated reward over time")
# total_reward is the undiscounted sum of the rewards seen in the loop above
print(f"Agent's total return: {agent.total_reward}")
print("Goal: Learn to maximize expected returns")
print()

=== ENVIRONMENT SETUP ===
Grid size: 4x4
Agent starts at: [0, 0]
Goal is at: [3, 3]
Obstacles at: [[1, 1], [2, 1]]
A . . . 
. X . . 
. X . . 
. . . G 


=== AGENT: Explorer ===
Agent initialized and ready to learn!
=== STATES ===
States represent the current situation/configuration
Current state: (0, 0)
In our grid world, state = agent's position [row, col]

=== ACTIONS ===
Actions are choices the agent can make
Possible actions: ['up', 'down', 'left', 'right']
Each action moves the agent in a direction

=== AGENT-ENVIRONMENT INTERACTION ===

--- Step 1 ---
Agent choosing action: up
Environment processing action: up
New state: (0, 0), Reward: -1, Done: False
Agent learned: State (0, 0) → Action up → Reward -1
Total reward so far: -1
A . . . 
. X . . 
. X . . 
. . . G 


--- Step 2 ---
Agent choosing action: right
Environment processing action: right
New state: (0, 1), Reward: -1, Done: False
Agent learned: State (0, 0) → Action right → Reward -1
Total reward so far: -2
. A . . 
. X . .

#### Value-based method: Q-Learning

In [12]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy action rule.

    Q-values live in a nested ``defaultdict`` keyed by state, then action;
    entries that have never been updated read as ``0.0``.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        """Store the hyper-parameters and print a short summary.

        Args:
            learning_rate: step size applied to each Q-value update.
            discount_factor: weight of the best next-state Q-value in the target.
            epsilon: probability of picking a random action in ``choose_action``.
        """
        self.name = "Q-Learning Agent"
        self.q_table = defaultdict(lambda: defaultdict(float))  # Q(state, action)
        self.learning_rate = learning_rate  # How fast to learn
        self.discount_factor = discount_factor  # How much to value future rewards
        self.epsilon = epsilon  # Exploration rate
        self.total_reward = 0

        print(f"=== {self.name} ===")
        print("Learns by updating Q-values: Q(s,a) = expected return")
        print(f"Learning rate: {learning_rate}")
        print(f"Discount factor: {discount_factor}")
        print(f"Exploration rate: {epsilon}")
        print()

    def choose_action(self, state, possible_actions):
        """Pick an action epsilon-greedily and return it.

        With probability ``epsilon`` a random action is returned; otherwise the
        action with the highest Q-value is returned. Ties go to the action
        that appears first in ``possible_actions``.
        """
        if random.random() < self.epsilon:
            # Explore: random action
            action = random.choice(possible_actions)
            print(f"Exploring: random action {action}")
        else:
            # Exploit: best known action
            q_values = {action: self.q_table[state][action] for action in possible_actions}
            action = max(q_values, key=q_values.get)
            print(f"Exploiting: best action {action} (Q-value: {q_values[action]:.2f})")

        return action

    def learn(self, state, action, reward, next_state, possible_actions, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: state in which ``action`` was taken.
            action: action that was taken.
            reward: reward received for the transition.
            next_state: resulting state, or ``None`` for a terminal transition.
            possible_actions: actions considered when bootstrapping from
                ``next_state``.
            done: True when the transition ended the episode; the target is
                then the reward alone, with no bootstrapping.
        """
        self.total_reward += reward

        # Current Q-value
        current_q = self.q_table[state][action]

        # Best Q-value for next state. Terminal transitions must not bootstrap.
        # The check is explicit (not truthiness) because states such as (0, 0)
        # are always truthy and a state like 0 would be wrongly seen as terminal.
        if done or next_state is None:
            max_next_q = 0.0  # Terminal state
        else:
            max_next_q = max(
                (self.q_table[next_state][a] for a in possible_actions),
                default=0.0,  # no actions available: nothing to bootstrap from
            )

        # Q-learning update
        target = reward + self.discount_factor * max_next_q
        new_q = current_q + self.learning_rate * (target - current_q)

        self.q_table[state][action] = new_q

        print(f"Q-Update: Q({state},{action}) = {current_q:.2f} → {new_q:.2f}")
        print(f"  Target value: {target:.2f}")

    def show_q_table(self):
        """Print every stored Q-value, grouped by state in sorted order."""
        print("=== LEARNED Q-TABLE ===")
        for state in sorted(self.q_table.keys()):
            print(f"State {state}:")
            for action, q_val in self.q_table[state].items():
                print(f"  {action}: {q_val:.2f}")

#### Policy-based method: Policy gradient

In [13]:
class PolicyGradientAgent:
    """Tabular softmax-policy agent updated once per episode.

    Each (state, action) pair has a score in ``policy_params``; the policy is
    the softmax of the scores of the available actions. After an episode the
    score of each taken action is shifted by ``learning_rate * return``.
    This is a simplified update, not the full log-probability gradient.
    """

    def __init__(self, learning_rate=0.01):
        """Store the learning rate and print a short summary."""
        self.name = "Policy Gradient Agent"
        self.policy_params = defaultdict(lambda: defaultdict(float))  # Policy parameters
        self.learning_rate = learning_rate
        self.episode_history = []  # Store episode for learning
        self.total_reward = 0

        print(f"=== {self.name} ===")
        print("Learns policy directly: π(a|s) = probability of action")
        print(f"Learning rate: {learning_rate}")
        print()

    def get_action_probabilities(self, state, possible_actions):
        """Return ``{action: probability}`` from a softmax over the scores.

        Returns an empty dict when ``possible_actions`` is empty.
        """
        if not possible_actions:
            return {}

        # Get raw scores for each action (indexing registers the state in
        # policy_params, which show_policy relies on)
        scores = np.array(
            [self.policy_params[state][action] for action in possible_actions],
            dtype=float,
        )

        # Softmax to convert to probabilities. Subtracting the maximum leaves
        # the result unchanged mathematically but keeps exp() from
        # overflowing to inf (and the ratio from becoming nan).
        exp_scores = np.exp(scores - scores.max())
        probabilities = (exp_scores / exp_scores.sum()).tolist()

        return dict(zip(possible_actions, probabilities))

    def choose_action(self, state, possible_actions):
        """Sample an action from the current policy and return it."""
        action_probs = self.get_action_probabilities(state, possible_actions)

        # Sample action based on probabilities. An index is sampled so the
        # original action object is returned instead of a NumPy string scalar.
        actions = list(action_probs.keys())
        probs = list(action_probs.values())
        action = actions[np.random.choice(len(actions), p=probs)]

        print(f"Policy probabilities: {action_probs}")
        print(f"Sampled action: {action}")

        return action

    def store_experience(self, state, action, reward):
        """Store experience for end-of-episode learning"""
        self.episode_history.append((state, action, reward))
        self.total_reward += reward

    def learn_from_episode(self, possible_actions):
        """Update the scores from the stored episode, then clear it.

        ``possible_actions`` is accepted for interface compatibility but is
        not used by this simplified update.
        """
        print("=== POLICY GRADIENT UPDATE ===")

        # Calculate returns for each step: the undiscounted sum of rewards
        # from that step to the end of the episode
        returns = []
        total_return = 0
        for _, _, reward in reversed(self.episode_history):
            total_return += reward
            returns.append(total_return)
        returns.reverse()

        # Update policy parameters
        for (state, action, _), return_val in zip(self.episode_history, returns):
            # Policy gradient update: positive returns raise the score,
            # non-positive returns lower it (or leave it unchanged at 0)
            self.policy_params[state][action] += self.learning_rate * return_val
            if return_val > 0:
                print(f"Increasing probability of {action} in state {state}")
            else:
                print(f"Decreasing probability of {action} in state {state}")

        # Clear episode history
        self.episode_history = []

    def show_policy(self, possible_actions):
        """Print the action probabilities of every state seen so far."""
        print("=== LEARNED POLICY ===")
        for state in sorted(self.policy_params.keys()):
            probs = self.get_action_probabilities(state, possible_actions)
            print(f"State {state}: {probs}")

#### Compare both methods in practice

In [14]:
# Side-by-side demo: run each agent for at most 5 steps on the same grid world.
# Reset environment (a new instance is built; its constructor prints the grid)
env = SimpleEnvironment()
env.agent_pos = [0, 0]  # Reset agent position (already [0, 0] after construction)

print("\n--- Q-Learning Agent Training ---")
q_agent = QLearningAgent()

# Train Q-learning agent for a few steps
# The Q-table is updated after every single step (online learning).
for step in range(5):
    state = env.get_state()
    action = q_agent.choose_action(state, env.get_possible_actions())
    next_state, reward, done = env.step(action)
    # NOTE(review): `done` is not forwarded; learn() receives next_state even
    # when the episode has just ended.
    q_agent.learn(state, action, reward, next_state, env.get_possible_actions())

    if done:
        break

q_agent.show_q_table()

print("\n" + "="*50)

# Reset environment for policy gradient agent
# The same env object is reused; only the agent position is put back to the start.
env.agent_pos = [0, 0]

print("\n--- Policy Gradient Agent Training ---")
pg_agent = PolicyGradientAgent()

# Train policy gradient agent for one episode
# Transitions are only recorded inside the loop; the policy is updated once,
# after the loop, from the whole stored episode.
episode_steps = 0
while episode_steps < 5:
    state = env.get_state()
    action = pg_agent.choose_action(state, env.get_possible_actions())
    next_state, reward, done = env.step(action)
    pg_agent.store_experience(state, action, reward)
    episode_steps += 1

    # Stop early if the goal was reached before the 5-step cap
    if done:
        break

# Learn from the episode
pg_agent.learn_from_episode(env.get_possible_actions())
pg_agent.show_policy(env.get_possible_actions())

=== ENVIRONMENT SETUP ===
Grid size: 4x4
Agent starts at: [0, 0]
Goal is at: [3, 3]
Obstacles at: [[1, 1], [2, 1]]
A . . . 
. X . . 
. X . . 
. . . G 



--- Q-Learning Agent Training ---
=== Q-Learning Agent ===
Learns by updating Q-values: Q(s,a) = expected return
Learning rate: 0.1
Discount factor: 0.9
Exploration rate: 0.1

Exploiting: best action up (Q-value: 0.00)
Environment processing action: up
New state: (0, 0), Reward: -1, Done: False
Q-Update: Q((0, 0),up) = 0.00 → -0.10
  Target value: -1.00
Exploiting: best action down (Q-value: 0.00)
Environment processing action: down
New state: (1, 0), Reward: -1, Done: False
Q-Update: Q((0, 0),down) = 0.00 → -0.10
  Target value: -1.00
Exploiting: best action up (Q-value: 0.00)
Environment processing action: up
New state: (0, 0), Reward: -1, Done: False
Q-Update: Q((1, 0),up) = 0.00 → -0.10
  Target value: -1.00
Exploiting: best action left (Q-value: 0.00)
Environment processing action: left
New state: (0, 0), Reward: -1, Done: False
