In [None]:
import warnings
warnings.filterwarnings('ignore')

# module13_exercise3 : Deep Q Learning

### Run in Colab
<a href="https://colab.research.google.com/github/racousin/data_science_practice/blob/master/website/public/modules/module13/exercise/module13_exercise3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install swig==4.2.1
!pip install gymnasium==0.29.1

### Objective
In order to tackle difficult problems (large action-state space and complexity), we will use deep Q learning.

**Complete the TODO steps! Good luck!**

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from time import time,sleep
from collections import deque
import gymnasium as gym
sns.set_style("darkgrid")

In [None]:
# Environment used for every experiment in this notebook: CartPole-v1
# (https://gymnasium.farama.org/environments/classic_control/cart_pole/)
env = gym.make('CartPole-v1')

# Deep Q-learning

We will parametrize the Q function.
In other words, we are looking for $\theta \in \mathbb{R}^d$ such that
$\forall (s,a),\ Q_\theta(s,a) = \mathbb{E}_\pi[G_t | S_t = s, A_t = a]$. We follow the same idea as Q-learning:
we learn and update $Q_\theta(S_t,A_t)$ using the target $R_{t+1}+\gamma \max_a Q_\theta(S_{t+1},a)$. A natural loss is the mean squared error:

$L(\theta) = \mathbb{E}_{s,a\sim Q} [(y - Q(s,a,\theta))^2]$



$y = R_{t+1} + \gamma \max_a Q(S_{t+1},a,\theta)$

We have 2 ways to write our function:
1. $Q_\theta : S\times A \rightarrow \mathbb{R}$

in this case the greedy policy looks like $\pi(.|s) = \arg\max([Q_\theta(s,a_0), Q_\theta(s,a_1), \dots, Q_\theta(s,a_{dim(A)-1})])$

The target is $y = R_{t+1} + \gamma \max_a Q(S_{t+1},a,\theta)$


2. $Q_\theta : S \rightarrow \mathbb{R}^{dim(A)}$

in this case greedy policy looks like $\pi(.|s) = \arg\max(Q_\theta(s))$

The target is $y_i = R_{t+1} + \gamma \max_a Q(S_{t+1},a,\theta)$ for i corresponding to the played action, $Q_\theta(s_t)_i$ otherwise.

In other words, if we played $a$ (the second action) in $s$, and we observed $r$ and $s'$, our target will be (assuming we have 3 actions):

$\begin{aligned}
y_0 =& Q(s,a,\theta)_0\\
y_1 =&R_{t+1} + \gamma \max_a Q(S_{t+1},a,\theta)\\
y_2 =&Q(s,a,\theta)_2
\end{aligned}$

And our loss:

$L(\theta) = (R_{t+1} + \gamma \max_a Q(S_{t+1},a,\theta) - Q(s,a,\theta)_1)^2$

In practice implementation 2 is often easier to implement. So it is what we will do!

In [None]:
#TODO: write a torch model that represents our parametrized Q function
# We should be able to run Q.predict([s]) and it should return [[Q(s,a_0), Q(s,a_1) .. Q(s,a_{m-1})]] where m is the number of actions (case 2)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class QNetwork(nn.Module):
    """
    Parametrized Q-function implemented as a PyTorch module ("case 2" above).

    The network maps a batch of states to one Q-value per action. The layers
    themselves are left as an exercise (see the TODO markers).
    """
    def __init__(self, state_dim: int, action_dim: int):
        """
        Initialize the Q-Network.
        
        Args:
            state_dim (int): Dimension of the state space
            action_dim (int): Dimension of the action space
        """
        super(QNetwork, self).__init__()
        
        self.network = nn.Sequential(
            # TODO
        )
        
        # Adam optimizer over this network's parameters (lr=1e-2).
        # Until the TODO above is filled in there are no parameters to optimize,
        # and Adam refuses an empty parameter list.
        self.optimizer = optim.Adam(self.parameters(), lr=1e-2)
        
    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the network.
        
        Args:
            state (torch.Tensor): Input state tensor
            
        Returns:
            torch.Tensor: Q-values for each action
        """
        return # TODO
    
    def predict(self, state: np.ndarray) -> np.ndarray:
        """
        Predict Q-values for a given state (numpy in, numpy out).

        Side effect: switches the module to evaluation mode and leaves it there.
        
        Args:
            state (np.ndarray): Input state as numpy array
            
        Returns:
            np.ndarray: Q-values for each action as numpy array
        """
        # Convert numpy array to a float32 torch tensor
        state_tensor = torch.FloatTensor(state)
        
        # Evaluation mode + no gradient tracking: this is inference only
        self.eval()
        with torch.no_grad():
            q_values = self.forward(state_tensor)
        
        # Convert back to numpy and return
        return q_values.numpy()

### TODO 0 : write deep Q learning interaction with the environment

In [None]:
# TODO: complete the Deep Q-learning agent by returning the action chosen by the
# epsilon-greedy policy: a = argmax_a Q(s, a) with probability 1 - epsilon,
# a uniformly random action otherwise.
class DeepQAgent():
    """Epsilon-greedy agent that acts from a Q-network (no learning yet)."""

    def __init__(self, env, model: nn.Module, gamma = .99, epsilon = .1):
        """
        Args:
            env: Environment exposing `observation_space.shape` and `action_space.n`
            model (nn.Module): Q-network mapping a batch of states to Q-values
            gamma (float): Discount factor (stored, not used by this version)
            epsilon (float): Probability of picking a random action
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # Use the network supplied by the caller; building a fresh QNetwork
        # here would silently discard the `model` argument.
        self.model = model
    
    def choose_action(self, state: np.ndarray) -> int:
        """
        Choose an action using epsilon-greedy policy.
        
        Args:
            state (np.ndarray): Current state observation
            
        Returns:
            int: Selected action
        """
        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action
            return np.random.randint(self.env.action_space.n)
        else:
            # Exploit: evaluate the Q-network on a batch containing this one state
            state_tensor = torch.FloatTensor(state.reshape(1, -1))
            with torch.no_grad():
                q_values = self.model(state_tensor)
            return  # TODO

In [None]:
def run_experiment_episode(env, agent, nb_episode, train=False):
    """
    Run `agent` in `env` for `nb_episode` episodes and collect the returns.

    Args:
        env: Gymnasium-style environment (`reset()` -> (state, info),
            `step(a)` -> (state, reward, terminated, truncated, info))
        agent: Object with `choose_action(state)` and, when `train` is True,
            `train(state, action, reward, next_state, done)`
        nb_episode (int): Number of episodes to play
        train (bool): Whether to call `agent.train` after every step

    Returns:
        np.ndarray: Cumulative (undiscounted) reward of each episode
    """
    rewards = np.zeros(nb_episode)
    for i in range(nb_episode):
        state, _ = env.reset()
        done = False
        episode_return = 0
        # Truthiness test, not `done is False`: environments may return
        # numpy booleans, which are never identical to the `False` singleton
        # and would end the episode after a single step.
        while not done:
            action = agent.choose_action(state)
            current_state = state
            state, reward, terminated, truncated, info = env.step(action)
            done = bool(terminated or truncated)
            episode_return += reward
            if train:
                agent.train(current_state, action, reward, state, done)
        rewards[i] = episode_return
        print('episode: {} - cum reward {}'.format(i, rewards[i]))
    return rewards

In [None]:
# Interact with the environment for a few episodes (no training) and display the return of each one
model1 = QNetwork(env.observation_space.shape[0], env.action_space.n)
random_q_agent = DeepQAgent(env, model1)
rewards = run_experiment_episode(env, random_q_agent, 20)
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,'+')
# Label the axes so the figure can be read on its own
ax.set_xlabel('episode')
ax.set_ylabel('cumulative reward')
ax.set_title('cumulative reward per episode - naive_q_agent');

### TODO 1) : write deep Q learning update

In [None]:
# TODO: write the deep Q-learning update (fill in the TODO lines of `train`)
class DeepQAgent():
    """Epsilon-greedy agent that updates its Q-network after every transition."""

    def __init__(self, env, model: nn.Module, gamma = .99, epsilon = .1):
        """
        Args:
            env: Environment exposing `observation_space.shape` and `action_space.n`
            model (nn.Module): Q-network mapping a batch of states to Q-values
            gamma (float): Discount factor
            epsilon (float): Probability of picking a random action
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.model = model 
        # Mean squared error between predicted and target Q-value, minimized with Adam
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)
    
    def choose_action(self, state: np.ndarray) -> int:
        """
        Choose an action using epsilon-greedy policy.
        
        Args:
            state (np.ndarray): Current state observation
            
        Returns:
            int: Selected action
        """
        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action
            return np.random.randint(self.env.action_space.n)
        else:
            # Exploit: action with the largest predicted Q-value
            state_tensor = torch.FloatTensor(state.reshape(1, -1))
            with torch.no_grad():
                q_values = self.model(state_tensor)
            return q_values.argmax().item()
    
    def train(self, current_state, action, reward, next_state, done):
        """
        Train the Q-network on a single transition.
        
        Args:
            current_state: The current state
            action: The action taken
            reward: The reward received
            next_state: The next state
            done: Whether the episode is done

        Returns:
            float: Value of the loss for this transition
        """
        # Convert inputs to float32 arrays with a leading batch dimension
        current_state = np.array(current_state, dtype=np.float32)
        next_state = np.array(next_state, dtype=np.float32)
        
        if len(current_state.shape) == 1:
            current_state = current_state.reshape(1, -1)
        if len(next_state.shape) == 1:
            next_state = next_state.reshape(1, -1)
            
        current_state_tensor = torch.FloatTensor(current_state)
        next_state_tensor = torch.FloatTensor(next_state)
        action_tensor = torch.LongTensor([action])
        reward_tensor = torch.FloatTensor([reward])
        done_tensor = torch.FloatTensor([done])

        # Compute current Q-value: keep only the entry of the action that was played
        self.model.train()  # Set to training mode
        current_q_values = self.model(current_state_tensor)
        current_q_value = current_q_values.gather(1, action_tensor.unsqueeze(1)).squeeze(1)

        # Compute next Q-value (no gradient: the target is treated as a constant)
        with torch.no_grad():
            next_q_values = # TODO
            max_next_q_value = # TODO

        # Compute target Q-value
        target_q_value = # TODO

        # Compute loss and update weights
        loss = self.criterion(current_q_value, target_q_value)
        
        # Zero gradients, perform backward pass, and update weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

In [None]:
# Train the agent with run_experiment_episode(env, q_agent, nb_episode, train=True) and display the learning curve
model1 = QNetwork(env.observation_space.shape[0], env.action_space.n)
q_agent = DeepQAgent(env, model1)
rewards = run_experiment_episode(env, q_agent, 500, train=True)
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,'+')
# Label the axes so the figure can be read on its own
ax.set_xlabel('episode')
ax.set_ylabel('cumulative reward')
ax.set_title('cumulative reward per episode - deep_q_agent');

### TODO 2) : Try different model hyperparameters (number of layers, nodes) and compare learning

In [None]:
### TODO 2) : Try different model hyperparameters (number of layers, nodes, activation) and compare learning, create another QNetwork

In [None]:
#TODO: write a torch model that represents our parametrized Q function
# We should be able to run Q.predict([s]) and it should return [[Q(s,a_0), Q(s,a_1) .. Q(s,a_{m-1})]] where m is the number of actions (case 2)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class QNetwork2(nn.Module):
    """Second Q-network, to be given a different architecture than QNetwork (TODO 2)."""

    def __init__(self, state_dim: int, action_dim: int):
        """
        Args:
            state_dim (int): Dimension of the state space
            action_dim (int): Dimension of the action space
        """
        super(QNetwork2, self).__init__()
        
        self.network = nn.Sequential(
            # TODO
        )
        
        # Adam optimizer over this network's parameters (lr=1e-2).
        # Until the TODO above is filled in there are no parameters to optimize,
        # and Adam refuses an empty parameter list.
        self.optimizer = optim.Adam(self.parameters(), lr=1e-2)
        
    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the output of `self.network` for `state`."""
        return self.network(state)
    
    def predict(self, state: np.ndarray) -> np.ndarray:
        """
        Predict Q-values for a given state (numpy in, numpy out).

        Side effect: switches the module to evaluation mode and leaves it there.
        """
        # Convert numpy array to a float32 torch tensor
        state_tensor = torch.FloatTensor(state)
        
        # Evaluation mode + no gradient tracking: this is inference only
        self.eval()
        with torch.no_grad():
            q_values = self.forward(state_tensor)
        
        # Convert back to numpy and return
        return q_values.numpy()

In [None]:
#TODO: train it with run_experiment_episode(env, q_agent, nb_episode, train=True) and compare its learning curve with the initial model
model2 = QNetwork2(env.observation_space.shape[0], env.action_space.n)
q_agent = DeepQAgent(env, model2)
rewards2 = run_experiment_episode(env, q_agent, 500, train=True)
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,label='initial_model')
ax.plot(rewards2,label='your_model')
# Label the axes so the figure can be read on its own
ax.set_xlabel('episode')
ax.set_ylabel('cumulative reward')
ax.set_title('cumulative reward per episode - deep_q_agent')
ax.legend();

### Experience replay

In order to improve stability, we will keep memory of the previous moves and use it to update our model


$L_i(\theta_i) = \mathbb{E}_{(s, a, r, s') \sim U(D)} \left[ \left(r + \gamma \max_{a'} Q(s', a'; \theta_i^-) - Q(s, a; \theta_i)\right)^2 \right]$


#### Algorithm:

**Initialize:**
- Q-network Q(s,a;θ) with random weights θ
- Replay memory D with capacity N
- Minibatch size B
- Discount factor γ
- Exploration rate ε

**For** each episode:
1. Initialize state s₁

2. **For** each step t:
    - **With probability ε:**
        - Choose random action aₜ
    - **Otherwise:**
        - aₜ = argmax_a Q(sₜ,a;θ)
    
    - Execute aₜ, observe rₜ, sₜ₊₁
    - Store (sₜ,aₜ,rₜ,sₜ₊₁) in D
    
    - **If** |D| >= B:
        - Sample random minibatch (sⱼ,aⱼ,rⱼ,sⱼ₊₁) from D
        - **For** each j in minibatch:
            - yⱼ = rⱼ + γ max_a' Q(sⱼ₊₁,a';θ)
        - Update θ by minimizing Σ(yⱼ - Q(sⱼ,aⱼ;θ))²
    
    - sₜ = sₜ₊₁
    - **If** sₜ is terminal: break

3. Optionally decay ε

#### Key Equations:
- **Target**: yⱼ = rⱼ + γ max_a' Q(sⱼ₊₁,a';θ)
- **Loss**: L(θ) = Σ(yⱼ - Q(sⱼ,aⱼ;θ))²

### TODO 3) : Implement experience replay (store transitions, sample minibatches from memory) and compare learning

In [None]:
#TODO: write the function replay that returns a batch from memory
# self.memory is a queue of size memory_size
# (x_batch, y_batch)_i is a random (state, target) from the memory

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DeepQAgent_experience_replay():
    """
    Epsilon-greedy deep Q-learning agent that learns from a replay memory.

    Every transition is stored in a bounded queue; once the queue holds at
    least `batch_size` transitions, each call to `train` performs one gradient
    step on a random minibatch drawn from it.
    """

    def __init__(self, env, model: nn.Module, gamma=.99, epsilon=.1, memory_size=2000, batch_size=100):
        """
        Args:
            env: Environment exposing `action_space.n`
            model (nn.Module): Q-network mapping a batch of states to Q-values
            gamma (float): Discount factor
            epsilon (float): Probability of picking a random action
            memory_size (int): Maximum number of stored transitions
            batch_size (int): Number of transitions per gradient step
        """
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        
        self.model = model
        self.memory_size = memory_size
        self.batch_size = batch_size
        # Oldest transitions are dropped automatically once the queue is full
        self.memory = deque(maxlen=memory_size)
        
        # Initialize optimizer and loss function
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)
        self.criterion = nn.MSELoss()
    
    def replay(self, batch_size):
        """
        Draw a random minibatch from memory and evaluate it.

        Args:
            batch_size (int): Number of transitions to draw

        Returns:
            tuple: (current Q-values, target Q-values), one entry per transition
        """
        # Sample batch from memory
        minibatch = random.sample(
            # TODO: what to sample from, and how many transitions to draw
        )
        return self._q_values_and_targets(minibatch)

    def _q_values_and_targets(self, minibatch):
        """Turn a list of [state, action, reward, next_state, done] into (Q(s,a), target)."""
        # Split the transitions column by column and give every column an
        # explicit numeric dtype: torch cannot convert object-dtype arrays.
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = torch.as_tensor(np.vstack(states), dtype=torch.float32)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))
        
        # Current Q-values: keep only the entry of the action that was played
        current_q_values = self.model(states)
        current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # Next Q-values (no gradient: the target is treated as a constant)
        with torch.no_grad():
            next_q_values = self.model(next_states)
            max_next_q_values = next_q_values.max(1)[0]
            
        # Bellman target; (1 - dones) removes the bootstrap term on terminal steps
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values
        
        return current_q_values, target_q_values
    
    def choose_action(self, state: np.ndarray) -> int:
        """
        Choose an action using epsilon-greedy policy.
        
        Args:
            state (np.ndarray): Current state observation
            
        Returns:
            int: Selected action
        """
        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action
            return np.random.randint(self.env.action_space.n)
        else:
            # Exploit: action with the largest predicted Q-value
            state_tensor = torch.FloatTensor(state.reshape(1, -1))
            with torch.no_grad():
                q_values = self.model(state_tensor)
            return q_values.argmax().item()
    
    def train(self, current_state, action, reward, next_state, done):
        """
        Store one transition and, once memory is large enough, do one gradient step.

        Returns:
            float: Minibatch loss, or 0.0 while memory holds fewer than
                `batch_size` transitions
        """
        # Store experience in memory
        self.memory.append([
            # TODO: the transition, in the order replay unpacks it
        ])
        
        # Only train if we have enough samples
        if len(self.memory) < self.batch_size:
            return 0.0
        
        # Get batch of experiences
        current_q_values, target_q_values = self.replay(self.batch_size)
        
        # Compute loss and update weights
        self.optimizer.zero_grad()
        loss = self.criterion(current_q_values, target_q_values)
        loss.backward()
        self.optimizer.step()
        
        return loss.item()

In [None]:
#TODO: train the experience-replay agent with run_experiment_episode(env, agent, nb_episode, train=True)
model2 = QNetwork2(env.observation_space.shape[0], env.action_space.n)
q_agent_replay = DeepQAgent_experience_replay(env, model2)
# Train the replay agent built on the line above (not the earlier `q_agent`)
rewards2_replay = run_experiment_episode(env, q_agent_replay, 500, train=True)


In [None]:
# Compare the learning curves with and without experience replay
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards2,label='your_model')
ax.plot(rewards2_replay,label='your_model_replay')
# Label the axes so the figure can be read on its own
ax.set_xlabel('episode')
ax.set_ylabel('cumulative reward')
ax.set_title('cumulative reward per episode - deep_q_agent')
ax.legend();

# Other improvements

### epsilon decay
Gradually decrease epsilon, i.e. how often the agent takes a random action.

In [None]:
class DeepQAgent_epsilon_decay():
    """
    Minimal sketch of epsilon decay: only the exploration schedule is kept.

    The other constructor arguments are accepted but not stored.
    """

    def __init__(self, env, model: nn.Module, gamma=.99, epsilon=0.5,
                 epsilon_min=0, epsilon_decay=0.995, memory_size=2000, batch_size=100):
        # Multiplicative factor applied to epsilon at every decay
        self.epsilon_decay = epsilon_decay
        # Floor below which epsilon never drops
        self.epsilon_min = epsilon_min
        # Current exploration rate
        self.epsilon = epsilon

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, clamping at epsilon_min."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.epsilon_min else self.epsilon_min

    def train(self, current_state, action, reward, next_state, done):
        """Decay epsilon once per call; the transition itself is ignored here."""
        self.decay_epsilon()

### Target Network

## **1. The Problem Without a Target Network**  
In standard Q-learning, we update the Q-values using the Bellman equation:  

$$
Q(s, a) \leftarrow r + \gamma \max_{a'} Q(s', a')
$$

where:  
- $Q(s, a)$ is the current estimate of the Q-value.  
- $r$ is the reward.  
- $\gamma$ is the discount factor.  
- $\max_{a'} Q(s', a')$ is the estimated future reward using the same network.  

### **Why is this a problem?**  
- The network **learns from itself** since it's using its own changing estimates to guide learning.  
- When updating the Q-values, **both the current estimate and the target come from the same network**, which leads to **correlations** in the updates.  
- Small changes in the Q-network can **drastically affect the target values**, leading to **divergence** or unstable learning.  

---

## **2. Solution: The Target Network**  
To stabilize training, **a separate target network $Q_{\text{target}}$ is introduced**. The idea is simple:  
- Instead of using the same network to compute both $Q(s, a)$ and $\max Q(s', a')$, we use a fixed (or slowly updated) copy of the network for the targets.  
- The update rule becomes:

$$
Q(s, a) \leftarrow r + \gamma \max_{a'} Q_{\text{target}}(s', a')
$$

- The target network $Q_{\text{target}}$ is a copy of the main Q-network but **updated less frequently**.  

### **Implementation Details**  
1. **Copy the weights periodically**  
   - Every $N$ steps, set:

$$
\theta_{\text{target}} \leftarrow \theta_{\text{main}}
$$
   
   - This ensures that $Q_{\text{target}}$ is a stable reference for several updates.  

2. **Or use a soft update**  
   - Instead of copying completely, update gradually:

$$
\theta_{\text{target}} \leftarrow \tau \theta_{\text{main}} + (1 - \tau) \theta_{\text{target}}
$$

   - Where $\tau$ (e.g., 0.001) is a small update rate.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DeepQAgent_experience_replay():
    """
    Deep Q-learning agent with experience replay, epsilon decay and a target network.

    The main network chooses actions and is trained; the target network
    supplies the bootstrap values and is overwritten with the main network's
    weights every `target_update_freq` gradient steps.
    """

    def __init__(self, env, model: nn.Module, gamma=0.99, 
                 epsilon=0.1, epsilon_min=0.01, epsilon_decay=0.995,
                 memory_size=2000, batch_size=100, target_update_freq=100):
        """
        Args:
            env: Environment exposing `observation_space.shape` and `action_space.n`
            model (nn.Module): Main Q-network; its class must be constructible
                as `type(model)(state_dim, action_dim)`
            gamma (float): Discount factor
            epsilon (float): Initial probability of picking a random action
            epsilon_min (float): Lower bound for epsilon
            epsilon_decay (float): Factor applied to epsilon on every `train` call
            memory_size (int): Maximum number of stored transitions
            batch_size (int): Number of transitions per gradient step
            target_update_freq (int): Gradient steps between target-network syncs
        """
        # Environment and learning parameters
        self.env = env
        self.gamma = gamma
        
        # Epsilon parameters for exploration
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        
        # Main network
        self.model = model
        
        # Target network: same class as the main one, then same weights
        self.target_model = type(model)(env.observation_space.shape[0], 
                                      env.action_space.n)
        self.update_target_network()  # Initial copy
        # The target network is never trained, only evaluated
        self.target_model.eval()
        
        # Memory parameters
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        
        # Target network update parameters
        self.target_update_freq = target_update_freq
        self.steps = 0  # number of gradient steps performed so far
        
        # Optimizer and loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)
        self.criterion = nn.MSELoss()
    
    def update_target_network(self):
        """Copy weights from main network to target network"""
        self.target_model.load_state_dict(self.model.state_dict())
    
    def decay_epsilon(self):
        """Decay epsilon with a minimum value"""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def replay(self, batch_size):
        """
        Sample a minibatch from memory and compute current and target Q-values.

        Args:
            batch_size (int): Maximum number of transitions to draw

        Returns:
            tuple: (current Q-values, target Q-values), one entry per transition
        """
        # Sample batch from memory (never more than what is stored)
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        
        # Split the transitions column by column and convert each column with
        # an explicit dtype, whatever numeric types the environment produced
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.as_tensor(np.array(actions), dtype=torch.int64)
        rewards = torch.as_tensor(np.array(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.as_tensor(np.array(dones, dtype=np.float32))
        
        # Get current Q values from main network
        current_q_values = self.model(states)
        current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # Compute next Q values from target network
        with torch.no_grad():
            next_q_values = self.target_model(next_states)
            max_next_q_values = next_q_values.max(1)[0]
            
        # Bellman target; (1 - dones) removes the bootstrap term on terminal steps
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values
        
        return current_q_values, target_q_values
    
    def choose_action(self, state):
        """Choose action using epsilon-greedy policy"""
        # Exploration
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.env.action_space.n)
        
        # Exploitation: get action from Q-network
        state_tensor = torch.FloatTensor(state.reshape(1, -1))
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return q_values.argmax().item()
    
    def train(self, current_state, action, reward, next_state, done):
        """
        Store one transition and, once memory is large enough, do one gradient step.

        Epsilon is decayed on every call, including the calls made while
        memory is still filling up.

        Returns:
            float: Minibatch loss, or 0.0 while memory holds fewer than
                `batch_size` transitions
        """
        self.decay_epsilon()
        # Store experience in memory
        self.memory.append([current_state, action, reward, next_state, done])
        
        # Only train if we have enough samples
        if len(self.memory) < self.batch_size:
            return 0.0
        
        # Get batch of experiences
        current_q_values, target_q_values = self.replay(self.batch_size)
        
        # Compute loss and update main network
        self.optimizer.zero_grad()
        loss = self.criterion(current_q_values, target_q_values)
        loss.backward()
        self.optimizer.step()
        
        # Update target network periodically
        self.steps += 1
        if self.steps % self.target_update_freq == 0:
            self.update_target_network()
        
        return loss.item()

In [None]:
#TODO: train the improved agent with run_experiment_episode(env, agent, nb_episode, train=True)
model2 = QNetwork2(env.observation_space.shape[0], env.action_space.n)
q_agent_improved = DeepQAgent_experience_replay(env, model2)
rewards2_improved = run_experiment_episode(env, q_agent_improved, 500, train=True)


In [None]:
# Compare the replay-only agent with the improved one
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards2_replay,label='your_model_replay')
ax.plot(rewards2_improved,label='your_model_improved')
# Label the axes so the figure can be read on its own
ax.set_xlabel('episode')
ax.set_ylabel('cumulative reward')
ax.set_title('cumulative reward per episode - deep_q_agent')
ax.legend();