<a href="https://colab.research.google.com/github/sasya05/Reinforcement-learning/blob/main/lab-6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# =========================================================
# ✅ Deep Q-Network (DQN) Implementation — Final Version
# Works with Python 3.12, NumPy ≥ 2.0, Gymnasium, and PyTorch
# =========================================================

import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_  # Compatibility for NumPy ≥ 2.0

import random
import math
import time
from collections import deque, namedtuple
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# ------------------------------
# DQN Network
# ------------------------------
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# ------------------------------
# Replay Buffer
# ------------------------------
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

# ------------------------------
# Agent
# ------------------------------
class DQNAgent:
    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99, batch_size=64, eps_start=1.0, eps_end=0.01, eps_decay=500):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.batch_size = batch_size
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        self.policy_net = DQN(state_dim, action_dim)
        self.target_net = DQN(state_dim, action_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

        self.memory = ReplayBuffer()
        self.steps_done = 0

    def select_action(self, state):
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if random.random() < eps_threshold:
            return torch.tensor([[random.randrange(self.action_dim)]], dtype=torch.long)
        else:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        done_batch = torch.cat(batch.done)

        q_values = self.policy_net(state_batch).gather(1, action_batch)
        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()
        expected_q_values = reward_batch + (self.gamma * next_q_values * (1 - done_batch))

        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# ------------------------------
# Training Loop
# ------------------------------
def train(env_name="CartPole-v1", num_episodes=500):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = DQNAgent(state_dim, action_dim)
    rewards = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        state = torch.tensor([state], dtype=torch.float32)
        total_reward = 0

        for t in range(500):
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            total_reward += reward

            next_state_tensor = torch.tensor([next_state], dtype=torch.float32)
            reward_tensor = torch.tensor([reward], dtype=torch.float32)
            done_tensor = torch.tensor([float(done)], dtype=torch.float32)

            agent.memory.push(state, action, reward_tensor, next_state_tensor, done_tensor)
            state = next_state_tensor

            agent.optimize()

            if done:
                break

        rewards.append(total_reward)
        agent.update_target()

        if episode % 10 == 0:
            print(f"Episode {episode}, Reward: {total_reward:.1f}")

    env.close()
    torch.save(agent.policy_net.state_dict(), "dqn_policy.pth")
    print("✅ Training complete! Model saved as dqn_policy.pth")
    return agent, rewards

if __name__ == "__main__":
    start = time.time()
    agent, rewards = train(num_episodes=200)
    print("Training done in", round(time.time() - start, 2), "seconds")


  state = torch.tensor([state], dtype=torch.float32)


Episode 0, Reward: 16.0


  return datetime.utcnow().replace(tzinfo=utc)


Episode 10, Reward: 13.0
Episode 20, Reward: 11.0
Episode 30, Reward: 10.0
Episode 40, Reward: 10.0
Episode 50, Reward: 11.0
Episode 60, Reward: 8.0
Episode 70, Reward: 12.0
Episode 80, Reward: 8.0
Episode 90, Reward: 10.0
Episode 100, Reward: 8.0
Episode 110, Reward: 10.0
Episode 120, Reward: 10.0
Episode 130, Reward: 9.0
Episode 140, Reward: 9.0
Episode 150, Reward: 10.0
Episode 160, Reward: 10.0
Episode 170, Reward: 10.0
Episode 180, Reward: 9.0
Episode 190, Reward: 10.0
✅ Training complete! Model saved as dqn_policy.pth
Training done in 7.73 seconds
