# In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# Define the Q-Network
class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    The architecture is two hidden layers of 64 units, each followed by a
    ReLU, and a linear output layer with ``action_size`` units.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        # Layers are registered in input-to-output order.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Return the Q-values for ``state``; the output layer has no activation."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# Define the Replay Memory
class ReplayMemory:
    """Bounded FIFO store of transitions for experience replay.

    Backed by a ``deque`` created with ``maxlen=capacity``, so once the
    buffer is full each new transition evicts the oldest one.
    """

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently held."""
        return len(self.memory)

    def push(self, transition):
        """Append ``transition`` at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` is larger
        than the number of stored transitions.
        """
        stored = self.memory
        return random.sample(stored, batch_size)

# Define the DQN Agent
class DQNAgent:
    """Deep Q-learning agent with experience replay and a target network.

    The agent owns two ``QNetwork`` instances: ``q_network`` is trained by
    ``replay()``, while ``target_network`` only changes when
    ``update_target_network()`` copies the online weights into it. Actions
    are chosen epsilon-greedily; epsilon starts at 1.0 and is multiplied by
    ``epsilon_decay`` after every training step until it reaches
    ``epsilon_min``.

    Args:
        state_size: Length of the state vector fed to the networks.
        action_size: Number of discrete actions.
        memory_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per training step.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, state_size, action_size, memory_size=10000, batch_size=64, gamma=0.99, lr=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayMemory(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma  # Discount factor
        self.lr = lr  # Learning rate

        # Online network (trained) and target network (periodically synced).
        self.q_network = QNetwork(state_size, action_size)
        self.target_network = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=self.lr)

        # Start with both networks holding identical weights.
        self.update_target_network()

        # Epsilon-greedy parameters
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.push((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value is returned.

        Returns:
            The action index as a plain Python ``int`` on both paths.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)  # Exploration

        state_tensor = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
        # Action selection never needs gradients; skip building the graph.
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
        return int(q_values.argmax(dim=1).item())

    def replay(self):
        """Run one training step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Otherwise it fits ``Q(s, a)`` to
        ``r + gamma * max_a' Q_target(s', a') * (1 - done)`` with an MSE loss,
        takes one optimizer step, and decays epsilon.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Go through NumPy with explicit dtypes so mixed Python/NumPy scalars
        # (ints, floats, bools) convert predictably.
        states = torch.as_tensor(np.vstack(states), dtype=torch.float32)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32)).unsqueeze(1)
        next_states = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32)).unsqueeze(1)

        # Q-values of the actions that were actually taken.
        q_values = self.q_network(states).gather(1, actions)

        # Bootstrapped targets come from the target network; no gradient flows
        # through them, and (1 - dones) removes the bootstrap term on terminal
        # transitions.
        with torch.no_grad():
            max_next_q_values = self.target_network(next_states).max(1)[0].unsqueeze(1)
            target_q_values = rewards + (self.gamma * max_next_q_values * (1 - dones))

        loss = F.mse_loss(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration, clamping so the multiplication cannot push
        # epsilon below the floor. An epsilon already at or below the floor
        # (e.g. set to 0 for evaluation) is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# Define the environment and training loop (a sample setup, e.g., using OpenAI Gym's CartPole)
import gym
env = gym.make("CartPole-v1")

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Create DQN agent
agent = DQNAgent(state_size, action_size)

episodes = 500

for e in range(episodes):
    # gym >= 0.26 / gymnasium return (observation, info) from reset();
    # older gym releases return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)

        # Newer APIs return (obs, reward, terminated, truncated, info);
        # older ones return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, terminated, _ = step_result
            done = terminated
        total_reward += reward

        # Store the experience in memory. Only true termination is recorded
        # as terminal, so a time-limit truncation reported separately by the
        # newer API still bootstraps from the next state during training.
        agent.remember(state, action, reward, next_state, terminated)

        # Train the agent with a batch from memory
        agent.replay()

        state = next_state

    # Update the target network after every episode
    agent.update_target_network()

    print(f"Episode: {e+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")
