<a href="https://colab.research.google.com/github/sharjeelanjum/Deeplearning_with_pytorch/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from collections import deque

class DQN(nn.Module):
    """Fully connected Q-network with two 128-unit hidden layers.

    Maps an input of width ``state_dim`` to ``action_dim`` outputs, one per
    action. ReLU is applied after each hidden layer; the output layer is
    linear.
    """

    def __init__(self, state_dim, action_dim):
        """Create the three linear layers.

        Args:
            state_dim: Number of input features.
            action_dim: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 128
        # Attribute names and creation order are kept as fc1, fc2, fc3 so the
        # state_dict keys and seeded initialisation stay the same.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, state):
        """Return the raw (un-activated) network outputs for ``state``."""
        first_hidden = F.relu(self.fc1(state))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

class ReplayBuffer:
    """Bounded store of transitions with uniform random sampling."""

    def __init__(self, buffer_size):
        """Create an empty buffer holding at most ``buffer_size`` transitions."""
        # A deque with maxlen discards the oldest entry once it is full.
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions without replacement.

        Returns:
            Five tuples ``(states, actions, rewards, next_states, dones)``,
            each of length ``batch_size`` and aligned by position.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``), or if the drawn
                batch cannot be split into five columns.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field columns.
        states, actions, rewards, next_states, dones = zip(*drawn)
        return states, actions, rewards, next_states, dones

class DQNAgent:
    """Epsilon-greedy DQN agent backed by a replay buffer.

    A single ``DQN`` network supplies both the predicted Q-values and the
    bootstrap targets; no separate target network is used.
    """

    def __init__(self, state_dim, action_dim, buffer_size, batch_size, gamma, lr):
        """Build the network, optimizer and replay buffer.

        Args:
            state_dim: Number of features in a state vector.
            action_dim: Number of discrete actions.
            buffer_size: Maximum number of transitions kept in the buffer.
            batch_size: Number of transitions sampled per training step.
            gamma: Discount factor applied to the next-state value.
            lr: Learning rate passed to the Adam optimizer.
        """
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.model = DQN(state_dim, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state, epsilon):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: A single state, convertible to a float32 tensor.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            An integer action index in ``[0, action_dim - 1]``.
        """
        if random.random() < epsilon:
            return random.randint(0, self.action_dim - 1)

        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        with torch.no_grad():
            q_values = self.model(state)
        # argmax() runs over the flattened output, so this assumes one
        # unbatched state -- TODO confirm if batched inputs are ever passed.
        return q_values.argmax().item()

    def train(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. The loss is the mean squared error between Q(s, a) and
        ``reward + gamma * max_a' Q(s', a') * (1 - done)``.
        """
        if len(self.buffer.buffer) < self.batch_size:
            return

        state, action, reward, next_state, done = self.buffer.sample(self.batch_size)
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        action = torch.tensor(action, dtype=torch.long, device=self.device)
        reward = torch.tensor(reward, dtype=torch.float32, device=self.device)
        next_state = torch.tensor(next_state, dtype=torch.float32, device=self.device)
        done = torch.tensor(done, dtype=torch.float32, device=self.device)

        # Q-value of the action that was actually taken in each transition.
        q_values = self.model(state).gather(1, action.unsqueeze(1)).squeeze(1)

        # The bootstrap target is treated as a constant, so compute it without
        # autograd instead of building a graph that is detached afterwards.
        with torch.no_grad():
            next_q_values = self.model(next_state).max(1)[0]
            # (1 - done) removes the bootstrap term for terminal transitions.
            expected_q_values = reward + self.gamma * next_q_values * (1 - done)

        loss = F.mse_loss(q_values, expected_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def add_to_buffer(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.buffer.add(state, action, reward, next_state, done)



In [None]:
import random

class SimpleGame:
    """1D target-reaching game with two discrete actions.

    The agent starts at position 0 with velocity 0 and must land exactly on
    a target drawn uniformly from 1..10. Action 0 lowers the velocity by 1
    and action 1 raises it by 1; velocity is limited to [-5, 5] and position
    is kept within [0, 10].
    """

    def __init__(self):
        """Set the fixed dimensions and draw an initial target."""
        self.state_dim, self.action_dim = 4, 2
        self.position = self.velocity = 0
        self.target_position = random.randint(1, 10)
        self.done = False

    def reset(self):
        """Start a new episode with a fresh target and return the state."""
        self.done = False
        self.target_position = random.randint(1, 10)
        self.position = self.velocity = 0
        return self._get_state()

    def step(self, action):
        """Apply ``action`` and advance the game by one step.

        Args:
            action: 0 to decrease velocity by 1, 1 to increase it by 1.

        Returns:
            A ``(state, reward, done)`` tuple. Reward is 1 when the agent
            lands on the target (which also ends the episode), -0.1 when it
            is within 2 units of the target, and 0 otherwise.

        Raises:
            ValueError: If ``action`` is neither 0 nor 1.
        """
        if action == 0:
            delta = -1
        elif action == 1:
            delta = 1
        else:
            raise ValueError("Invalid action")

        speed = self.velocity + delta
        self.velocity = max(min(speed, 5), -5)
        self.position += self.velocity

        # Reward and termination use the position *before* boundary clamping.
        if self.position == self.target_position:
            self.done = True
            reward = 1
        else:
            near_target = abs(self.position - self.target_position) <= 2
            reward = -0.1 if near_target else 0

        # Leaving the track snaps the agent to the nearest edge and stops it.
        if self.position < 0 or self.position > 10:
            self.position = 0 if self.position < 0 else 10
            self.velocity = 0

        return self._get_state(), reward, self.done

    def _get_state(self):
        """Return ``[position, velocity, target_position, done_flag]``."""
        done_flag = int(self.done)
        return [self.position, self.velocity, self.target_position, done_flag]


Many kinds of environments could be used; here I use a simple 1D game in which the agent must move to a target position.

The state consists of the agent's current position, its velocity, the target position, and a game-over flag. The agent has two actions: decrease its velocity by 1 or increase it by 1 (velocity is limited to the range -5 to 5). If the agent lands exactly on the target position, it gets a reward of 1 and the game ends. A penalty of -0.1 is given on every step in which the agent is within 2 units of the target without landing on it, and going out of bounds (below 0 or above 10) resets the agent's position to the nearest boundary and its velocity to zero. Although the actions are discrete, the game is a simple control problem in which the agent must manage its velocity to reach the target position.

In [None]:
# Settings for this training run.
NUM_EPISODES = 1000
EPSILON = 0.1  # fixed exploration rate; it is not decayed over time

env = SimpleGame()
agent = DQNAgent(
    state_dim=env.state_dim,
    action_dim=env.action_dim,
    buffer_size=10000,
    batch_size=32,
    gamma=0.99,
    lr=0.001,
)

for episode in range(NUM_EPISODES):
    state = env.reset()
    total_reward = 0
    done = False

    # Play one episode, storing each transition and training after every step.
    while not done:
        action = agent.select_action(state, epsilon=EPSILON)
        next_state, reward, done = env.step(action)
        agent.add_to_buffer(state, action, reward, next_state, done)
        agent.train()
        total_reward += reward
        state = next_state

    print(f"Episode {episode}: Total reward = {total_reward}")

# Save final model weights
torch.save(agent.model.state_dict(), 'trained_model.pt')


Episode 0: Total reward = -0.8000000000000005
Episode 1: Total reward = 0.6
Episode 2: Total reward = -8.099999999999984
Episode 3: Total reward = 0.6
Episode 4: Total reward = 0.9
Episode 5: Total reward = -2.800000000000002
Episode 6: Total reward = 0.5
Episode 7: Total reward = 0.9
Episode 8: Total reward = 0.9
Episode 9: Total reward = -2.2000000000000015
Episode 10: Total reward = 0.9
Episode 11: Total reward = -3.6999999999999993
Episode 12: Total reward = 0.8
Episode 13: Total reward = 0.6
Episode 14: Total reward = 0.9
Episode 15: Total reward = -2.700000000000002
Episode 16: Total reward = 0.5
Episode 17: Total reward = 1
Episode 18: Total reward = 0.9
Episode 19: Total reward = 0.30000000000000004
Episode 20: Total reward = 1
Episode 21: Total reward = -2.800000000000002
Episode 22: Total reward = -4.899999999999995
Episode 23: Total reward = 0.9
Episode 24: Total reward = -18.900000000000013
Episode 25: Total reward = -1.4000000000000008
Episode 26: Total reward = 0.20000000