In [3]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from collections import deque

In [4]:
# Define the Neural Network for DQN
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output head.

    Args:
        state_size: Number of input features per state vector.
        action_size: Number of outputs; the network emits one value per action.
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that used to be hard-coded, so existing two-argument calls and
            previously saved state dicts remain compatible.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the state_dict keys, so they
        # are kept unchanged.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Map a tensor whose last dimension is `state_size` to per-action values.

        No activation is applied to the output layer.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
GAMMA = 0.99  # Discount applied to the bootstrapped next-state value
LEARNING_RATE = 0.001  # Step size passed to the Adam optimizer
BATCH_SIZE = 32  # Transitions sampled from the replay buffer per update
EPSILON = 1.0  # Exploration rate
EPSILON_MIN = 0.01  # Floor used when decaying EPSILON
EPSILON_DECAY = 0.995  # Multiplicative decay applied once per episode
MEMORY_SIZE = 1000  # Maximum number of transitions kept in the replay buffer
EPISODES = 100  # Number of training episodes
SEED = 42  # Single seed shared by every random number generator used below

# Seed every source of randomness so a fresh Restart & Run All reproduces the
# same exploration choices, replay samples and initial network weights.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Environment Setup
env = gym.make('CartPole-v1')
# Seeding once here initialises the environment's own RNG; later reset() calls
# made without a seed keep drawing from that same generator.
env.reset(seed=SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [7]:
# Initialize Model
# policy_net is the network whose parameters are optimized; target_net is a
# second network of the same architecture that starts as an exact copy of
# policy_net's weights (see load_state_dict below).
policy_net = DQN(state_size, action_size)
target_net = DQN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())
# Only policy_net's parameters are handed to Adam.
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()
# Bounded replay buffer: once MEMORY_SIZE items are stored, each append
# discards the oldest entry.
memory = deque(maxlen=MEMORY_SIZE)

# Experience Replay Function
def remember(state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) tuple to `memory`."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

def act(state):
    """Choose an action for `state` using an epsilon-greedy rule.

    With probability EPSILON a uniformly random action index below
    `action_size` is returned; otherwise the index of the largest output of
    `policy_net` for this state is returned as a Python int.
    """
    explore = random.uniform(0, 1) < EPSILON
    if explore:
        return random.randrange(action_size)

    # Add a leading batch dimension before the forward pass.
    batched_state = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
        action_values = policy_net(batched_state)
    return action_values.argmax().item()

def replay():
    """Run one gradient step on `policy_net` from a random minibatch of `memory`.

    Does nothing (returns None) while fewer than BATCH_SIZE transitions are
    stored. Otherwise samples BATCH_SIZE transitions, builds the one-step
    target ``reward + GAMMA * max_a target_net(next_state)[a] * (1 - done)``
    and minimises `criterion` between that target and the value `policy_net`
    assigns to the action actually taken.
    """
    if len(memory) < BATCH_SIZE:
        return
    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack each field into a single ndarray before handing it to torch:
    # building a tensor directly from a tuple of ndarrays goes through a slow
    # element-by-element path. Explicit dtypes keep the result independent of
    # the dtype the environment happens to return.
    states = torch.as_tensor(np.array(states, dtype=np.float32))
    actions = torch.as_tensor(np.array(actions, dtype=np.int64)).unsqueeze(1)
    rewards = torch.as_tensor(np.array(rewards, dtype=np.float32))
    next_states = torch.as_tensor(np.array(next_states, dtype=np.float32))
    dones = torch.as_tensor(np.array(dones, dtype=np.float32))

    # Compute target Q-values; no gradient is needed through the target network
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0]
    target_q_values = rewards + (GAMMA * next_q_values * (1 - dones))

    # Compute current Q-values for the actions that were taken
    q_values = policy_net(states).gather(1, actions).squeeze(1)

    # Loss and backpropagation
    loss = criterion(q_values, target_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [8]:
# Train DQN Model
for episode in range(EPISODES):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = act(state)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # Both flags must be read: `truncated` signals the time limit, and
        # ignoring it would keep stepping an episode that has already ended.
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Store only `terminated` as the done flag used by replay(): a
        # time-limit cut-off is not a true terminal state, so the target
        # should still bootstrap from next_state in that case.
        remember(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward
        replay()
        done = terminated or truncated

    # Decay epsilon, clamped so it never falls below EPSILON_MIN
    EPSILON = max(EPSILON_MIN, EPSILON * EPSILON_DECAY)

    # Update Target Network Periodically (every 10th episode, starting with the first)
    if episode % 10 == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode + 1}: Reward = {total_reward}")

Episode 1: Reward = 21.0
Episode 2: Reward = 34.0
Episode 3: Reward = 32.0
Episode 4: Reward = 9.0
Episode 5: Reward = 11.0
Episode 6: Reward = 12.0
Episode 7: Reward = 16.0
Episode 8: Reward = 28.0
Episode 9: Reward = 15.0
Episode 10: Reward = 26.0
Episode 11: Reward = 23.0
Episode 12: Reward = 17.0
Episode 13: Reward = 11.0
Episode 14: Reward = 11.0
Episode 15: Reward = 11.0
Episode 16: Reward = 14.0
Episode 17: Reward = 11.0
Episode 18: Reward = 9.0
Episode 19: Reward = 13.0
Episode 20: Reward = 10.0
Episode 21: Reward = 14.0
Episode 22: Reward = 13.0
Episode 23: Reward = 17.0
Episode 24: Reward = 43.0
Episode 25: Reward = 9.0
Episode 26: Reward = 11.0
Episode 27: Reward = 12.0
Episode 28: Reward = 9.0
Episode 29: Reward = 10.0
Episode 30: Reward = 9.0
Episode 31: Reward = 10.0
Episode 32: Reward = 13.0
Episode 33: Reward = 18.0
Episode 34: Reward = 10.0
Episode 35: Reward = 21.0
Episode 36: Reward = 34.0
Episode 37: Reward = 19.0
Episode 38: Reward = 29.0
Episode 39: Reward = 35.0


In [9]:
# Persist the policy network's learned parameters (state dict only) to disk
trained_weights = policy_net.state_dict()
torch.save(trained_weights, "dqn_cartpole.pth")
print("Model saved and ready for deployment!")

Model saved and ready for deployment!
