In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from collections import deque
import random
import numpy as np
import matplotlib.pyplot as plt

class DQN(nn.Module):
    """Fully connected Q-network.

    Every hidden width contributes a ``Linear`` layer followed by ``ReLU``;
    a final ``Linear`` layer maps the last width to ``output_size``.

    Args:
        input_size: Number of input features.
        output_size: Number of values produced by the final linear layer.
        hidden_layers: Iterable of hidden layer widths, in order.
    """

    def __init__(self, input_size, output_size, hidden_layers):
        super().__init__()

        # widths[i] -> widths[i + 1] describes the i-th hidden transformation.
        widths = [input_size, *hidden_layers]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        stack.append(nn.Linear(widths[-1], output_size))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.model(x)
    
class ReplayMemory():
    """Bounded first-in/first-out store of entries with random sampling."""

    def __init__(self, max_size):
        # A deque with ``maxlen`` drops its oldest entry once it is full.
        self.memory = deque([], maxlen=max_size)

    def addData(self, dataPoint):
        """Append one entry to the newest end of the buffer."""
        self.memory.append(dataPoint)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` entries drawn without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.memory)
    
def optimize(optimizer, loss_function, discount_factor, train_data, policyDQN, targetDQN, device):
    """Run one gradient step of ``policyDQN`` on a batch of transitions.

    Args:
        optimizer: Optimizer that is zeroed and stepped once.
        loss_function: Callable taking (predicted, target) values and
            returning a scalar loss tensor.
        discount_factor: Multiplier applied to the bootstrapped next-state value.
        train_data: Iterable of ``(state, action, next_state, reward, done)``
            tuples.
        policyDQN: Network being trained; its outputs are indexed by action.
        targetDQN: Network evaluated without gradients on the next states.
        device: Device on which the batch tensors are created.

    Returns:
        The loss value as a Python float.
    """
    state_col, action_col, next_state_col, reward_col, done_col = zip(*train_data)

    def as_batch(column, dtype):
        # Build one batch tensor for a column of the transition tuples.
        return torch.tensor(column, dtype=dtype, device=device)

    # Observations are stacked through NumPy before becoming tensors.
    states = as_batch(np.array(state_col), torch.float32)
    next_states = as_batch(np.array(next_state_col), torch.float32)
    actions = as_batch(action_col, torch.long)
    rewards = as_batch(reward_col, torch.float32)
    dones = as_batch(done_col, torch.float32)

    predicted_all = policyDQN(states)

    with torch.no_grad():
        best_next_q = targetDQN(next_states).max(1)[0]
        # (1 - dones) removes the bootstrapped term for entries flagged done.
        not_done = 1 - dones
        target_values = rewards + discount_factor * best_next_q * not_done

    # Pick, for each row, the output belonging to the action that was taken.
    row_index = torch.arange(len(states))
    predicted_taken = predicted_all[row_index, actions]

    loss = loss_function(predicted_taken, target_values)

    optimizer.zero_grad()
    loss.backward()
    # Cap the global gradient norm at 5.0 before the parameter update.
    nn.utils.clip_grad_norm_(policyDQN.parameters(), max_norm=5.0)
    optimizer.step()

    return loss.item()

def stateToDQNInput(state, device):
    """Convert one observation into a float32 tensor with a leading batch axis.

    Args:
        state: Array-like observation.
        device: Device the returned tensor lives on.

    Returns:
        A new ``torch.float32`` tensor shaped like ``state`` with an extra
        first dimension of size 1.
    """
    features = torch.tensor(np.array(state), dtype=torch.float32, device=device)
    # Indexing with None prepends the batch dimension.
    return features[None]

def deepQLearning(epsInit, epsDecay, epsEnd, numEpisodes, stepsToSync, discountFactor,
                  learningRate, minibatchSize, episodesBeforeReplay):
    """Train a DQN on ``CartPole-v1`` with epsilon-greedy exploration and replay.

    A policy network and an identically shaped target network are created,
    transitions are stored in a replay buffer of up to 50,000 entries, and the
    policy network is optimised on random minibatches after every environment
    step once replay is enabled. After training, the running mean reward and
    the epsilon history are plotted.

    Args:
        epsInit: Starting value of epsilon (probability of a random action).
        epsDecay: Factor epsilon is multiplied by after each episode.
        epsEnd: Lower bound applied to epsilon after decay.
        numEpisodes: Number of episodes to run.
        stepsToSync: The target network is refreshed from the policy network
            once the step counter exceeds this value; the counter is then
            reset. The check only happens right after an optimisation step.
        discountFactor: Discount passed to ``optimize``.
        learningRate: Learning rate of the Adam optimizer.
        minibatchSize: Number of transitions sampled per optimisation step;
            also the minimum buffer size before optimisation starts.
        episodesBeforeReplay: Episode index from which optimisation is allowed.

    Returns:
        The trained policy network.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env = gym.make('CartPole-v1', render_mode="rgb_array")
    # try/finally guarantees the environment is closed even if training is
    # interrupted or raises part-way through.
    try:
        obs_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Both networks share one architecture; the target starts as a copy.
        hidden_layers = [64 * obs_size, 64 * obs_size]
        policyDQN = DQN(input_size=obs_size, output_size=action_size, hidden_layers=hidden_layers).to(device)
        targetDQN = DQN(input_size=obs_size, output_size=action_size, hidden_layers=hidden_layers).to(device)
        targetDQN.load_state_dict(policyDQN.state_dict())

        replayMemory = ReplayMemory(max_size=50_000)

        epsilon = epsInit

        optimizer = optim.Adam(policyDQN.parameters(), lr=learningRate)
        lossFunction = nn.MSELoss()

        rewardsPerEpisode = []
        epsilonHistory = []
        step_count = 0
        for episode in range(numEpisodes):
            state, _ = env.reset()
            terminated = False
            truncated = False
            sumOfRewards = 0
            while not (terminated or truncated):
                # Epsilon-greedy: random action with probability epsilon,
                # otherwise the action with the largest predicted Q-value.
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    with torch.no_grad():
                        q_values = policyDQN(stateToDQNInput(state, device))
                        action = q_values.argmax().item()

                next_state, reward, terminated, truncated, _ = env.step(action)
                # Only `terminated` is stored as the done flag; `truncated`
                # ends the episode loop but is not recorded in the transition.
                replayMemory.addData((state, action, next_state, reward, terminated))

                sumOfRewards += reward
                state = next_state
                step_count += 1

                if (len(replayMemory) >= minibatchSize) and (episode >= episodesBeforeReplay):
                    minibatch = replayMemory.sample(batch_size=minibatchSize)
                    optimize(optimizer=optimizer, loss_function=lossFunction, discount_factor=discountFactor,
                             train_data=minibatch, policyDQN=policyDQN, targetDQN=targetDQN, device=device)

                    # Periodically copy the policy weights into the target.
                    if step_count > stepsToSync:
                        targetDQN.load_state_dict(policyDQN.state_dict())
                        step_count = 0

            rewardsPerEpisode.append(sumOfRewards)
            epsilonHistory.append(epsilon)

            if episode % 10 == 0:
                print(f"Episode {episode}, Reward: {sumOfRewards:.2f}, Epsilon: {epsilon:.4f}")

            # Decay once per episode, never dropping below epsEnd.
            epsilon = max(epsEnd, epsilon * epsDecay)
    finally:
        env.close()

    plt.figure(1)

    # Running mean over the current episode and up to 100 preceding ones.
    meanRewards = np.zeros(numEpisodes)
    for i in range(numEpisodes):
        meanRewards[i] = np.mean(rewardsPerEpisode[max(0, i - 100):(i + 1)])
    plt.subplot(121)
    plt.plot(meanRewards)

    plt.subplot(122)
    plt.plot(epsilonHistory)

    plt.show()

    return policyDQN

if __name__ == "__main__":
    # Launch training with fixed hyperparameters and keep the returned policy
    # network in `trainedDQN`.
    trainedDQN = deepQLearning(
        # Run length and optimisation settings.
        numEpisodes=5_000,
        learningRate=0.0005,
        discountFactor=0.99,
        minibatchSize=128,
        # Replay start and target-network refresh threshold.
        episodesBeforeReplay=32,
        stepsToSync=5_000,
        # Epsilon schedule: start value, per-episode multiplier, floor.
        epsInit=1.0,
        epsDecay=0.995,
        epsEnd=0.001,
    )

Episode 0, Reward: 16.00, Epsilon: 1.0000
Episode 10, Reward: 67.00, Epsilon: 0.9511
Episode 20, Reward: 22.00, Epsilon: 0.9046
Episode 30, Reward: 18.00, Epsilon: 0.8604
Episode 40, Reward: 21.00, Epsilon: 0.8183
Episode 50, Reward: 15.00, Epsilon: 0.7783
Episode 60, Reward: 23.00, Epsilon: 0.7403
Episode 70, Reward: 9.00, Epsilon: 0.7041
Episode 80, Reward: 12.00, Epsilon: 0.6696
Episode 90, Reward: 12.00, Epsilon: 0.6369
Episode 100, Reward: 16.00, Epsilon: 0.6058
Episode 110, Reward: 21.00, Epsilon: 0.5762
Episode 120, Reward: 12.00, Epsilon: 0.5480
Episode 130, Reward: 15.00, Epsilon: 0.5212
Episode 140, Reward: 13.00, Epsilon: 0.4957
Episode 150, Reward: 12.00, Epsilon: 0.4715
Episode 160, Reward: 10.00, Epsilon: 0.4484
Episode 170, Reward: 18.00, Epsilon: 0.4265
Episode 180, Reward: 16.00, Epsilon: 0.4057
Episode 190, Reward: 9.00, Epsilon: 0.3858
Episode 200, Reward: 15.00, Epsilon: 0.3670
Episode 210, Reward: 13.00, Epsilon: 0.3490
Episode 220, Reward: 11.00, Epsilon: 0.3320
E