In [1]:
'''
Installing packages for rendering the game on Colab
'''

!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1
!pip install gym[classic_control]

Collecting setuptools
  Downloading setuptools-69.2.0-py3-none-any.whl (821 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.5/821.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 67.7.2
    Uninstalling setuptools-67.7.2:
      Successfully uninstalled setuptools-67.7.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0mSuccessfully installed setuptools-69.2.0


Collecting pygame==2.1.0 (from gym[classic_control])
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
!pip install wandb
import wandb
# Replace with your actual API key
api_key = "8f58df9a66485e9ea9149b8b599cb14eb71832dc"

# Login to Weights & Biases
wandb.login(key=api_key)

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)


In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp
import wandb

In [None]:

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


In [None]:


'''
List of example environments
(Source - https://gym.openai.com/envs/#classic_control)
'Acrobot-v1'
'Cartpole-v1'

'''
# NOTE(review): the list above spells the id 'Cartpole-v1' while gym.make below
# uses 'CartPole-v1' — presumably the latter is the correct spelling; confirm.

# Create the environment used by the rest of the notebook and seed it.
env = gym.make('CartPole-v1')
env.seed(0)

# Length of the first axis of the observation space and the number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

# Uncomment to inspect the sizes and a randomly sampled action:
# print(state_shape)
# print(no_of_actions)
# print(env.action_space.sample())
# print("----")

'''
# Understanding State, Action, Reward Dynamics

The agent decides an action to take depending on the state.

The Environment keeps a variable specifically for the current state.
- Everytime an action is passed to the environment, it calculates the new state and updates the current state variable.
- It returns the new current state and reward for the agent to take the next action

'''

# NOTE(review): `state = env.reset()` and the 4-value unpacking of env.step(...)
# below assume a gym release whose reset() returns only the observation and
# whose step() returns (obs, reward, done, info) — confirm the installed version.
state = env.reset()
''' This returns the initial state (when environment is reset) '''

# print(state)
# print("----")

action = env.action_space.sample()
''' We take a random action now '''

# print(action)
# print("----")

# This advances the shared `env` by one step; dqn() calls env.reset() at the
# start of every episode, so training does not continue from this state.
next_state, reward, done, info = env.step(action)
''' env.step is used to calculate new state and obtain reward based on old state and action taken  '''

# print(next_state)
# print(reward)
# print(done)
# print(info)
# print("----")

# Recomputed sizes (state_shape repeats the assignment made earlier in this cell).
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
state_size = state_shape


In [None]:

# Define the Dueling DQN model
class DuelingDQN(nn.Module):
    """Q-network with a dueling head.

    Two fully connected ReLU layers form a shared trunk. The trunk output feeds
    an advantage head (one output per action) and a state-value head (a single
    output), which are combined as ``Q = V + (A - mean(A))``.

    Args:
        input_size: Number of features in a state vector.
        output_size: Number of actions (width of the advantage head).
        fc1_units: Width of the first hidden layer.
        fc2_units: Width of the second hidden layer.
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
    """

    def __init__(self, input_size, output_size, fc1_units, fc2_units, seed):
        super().__init__()
        # Seed first so the layer initialisation below is reproducible.
        self.seed = torch.manual_seed(seed)

        # Shared trunk.
        self.fc1 = nn.Linear(in_features=input_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)

        # Separate heads on top of the trunk.
        self.advantage = nn.Linear(in_features=fc2_units, out_features=output_size)
        self.value = nn.Linear(in_features=fc2_units, out_features=1)

    def forward(self, state):
        """Return the Q-value of every action for ``state`` (last axis = actions)."""
        hidden = torch.relu(self.fc1(state))
        hidden = torch.relu(self.fc2(hidden))

        adv = self.advantage(hidden)
        # Centre the advantages over the action axis before adding the state value.
        centred_adv = adv - adv.mean(dim=-1, keepdim=True)
        return self.value(hidden) + centred_adv


In [None]:
# Define the ReplayBuffer class
class ReplayBuffer:
    """Fixed-size FIFO store of experience tuples with uniform random sampling.

    Args:
        action_size: Number of actions (stored for reference only).
        buffer_size: Maximum number of experiences kept; the oldest entries are
            dropped once the buffer is full.
        batch_size: Number of experiences returned by ``sample``.
        seed: Seed applied to Python's global ``random`` module, which drives
            ``sample``.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so store the seed value itself rather than
        # the call's result.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences uniformly at random, without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors on
            the notebook's global ``device``. Each has one row per sampled
            experience; ``actions`` is a long tensor, the others are float.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` experiences
                (raised by ``random.sample``).
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stack(field):
            # One row per experience, in sampling order.
            return np.vstack([getattr(e, field) for e in batch])

        states = torch.from_numpy(stack("state")).float().to(device)
        actions = torch.from_numpy(stack("action")).long().to(device)
        rewards = torch.from_numpy(stack("reward")).float().to(device)
        next_states = torch.from_numpy(stack("next_state")).float().to(device)
        # Booleans are converted to 0/1 before becoming a float mask.
        dones = torch.from_numpy(stack("done").astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)


In [None]:
# Define the agent
class Agent():
    """Dueling-DQN agent with a replay buffer and a periodically synced target network.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        fc1_units: Width of the first hidden layer of both networks.
        fc2_units: Width of the second hidden layer of both networks.
        buffer_size: Capacity of the replay buffer.
        batch_size: Minibatch size used for every learning step.
        lr: Learning rate of the Adam optimizer.
        update_every: Number of ``step`` calls between target-network syncs.
        gamma: Discount factor.
        eps_end: Stored on the instance; not read by any method of this class.
        eps_decay: Stored on the instance; not read by any method of this class.
        seed: Seed for Python's ``random`` module and for both networks.
    """

    def __init__(self, state_size, action_size, fc1_units, fc2_units, buffer_size, batch_size, lr, update_every, gamma, eps_end, eps_decay, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.lr = lr
        self.update_every = update_every
        self.gamma = gamma
        # random.seed() returns None, so keep the seed value itself.
        random.seed(seed)
        self.seed = seed
        # Both networks are built with the same seed.
        self.q_network = DuelingDQN(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.target_network = DuelingDQN(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Counts step() calls modulo update_every.
        self.t_step = 0
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a minibatch, and sync the target network when due."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn on every call once enough experiences exist for a full batch.
        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Hard-copy the online weights into the target network every update_every calls.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

    def act(self, state, eps=0.0):
        """Choose an action epsilon-greedily.

        Args:
            state: NumPy array holding a single state.
            eps: Probability of picking a uniformly random action.

        Returns:
            Index of the chosen action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_network.eval()
        with torch.no_grad():
            action_values = self.q_network(state)
        self.q_network.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Run one gradient step on a minibatch of experiences.

        The TD target is ``r + gamma * max_a' Q_target(s', a') * (1 - done)``; the
        loss is the mean squared error between it and ``Q(s, a)``.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                as returned by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, next_states, dones = experiences
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.q_network(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Log a plain Python float rather than a graph-attached tensor.
        wandb.log({'train loss': loss.item()})
        self.optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-1, 1].
        nn.utils.clip_grad_value_(self.q_network.parameters(), clip_value=1.0)
        self.optimizer.step()


In [None]:
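# Earlier version of the DQN training loop (no regret tracking). It is superseded by the dqn() defined in the next cell and is kept commented out for reference only.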
# def dqn(agent, env, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
#     scores_window = deque(maxlen=100)
#     episode_list_epsgrdy = []
#     average_scores_epsgrdy = []
#     eps = eps_start

#     for i_episode in range(1, n_episodes+1):
#         state = env.reset()
#         score = 0

#         for t in range(max_t):
#             action = agent.act(state, eps)
#             next_state, reward, done, _ = env.step(action)
#             agent.step(state, action, reward, next_state, done)
#             state = next_state
#             score += reward

#             if done:
#                 break

#         scores_window.append(score)
#         average_score = np.mean(scores_window)
#         episode_list_epsgrdy.append(i_episode)
#         average_scores_epsgrdy.append(average_score)
#         eps = max(eps_end, eps_decay*eps)
#         print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

#         if i_episode % 100 == 0:
#             print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

#         if average_score >= 195.0:
#             print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average_score))
#             break



#     return episode_list_epsgrdy, average_scores_epsgrdy


In [None]:
def dqn(agent, env, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train ``agent`` on ``env`` with an epsilon-greedy policy.

    Regret is estimated from the online network's own Q-values: at every step it
    is the gap between the greedy action's Q-value and the Q-value of the action
    actually taken, so it is zero whenever the agent acts greedily. The
    environment is stepped exactly once per timestep; probing the greedy action
    with a second ``env.step`` call would advance the real episode and corrupt
    the trajectory.

    Args:
        agent: Object exposing ``act(state, eps)``, ``step(...)`` and ``q_network``.
        env: Environment whose ``reset()`` returns a state and whose ``step(a)``
            returns ``(next_state, reward, done, info)``.
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Initial exploration rate.
        eps_end: Lower bound on the exploration rate.
        eps_decay: Multiplicative per-episode decay of the exploration rate.

    Returns:
        Tuple ``(episode_list, average_scores, cumulative_regret)``: the episode
        indices, the running mean score over the last 100 episodes after each
        episode, and the regret estimate summed over all episodes.
    """
    scores_window = deque(maxlen=100)
    episode_list_epsgrdy = []
    average_scores_epsgrdy = []
    cumulative_regret = 0.0
    eps = eps_start
    # Evaluate Q-values on whatever device the online network lives on.
    q_device = next(agent.q_network.parameters()).device

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        regret = 0.0  # regret accumulated within this episode

        for _ in range(max_t):
            action = agent.act(state, eps)

            # Q-value gap between the greedy action and the chosen one.
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(q_device)
                q_values = agent.q_network(state_tensor).squeeze(0)
            regret += (q_values.max() - q_values[int(action)]).item()

            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break

        cumulative_regret += regret
        scores_window.append(score)
        average_score = np.mean(scores_window)
        episode_list_epsgrdy.append(i_episode)
        average_scores_epsgrdy.append(average_score)
        # Log scalars only (not the whole growing history). 'cumulative_regret'
        # is the metric name the sweep configuration optimises.
        wandb.log({'average_score': float(average_score), 'cumulative_regret': cumulative_regret})
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

        # NOTE(review): the solved threshold is hard-coded; confirm it is the
        # intended target for the environment in use.
        if average_score >= 195.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average_score))
            break

        eps = max(eps_end, eps_decay * eps)

    return episode_list_epsgrdy, average_scores_epsgrdy, cumulative_regret

In [None]:

# Start a Weights & Biases run under the "DeulingDQN" project.
# NOTE(review): main() opens its own run with wandb.init() for every sweep trial,
# so this extra run may be unnecessary — confirm. The project name looks like a
# misspelling of "DuelingDQN"; the same spelling is passed to wandb.sweep, so
# rename both together if at all.
wandb.init(project="DeulingDQN")

In [None]:
# Sweep configuration
# Candidate values for every swept hyperparameter. A one-element list pins the
# value. Trailing comments give the single value used before the sweep was widened.
_search_space = {
    'state_size': [4],
    'action_size': [2],
    'BUFFER_SIZE': [10_000, 100_000, 1_000_000],   # previously int(1e5)
    'BATCH_SIZE': [32, 64, 128, 256],              # previously 64
    'LR': [0.1, 0.01, 0.001, 0.0001],              # previously 5e-4
    'UPDATE_EVERY': [4, 6, 10, 20],                # previously 20
    'fc1_units': [64, 128, 256],                   # previously 128
    'fc2_units': [64, 128, 256],                   # previously 64
    'eps_start': [1],
    'eps_end': [0.01, 0.05, 0.1],                  # previously 0.01
    'eps_decay': [0.9, 0.95, 0.99, 0.995, 0.999],  # previously 0.995
    'gamma': [0.99],
    'n_episodes': [1000, 2000, 5000],              # previously 10000
    'max_t': [500, 1000, 2000],                    # previously 1000
}

# Random search that minimises the "cumulative_regret" metric.
sweep_config = dict(
    method="random",
    metric=dict(goal="minimize", name="cumulative_regret"),
    parameters={name: {'values': choices} for name, choices in _search_space.items()},
)
# Register the sweep defined by sweep_config with Weights & Biases; the returned
# id is what the wandb.agent(...) call later in this notebook is given.
sweep_id = wandb.sweep(sweep=sweep_config, project='DeulingDQN')

In [None]:


# Define the main function for hyperparameter tuning
def main():
    """Run one sweep trial: build an agent from the sweep's config, train it, log the results.

    Reads the hyperparameters W&B selected for this trial, trains an ``Agent``
    on the notebook's global ``env`` with ``dqn``, prints the wall-clock
    training time, and logs the final running-average score together with the
    cumulative regret.
    """
    with wandb.init() as run:
        # Hyperparameters chosen by the sweep for this run.
        config = run.config

        begin_time = datetime.datetime.now()

        # Create the agent with the hyperparameters (seed fixed at 0 for every trial).
        agent = Agent(
            state_size=config.state_size,
            action_size=config.action_size,
            fc1_units=config.fc1_units,
            fc2_units=config.fc2_units,
            buffer_size=config.BUFFER_SIZE,
            batch_size=config.BATCH_SIZE,
            lr=config.LR,
            update_every=config.UPDATE_EVERY,
            gamma=config.gamma,
            eps_end=config.eps_end,
            eps_decay=config.eps_decay,
            seed=0,
        )

        # Train the agent and collect the score history and the regret.
        _, average_scores_epsgrdy, cumulative_regret = dqn(
            agent,
            env,
            n_episodes=config.n_episodes,
            max_t=config.max_t,
            eps_start=config.eps_start,
            eps_end=config.eps_end,
            eps_decay=config.eps_decay,
        )

        time_taken = datetime.datetime.now() - begin_time
        print(time_taken)

        # Log scalars under distinct keys: the last running average (not the
        # whole history list) and the regret under the metric name the sweep
        # optimises ("cumulative_regret").
        final_average = float(average_scores_epsgrdy[-1]) if average_scores_epsgrdy else float("nan")
        run.log({"Average Score": final_average, "cumulative_regret": cumulative_regret})



In [None]:

# Run the sweep: the agent calls main() for up to 10 trials of the sweep
# identified by sweep_id.
wandb.agent(sweep_id, function=main, count=10)

# Close any W&B run that is still active once the sweep trials are done.
wandb.finish()