In [1]:
'''
Installing packages for rendering the game on Colab
'''

!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1
!pip install gym[classic_control]



In [2]:
!pip install wandb
import wandb
# Replace with your actual API key
api_key = "17dab9d1bbdc37c41831799a4b0b50d3e97400c5"

# Login to Weights & Biases
wandb.login(key=api_key)
project_name = 'D1AS1'

Collecting wandb
  Using cached wandb-0.16.6-py3-none-any.whl (2.2 MB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Using cached GitPython-3.1.43-py3-none-any.whl (207 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Using cached sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)
  Using cached gitdb-4.0.11-py3-none-any.whl (62 kB)
Installing collected packages: sentry-sdk, gitdb, docker-pycreds, GitPython, wandb
Successfully installed GitPython-3.1.43 docker-pycreds-0.4.0 gitdb-4.0.11 sentry-sdk-1.44.1 wandb-0.16.6


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp
import wandb

In [None]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:

# Define the Dueling DQN model
class DuelingDQN(nn.Module):
    def __init__(self, input_size, output_size, fc1_units, fc2_units, seed):
        super(DuelingDQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(input_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.advantage = nn.Linear(fc2_units, output_size)
        self.value = nn.Linear(fc2_units, 1)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        advantage = self.advantage(x)
        value = self.value(x)
        q_values = value + (advantage - advantage.mean(dim=-1, keepdim=True))
        return q_values


In [None]:
# Define the ReplayBuffer class
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)


In [None]:
# Define the agent
class Agent():
    def __init__(self, state_size, action_size, fc1_units, fc2_units, buffer_size, batch_size, lr, update_every, gamma, eps_end, eps_decay, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.lr = lr
        self.update_every = update_every
        self.gamma = gamma
        self.seed = random.seed(seed)
        self.q_network = DuelingDQN(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.target_network = DuelingDQN(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        ''' Replay memory '''
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_network.eval()
        with torch.no_grad():
            action_values = self.q_network(state)
        self.q_network.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.q_network(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # wandb.log({'train loss': loss})
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.q_network.parameters():
            param.grad.data.clamp_(-1, 1)  # Gradient clipping
        self.optimizer.step()


In [None]:
def dqn(agent, env, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores_window = deque(maxlen=100)
    episode_list_epsgrdy = []
    average_scores_epsgrdy = []
    average_regret_epsgrdy = []
    cumulative_regret_epsgrdy = []
    cummulative_regret_window=[]
    regret_window = []
    cumulative_regret = 0  # Initialize cumulative regret
    eps = eps_start

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        regret = 0  # Initialize regret for this episode
        # cumulative_regret = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            optimal_action = np.argmax(agent.q_network(torch.from_numpy(state).float().unsqueeze(0)).cpu().data.numpy())
            optimal_reward = env.step(optimal_action)[1]  # Get the reward for the optimal action
            regret += optimal_reward - reward  # Calculate regret for this time step
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break


        scores_window.append(score)
        average_score = np.mean(scores_window)
        average_scores_epsgrdy.append(average_score)


        regret_window.append(regret)
        average_regret = np.mean(regret_window)
        average_regret_epsgrdy.append(average_regret)

        cumulative_regret += average_regret  # Update cumulative regret

        # cummulative_regret_window.append(cumulative_regret)
        # cummulative_average_regret = np.mean(cummulative_regret_window)
        cummulative_average_regret = average_regret
        cumulative_regret_epsgrdy.append(cumulative_regret)

        episode_list_epsgrdy.append(i_episode)
        wandb.log({'average_score': average_score})
        wandb.log({'average_regret': average_regret})
        wandb.log({"cummulative_regret": cumulative_regret})

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tRegret: {:.2f}'.format(i_episode, average_score, regret))

        # if np.mean(scores_window)>=195.0:
        #    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        #    break

        if i_episode % 100 == 0  and np.mean(scores_window) >= -100:
            wandb.log({"episode_no": i_episode})
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            break
        eps = max(eps_end, eps_decay*eps)

    return episode_list_epsgrdy, average_scores_epsgrdy, average_regret_epsgrdy

In [None]:

# Initialize wandb with your project name
wandb.init(project=project_name)

In [None]:
env = gym.make('Acrobot-v1')
env.seed(0)
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n

In [None]:
# Sweep configuration
sweep_config = {
    "method": "grid",
    "metric": {"goal": "minimize", "name": "cumulative_regret"},
    "parameters": {
        'state_size': {
            'values': [state_shape]
        },
        'action_size': {
            'values': [action_shape]
        },
        'BUFFER_SIZE': {
            'values': [int(1e5)]
            # 'values': [int(1e4), int(1e5), int(1e6)]
        },
        'BATCH_SIZE': {
            'values': [32, 64]
            # 'values': [32, 64, 128, 256]
        },
        'LR': {
            'values': [0.001, 0.0001]
            # 'values': [0.1, 0.01, 0.001, 0.0001]
        },
        'UPDATE_EVERY': {
            'values': [10, 20]
            # 'values': [4, 6, 10, 20]
        },
        'fc1_units': {
            # 'values': [128]
            'values': [64, 128]
            # 'values': [64, 128, 256]
        },
        'fc2_units': {
            # 'values': [64]
            'values': [64, 128]
            # 'values': [64, 128, 256]
        },
        'eps_start': {
            'values': [1]
        },
        'eps_end': {
            'values': [0.01]
            # 'values': [0.01, 0.05, 0.1]
        },
        'eps_decay': {
            'values': [0.99, 0.995]
            # 'values': [0.9, 0.95, 0.99, 0.995, 0.999]
        },
        'gamma': {
            'values': [0.99]
        },
        'n_episodes': {
            'values': [2000]
            # 'values': [1000, 2000, 5000]
        },
        'max_t': {
            'values': [500, 1000]
            # 'values': [500, 1000, 2000]
        },
    }
}
sweep_id = wandb.sweep(sweep=sweep_config, project=project_name)
max_sweep_run = 14 #update it

In [None]:


# Define the main function for hyperparameter tuning
def main():

    with wandb.init() as run:
        # Get the hyperparameters for this run
        config = wandb.config

        begin_time = datetime.datetime.now()



        # Create the agent with the hyperparameters
        agent = Agent(state_size=config.state_size, action_size=config.action_size, fc1_units=config.fc1_units, fc2_units=config.fc2_units, buffer_size=config.BUFFER_SIZE, batch_size=config.BATCH_SIZE, lr=config.LR, update_every=config.UPDATE_EVERY, gamma=config.gamma, eps_end=config.eps_end, eps_decay=config.eps_decay,seed = 0)

        # Train the agent and get the scores
        # episode_list_epsgrdy, average_scores_epsgrdy = dqn(agent, env, n_episodes=config.n_episodes, max_t=config.max_t, eps_start=config.eps_start, eps_end=config.eps_end, eps_decay=config.eps_decay)

        episode_list_epsgrdy, average_scores_epsgrdy, average_regret_epsgrdy = dqn(agent, env, n_episodes=config.n_episodes, max_t=config.max_t, eps_start=config.eps_start, eps_end=config.eps_end, eps_decay=config.eps_decay)

        time_taken = datetime.datetime.now() - begin_time

        print(time_taken)
        # Log the final average score to wandb
        # wandb.log({"Average Score": average_scores_epsgrdy})
        # wandb.log({"Average Score": cumulative_regret})



In [None]:

# # Run the sweeps for First Approach (Narrow down to effective parameters)
wandb.agent(sweep_id, function=main, count=max_sweep_run)

wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: kgzowanx with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 32
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.001
[34m[1mwandb[0m: 	UPDATE_EVERY: 10
[34m[1mwandb[0m: 	action_size: 3
[34m[1mwandb[0m: 	eps_decay: 0.99
[34m[1mwandb[0m: 	eps_end: 0.01
[34m[1mwandb[0m: 	eps_start: 1
[34m[1mwandb[0m: 	fc1_units: 64
[34m[1mwandb[0m: 	fc2_units: 64
[34m[1mwandb[0m: 	gamma: 0.99
[34m[1mwandb[0m: 	max_t: 500
[34m[1mwandb[0m: 	n_episodes: 2000
[34m[1mwandb[0m: 	state_size: 6
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
Exception in thread NetStatThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
Exception in thread IntMsgThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  Fil

  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: -87.67	Regret: 0.00

Environment solved in 100 episodes!	Average Score: -87.67
0:00:43.044288


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_regret,▁█▆▆▆▅▆▆▆▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
average_score,▁▃▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
cummulative_regret,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
episode_no,▁

0,1
average_regret,0.35
average_score,-87.67
cummulative_regret,38.66214
episode_no,100.0


[34m[1mwandb[0m: Agent Starting Run: ipkn5rvv with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 32
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.001
[34m[1mwandb[0m: 	UPDATE_EVERY: 10
[34m[1mwandb[0m: 	action_size: 3
[34m[1mwandb[0m: 	eps_decay: 0.99
[34m[1mwandb[0m: 	eps_end: 0.01
[34m[1mwandb[0m: 	eps_start: 1
[34m[1mwandb[0m: 	fc1_units: 64
[34m[1mwandb[0m: 	fc2_units: 64
[34m[1mwandb[0m: 	gamma: 0.99
[34m[1mwandb[0m: 	max_t: 1000
[34m[1mwandb[0m: 	n_episodes: 2000
[34m[1mwandb[0m: 	state_size: 6


Episode 100	Average Score: -96.25	Regret: 0.00

Environment solved in 100 episodes!	Average Score: -96.25
0:00:45.585323


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_regret,▁▁▁▁▃▄▂▄▆▆▆▇▇▆▅▇▇▇▇▇█████▇▇███▇▇▇▇▇▇▇▇▇▇
average_score,▁▁▁▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
cummulative_regret,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇██
episode_no,▁

0,1
average_regret,0.32
average_score,-96.25
cummulative_regret,26.27982
episode_no,100.0


[34m[1mwandb[0m: Agent Starting Run: n8lt085z with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 32
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.001
[34m[1mwandb[0m: 	UPDATE_EVERY: 10
[34m[1mwandb[0m: 	action_size: 3
[34m[1mwandb[0m: 	eps_decay: 0.99
[34m[1mwandb[0m: 	eps_end: 0.01
[34m[1mwandb[0m: 	eps_start: 1
[34m[1mwandb[0m: 	fc1_units: 64
[34m[1mwandb[0m: 	fc2_units: 128
[34m[1mwandb[0m: 	gamma: 0.99
[34m[1mwandb[0m: 	max_t: 500
[34m[1mwandb[0m: 	n_episodes: 2000
[34m[1mwandb[0m: 	state_size: 6


Episode 6	Average Score: -149.67