In [1]:
!pip install wandb
import wandb
# Replace with your actual API key
api_key = "8f58df9a66485e9ea9149b8b599cb14eb71832dc"

# Login to Weights & Biases
wandb.login(key=api_key)

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)
import base64
import io  # For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob



device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

env = gym.make('Acrobot-v1', new_step_api=True)  # Set new_step_api=True
env.reset(seed=0)  # Use reset instead of seed
print('observation space:', env.observation_space)
print('action space:', env.action_space)


wandb.init(project="acrobot-reinforcebbbbb", config={  # Initialize wandb first
    "hidden_size": 32,  # Default values
    "n_episodes": 1000,
    "max_t": 1000,
    "gamma": 0.99,
    "print_every": 100,
    "lr": 1e-2,
})

config = {
    "hidden_size": wandb.config.hidden_size,  # Access wandb.config after initialization
    "n_episodes": wandb.config.n_episodes,
    "max_t": wandb.config.max_t,
    "gamma": wandb.config.gamma,
    "print_every": wandb.config.print_every,
    "lr": wandb.config.lr,
}

class Policy(nn.Module):
    def __init__(self, state_size=6, action_size=3, hidden_size=config["hidden_size"]):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        # we just consider 1 dimensional probability of action
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)

class Baseline(nn.Module):
    def __init__(self, state_size, hidden_size=32):
        super(Baseline, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        return x


def reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=1000, max_t=1000, gamma=0.99, print_every=100):
    scores_deque = deque(maxlen=100)
    scores = []
    for e in range(1, n_episodes):
        saved_log_probs = []
        rewards = []
        states = []
        state = env.reset()
        for t in range(max_t):
            states.append(state)
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, truncated, _ = env.step(action)  # Unpack five values
            rewards.append(reward)
            if done or truncated:  # Check for both done and truncated
                break
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        discounts = [gamma ** i for i in range(len(rewards) + 1)]
        R = sum([a * b for a, b in zip(discounts, rewards)])

        # Calculate the baseline
        baseline_values = []
        for state_in_trajectory in states:
            state_tensor = torch.from_numpy(state_in_trajectory).float().unsqueeze(0).to(device)
            baseline_value = baseline(state_tensor)
            baseline_values.append(baseline_value)
        baseline_values = torch.cat(baseline_values, dim=0).requires_grad_()

        # Calculate the policy loss
        policy_loss = []
        for log_prob, baseline_value in zip(saved_log_probs, baseline_values):
            policy_loss.append(-(log_prob * (R - baseline_value)))
        policy_loss = torch.cat(policy_loss).sum()

        # Update the policy
        optimizer.zero_grad()
        policy_loss.backward(retain_graph=True)
        optimizer.step()

        # Update the baseline
        baseline_optimizer.zero_grad()
        baseline_loss = ((baseline_values - R) ** 2).mean()
        baseline_loss.backward()
        baseline_optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
        if e % 100 == 0 and np.mean(scores_deque) >= -100:
            break

    return scores



wandb.init(project="acrobot-reinforcebbbbb", config=config)  # Initialize wandb

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
baseline = Baseline(state_size=env.observation_space.shape[0]).to(device)
baseline_optimizer = optim.Adam(baseline.parameters(), lr=1e-2)
scores = reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=2000)
wandb.finish()  # Finish wandb run

[34m[1mwandb[0m: Currently logged in as: [33mbhavik-160990105023[0m. Use [1m`wandb login --relogin`[0m to force relogin


cpu
observation space: Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)
action space: Discrete(3)


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: -449.01
Episode 200	Average Score: -309.61
Episode 300	Average Score: -203.08
Episode 400	Average Score: -132.32
Episode 500	Average Score: -106.03
Episode 600	Average Score: -91.28


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
sweep_config = {
    'method': 'grid',
    'metric': {
      'name': 'Average Reward',
      'goal': 'maximize'
    },
    'parameters': {
        'hidden_size': {
            'values': [32,64,128]
        },
        'max_t': {
            'values': [500, 1000, 1500]
        },
        'lr': {
            'values': [1e-5, 1e-3,1e-4]
        }
    }
}

In [None]:
# import wandb

# def train(config=None):
#     with wandb.init(config=config):
#         config = wandb.config

#         # Define the hyperparameters
#         hidden_size = config.hidden_size
#         n_episodes = config.n_episodes
#         max_t = config.max_t
#         lr = config.lr

#         # Initialize the policy and optimizer
#         policy = Policy(hidden_size=hidden_size).to(device)
#         optimizer = optim.Adam(policy.parameters(), lr=lr)

#         def reinforce(policy, optimizer, n_episodes=n_episodes, max_t=max_t, gamma=1.0, print_every=100):
#             scores_deque = deque(maxlen=100)
#             scores = []

#             for e in range(1, n_episodes+1):
#                 saved_log_probs = []
#                 rewards = []
#                 state = env.reset()

#                 # Collect trajectory
#                 for t in range(max_t):
#                     # Sample the action from current policy
#                     action, log_prob = policy.act(state)
#                     saved_log_probs.append(log_prob)
#                     state, reward, done, _ = env.step(action)
#                     rewards.append(reward)
#                     if done:
#                         break

#                 # Calculate the total regret
#                 scores_deque.append(sum(rewards))
#                 scores.append(sum(rewards))

#                 # Recalculate the total regret applying discounted factor
#                 discounts = [gamma ** i for i in range(len(rewards) + 1)]
#                 R = sum([a * b for a, b in zip(discounts, rewards)])

#                 # Calculate the loss
#                 policy_loss = []
#                 for log_prob in saved_log_probs:
#                     # Minimizing the regret instead of maximizing the reward
#                     policy_loss.append(log_prob * R)
#                 policy_loss = torch.cat(policy_loss).sum()

#                 # Backpropagation
#                 optimizer.zero_grad()
#                 policy_loss.backward()
#                 optimizer.step()

#                 if e % print_every == 0:
#                     print(f'Episode {e}\tAverage Regret: {np.mean(scores_deque):.2f}')

#                 wandb.log({"regret": np.mean(scores_deque)})

#             return scores

#         scores = reinforce(policy, optimizer)
#         wandb.run.summary["best_regret"] = np.min(scores)

# # Run the sweep
# sweep_id = wandb.sweep(sweep_config, entity="bhavik-160990105023", project="assignment2")
# wandb.agent(sweep_id, train, count=20)

In [None]:
# sweep_config = {
#     'method': 'random',
#     'metric': {
#       'name': 'final_regret',
#       'goal': 'minimize'
#     },
#     'parameters': {
#         'hidden_size':{
#             'values': [1000, 2000, 3000]
#         },
#         'n_episodes': {
#             'values': [0, 0.0005, 0.005]
#         },
#         'max_t': {
#             'values': [500, 1000, 1500]
#         },
#         'lr': {
#             'values': [1e-2, 1e-3, 1e-4]
#         }
#     }
# }

In [None]:
# Initialize a new sweep
sweep_id = wandb.sweep(sweep_config, entity="bhavik-160990105023", project="acrobot-reinforcebbbbb")

Create sweep with ID: ntoty7hf
Sweep URL: https://wandb.ai/bhavik-160990105023/acrobot-reinforcebbbbb/sweeps/ntoty7hf


In [None]:
# def train(config=None):
#     # Set default values for hyperparameters
#     default_config = {
#         "hidden_size": 32,
#         "n_episodes": 1000,
#         "max_t": 1000,
#         "gamma": 1.0,
#         "print_every": 100,
#         "lr": 1e-2,
#     }

#     # Initialize a new wandb run
#     run = wandb.init(config=config, reinit=True)

#     # If config is None, use the default values
#     if config is None:
#         run.config.update(default_config, allow_val_change=True)
#         config = run.config

#     # Initialize the environment and seed
#     env = gym.make('Acrobot-v1')
#     env.seed(0)

#     # Create the policy network with the specified hyperparameters
#     policy = Policy(state_size=6, action_size=3, hidden_size=config.hidden_size).to(device)
#     optimizer = optim.Adam(policy.parameters(), lr=config.lr)

#     # Run the REINFORCE algorithm with the specified hyperparameters
#     scores = reinforce(policy, optimizer, n_episodes=config.n_episodes, max_t=config.max_t, gamma=config.gamma, print_every=config.print_every)

#     # Log the final score as a summary metric
#     run.summary["final_score"] = np.mean(scores[-100:])

#     # Finish the wandb run
#     run.finish()

#     # Return any necessary values or metrics
#     return np.mean(scores[-100:])

In [None]:
# def train(config=None):
#     # Set default values for hyperparameters
#     default_config = {
#         "hidden_size": 32,
#         "n_episodes": 2000,
#         "max_t": 1000,
#         "gamma": 1.0,
#         "print_every": 100,
#         "lr": 1e-2,
#     }

#     # Initialize a new wandb run
#     run = wandb.init(config=config, reinit=True)

#     # If config is None, use the default values
#     if config is None:
#         config = default_config

#     run.config.update(config, allow_val_change=True)

#     # Initialize the environment and seed
#     env = gym.make('Acrobot-v1')
#     env.seed(0)

#     # Create the policy network with the specified hyperparameters
#     policy = Policy(state_size=6, action_size=3, hidden_size=config['hidden_size']).to(device)
#     optimizer = optim.Adam(policy.parameters(), lr=config['lr'])

#     # Run the REINFORCE algorithm with the specified hyperparameters
#     scores = reinforce(policy, optimizer, n_episodes=config['n_episodes'], max_t=config['max_t'], gamma=config['gamma'], print_every=config['print_every'])

#     # Log the final score as a summary metric
#     run.summary["final_score"] = np.mean(scores[-100:])

#     # Finish the wandb run
#     run.finish()

#     # Return any necessary values or metrics
#     return np.mean(scores[-100:])

In [None]:
def train(config=None):
    # Set default values for hyperparameters
    default_config = {
        "hidden_size": 32,
        "n_episodes": 1000,
        "max_t": 1000,
        "gamma": 1.0,
        "print_every": 100,
        "lr": 1e-2,
        "baseline_lr": 1e-2,
    }

    # Initialize a new wandb run
    run = wandb.init(config=config, reinit=True)

    # If config is None, use the default values
    if config is None:
        run.config.update(default_config, allow_val_change=True)
    config = run.config

    # Initialize the environment and seed
    env = gym.make('Acrobot-v1')
    env.seed(0)

    # Get the state and action sizes for the Acrobot-v1 environment
    state_size = env.observation_space.shape[0]  # 6 dimensions
    action_size = env.action_space.n  # 3 actions

    # Create the policy network with the specified hyperparameters
    policy = Policy(state_size=state_size, action_size=action_size, hidden_size=config.hidden_size).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=config.lr)

    # Create the baseline network with the specified hyperparameters
    baseline = Baseline(state_size=state_size).to(device)
    baseline_optimizer = optim.Adam(baseline.parameters(), lr=config.baseline_lr)

    # Run the REINFORCE algorithm with the specified hyperparameters
    scores = reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=config.n_episodes, max_t=config.max_t, gamma=config.gamma, print_every=config.print_every)

    # Check if the environment is solved
    if np.mean(scores[-100:]) >= -100:  # Adjust the threshold as needed
        print(f'Environment {env.unwrapped.spec.id} solved in {e - 100:d} episodes!\tAverage Score: {np.mean(scores_deque):.2f}')





    # Log the final score as a summary metric
    run.summary["final_score"] = np.mean(scores[-100:])

    # Finish the wandb run
    run.finish()

    # Return any necessary values or metrics
    return np.mean(scores[-100:])

In [None]:
sweep_id = wandb.sweep(sweep_config, project="acrobot-reinforcebbbbb")
wandb.agent(sweep_id, train, count=50)

Create sweep with ID: 3glra774
Sweep URL: https://wandb.ai/bhavik-160990105023/acrobot-reinforce/sweeps/3glra774


[34m[1mwandb[0m: Agent Starting Run: f7c2beo7 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 500


  deprecation(
  deprecation(
  deprecation(


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run f7c2beo7 errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/agents/pyagent.py", line 308, in _run_job
    self._function()
  File "<ipython-input-9-e488ce5cf39c>", line 38, in train
    scores = reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=config.n_episodes, max_t=config.max_t, gamma=config.gamma, print_every=config.print_every)
  File "<ipython-input-5-6bda22063399>", line 91, in reinforce
    state, reward, done, truncated, _ = env.step(action)  # Unpack five values
ValueError: not enough values to unpack (expected 5, got 4)

[34m[1mwandb[0m: [32m[41mERROR[0m Run f7c2beo7 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.10/dist-packages/wandb/agents/pyagent.py", line 308, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ip



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [None]:
# wandb agent <sweep_id>