In [1]:
!pip install wandb
import wandb
# Replace with your actual API key
api_key = "8f58df9a66485e9ea9149b8b599cb14eb71832dc"

# Login to Weights & Biases
wandb.login(key=api_key)

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)
import base64
import io  # For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

env = gym.make('CartPole-v1')
env.seed(0)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

wandb.init(project="R_CartPole_baseline", config={  # Initialize wandb first
    "hidden_size": 32,  # Default values
    "n_episodes": 1000,
    "max_t": 1000,
    "gamma": 0.99,
    "print_every": 100,
    "lr": 1e-2,
})

config = {
    "hidden_size": wandb.config.hidden_size,  # Access wandb.config after initialization
    "n_episodes": wandb.config.n_episodes,
    "max_t": wandb.config.max_t,
    "gamma": wandb.config.gamma,
    "print_every": wandb.config.print_every,
    "lr": wandb.config.lr,
}

class Policy(nn.Module):
    def __init__(self, state_size=4, action_size=2, hidden_size=config["hidden_size"]):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        # we just consider 1 dimensional probability of action
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        if state.shape[1] != 4:  # Check if the state tensor has the expected shape
            state = state.view(1, 4)  # Reshape the state tensor to (1, 4)
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)


class Baseline(nn.Module):
    def __init__(self, state_size, hidden_size=32):
        super(Baseline, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        return x



def reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    scores_deque = deque(maxlen=100)
    scores = []
    for e in range(1, n_episodes):
        saved_log_probs = []
        rewards = []
        states = []
        state = env.reset()
        for t in range(max_t):
            states.append(state)
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        discounts = [gamma ** i for i in range(len(rewards) + 1)]
        R = sum([a * b for a, b in zip(discounts, rewards)])

        # Calculate the baseline
        baseline_values = []
        for state_in_trajectory in states:
            state_tensor = torch.from_numpy(state_in_trajectory).float().unsqueeze(0).to(device)
            baseline_value = baseline(state_tensor)
            baseline_values.append(baseline_value)
        baseline_values = torch.cat(baseline_values, dim=0).requires_grad_()

        # Calculate the policy loss
        policy_loss = []
        for log_prob, baseline_value in zip(saved_log_probs, baseline_values):
            policy_loss.append(-(log_prob * (R - baseline_value)))
        policy_loss = torch.cat(policy_loss).sum()

        # Update the policy
        optimizer.zero_grad()
        policy_loss.backward(retain_graph=True)  # Retain the computation graph
        optimizer.step()

        # Update the baseline
        baseline_optimizer.zero_grad()
        baseline_loss = ((baseline_values - R) ** 2).mean()
        baseline_loss.backward()
        baseline_optimizer.step()

        if e % print_every == 0:
            print('Episode {}\\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
        if np.mean(scores_deque) >= 195.0:
            print(f'Environment {env.unwrapped.spec.id} solved in {e - 100:d} episodes!\\tAverage Score: {np.mean(scores_deque):.2f}')
            break

    return scores

############################################For Regret minimization###################################

# def reinforce(policy, optimizer, n_episodes=config["n_episodes"], max_t=config["max_t"], gamma=config["gamma"], print_every=config["print_every"]):
#     scores_deque = deque(maxlen=100)
#     scores = []
#     for e in range(1, n_episodes):
#         # ... (the rest of the code remains the same)
#         if e % print_every == 0:
#             print('Episode {}\\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
#             regret = 500 - np.mean(scores_deque)  # Calculate regret
#             wandb.log({"Regret": regret})
#         if np.mean(scores_deque) >= 495.0:  # Adjust the threshold for early stopping
#             print('Environment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(e - 100, np.mean(scores_deque)))
#             break
#     wandb.run.summary["final_regret"] = 500 - np.mean(scores_deque)  # Log final regret
#     return scores




#############################################################################

wandb.init(project="R_CartPole_baseline", config=config)  # Initialize wandb

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
baseline = Baseline(state_size=env.observation_space.shape[0]).to(device)
baseline_optimizer = optim.Adam(baseline.parameters(), lr=1e-2)
scores = reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=2000)
wandb.finish()  # Finish wandb run

cpu
observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action space: Discrete(2)


  deprecation(
  deprecation(
  deprecation(
[34m[1mwandb[0m: Currently logged in as: [33mbhavik-160990105023[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  if not isinstance(terminated, (bool, np.bool8)):


Episode 100\tAverage Score: 29.38
Episode 200\tAverage Score: 64.28
Episode 300\tAverage Score: 115.99
Environment CartPole-v1 solved in 246 episodes!\tAverage Score: 195.18


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [3]:
# print(env.spec.reward_threshold)

  and should_run_async(code)


In [4]:
sweep_config = {
    'method': 'grid',
    'metric': {
      'name': 'Average Reward',
      'goal': 'maximize'
    },
    'parameters': {
        'hidden_size': {
            'values': [32,64,128]
        },
        'max_t': {
            'values': [500, 1000, 1500]
        },
        'lr': {
            'values': [1e-5, 1e-3,1e-4]
        }
    }
}

In [5]:
# Initialize a new sweep
sweep_id = wandb.sweep(sweep_config, entity="bhavik-160990105023", project="R_CartPole_baseline")

Create sweep with ID: thm46x8i
Sweep URL: https://wandb.ai/bhavik-160990105023/R_CartPole_baseline/sweeps/thm46x8i


In [6]:
def train(config=None):
    # Set default values for hyperparameters
    default_config = {
        "hidden_size": 32,
        "n_episodes": 1000,
        "max_t": 1000,
        "gamma": 1.0,
        "print_every": 100,
        "lr": 1e-2,
        "baseline_lr": 1e-2,
    }

    # Initialize a new wandb run
    run = wandb.init(config=config, reinit=True)

    # If config is None, use the default values
    if config is None:
        run.config.update(default_config, allow_val_change=True)
    config = run.config

    # Initialize the environment and seed
    env = gym.make('CartPole-v1')
    env.seed(0)

    # Create the policy network with the specified hyperparameters
    policy = Policy(state_size=4, action_size=2, hidden_size=config.hidden_size).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=config.lr)

    # Create the baseline network with the specified hyperparameters
    baseline = Baseline(state_size=4).to(device)
    baseline_optimizer = optim.Adam(baseline.parameters(), lr=config.baseline_lr)

    # Run the REINFORCE algorithm with the specified hyperparameters
    scores = reinforce(env, policy, optimizer, baseline, baseline_optimizer, n_episodes=config.n_episodes, max_t=config.max_t, gamma=config.gamma, print_every=config.print_every)

    # Log the final score as a summary metric
    run.summary["final_score"] = np.mean(scores[-100:])

    # Finish the wandb run
    run.finish()

    # Return any necessary values or metrics
    return np.mean(scores[-100:])

In [7]:
sweep_id = wandb.sweep(sweep_config, project="R_CartPole_baseline")
wandb.agent(sweep_id, train, count=50)

Create sweep with ID: 5w8xruix
Sweep URL: https://wandb.ai/bhavik-160990105023/R_CartPole_baseline/sweeps/5w8xruix


[34m[1mwandb[0m: Agent Starting Run: 9fs04x5x with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 500


  deprecation(
  deprecation(
  deprecation(


Episode 100\tAverage Score: 18.98
Episode 200\tAverage Score: 17.61
Episode 300\tAverage Score: 19.51
Episode 400\tAverage Score: 17.40
Episode 500\tAverage Score: 16.19
Episode 600\tAverage Score: 18.95
Episode 700\tAverage Score: 18.32
Episode 800\tAverage Score: 19.38
Episode 900\tAverage Score: 16.85


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,18.63


[34m[1mwandb[0m: Agent Starting Run: 59fbo7u5 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 19.12
Episode 200\tAverage Score: 21.55
Episode 300\tAverage Score: 21.29
Episode 400\tAverage Score: 21.81
Episode 500\tAverage Score: 21.06
Episode 600\tAverage Score: 22.54
Episode 700\tAverage Score: 20.48
Episode 800\tAverage Score: 21.14
Episode 900\tAverage Score: 19.91


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.53


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9fkqmblf with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 23.83
Episode 200\tAverage Score: 27.18
Episode 300\tAverage Score: 26.12
Episode 400\tAverage Score: 27.09
Episode 500\tAverage Score: 23.73
Episode 600\tAverage Score: 25.85
Episode 700\tAverage Score: 22.77
Episode 800\tAverage Score: 24.87
Episode 900\tAverage Score: 26.41


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,24.83


[34m[1mwandb[0m: Agent Starting Run: 566el5dm with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 17.70
Episode 200\tAverage Score: 21.15
Episode 300\tAverage Score: 23.83
Episode 400\tAverage Score: 28.27
Episode 500\tAverage Score: 42.85
Episode 600\tAverage Score: 68.90
Episode 700\tAverage Score: 127.30
Environment CartPole-v1 solved in 658 episodes!\tAverage Score: 195.07


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.07


[34m[1mwandb[0m: Agent Starting Run: id25re02 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 21.22
Episode 200\tAverage Score: 23.61
Episode 300\tAverage Score: 31.23
Episode 400\tAverage Score: 42.00
Episode 500\tAverage Score: 51.29
Episode 600\tAverage Score: 62.62
Episode 700\tAverage Score: 86.58
Episode 800\tAverage Score: 144.34
Environment CartPole-v1 solved in 749 episodes!\tAverage Score: 196.44


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,196.44


[34m[1mwandb[0m: Agent Starting Run: er0gvp2m with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 24.17
Episode 200\tAverage Score: 26.10
Episode 300\tAverage Score: 33.16
Episode 400\tAverage Score: 54.27
Episode 500\tAverage Score: 83.45
Episode 600\tAverage Score: 154.09
Environment CartPole-v1 solved in 532 episodes!\tAverage Score: 195.80


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.8


[34m[1mwandb[0m: Agent Starting Run: 8os3scvo with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 21.52
Episode 200\tAverage Score: 23.58
Episode 300\tAverage Score: 22.80
Episode 400\tAverage Score: 22.25
Episode 500\tAverage Score: 21.77
Episode 600\tAverage Score: 24.04
Episode 700\tAverage Score: 24.32
Episode 800\tAverage Score: 22.83
Episode 900\tAverage Score: 25.31


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,22.53


[34m[1mwandb[0m: Agent Starting Run: ifpdag5l with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 21.79
Episode 200\tAverage Score: 22.12
Episode 300\tAverage Score: 23.12
Episode 400\tAverage Score: 23.49
Episode 500\tAverage Score: 22.07
Episode 600\tAverage Score: 24.01
Episode 700\tAverage Score: 23.55
Episode 800\tAverage Score: 26.78
Episode 900\tAverage Score: 24.74


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.61


[34m[1mwandb[0m: Agent Starting Run: 7jptrn0e with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 20.20
Episode 200\tAverage Score: 21.19
Episode 300\tAverage Score: 23.78
Episode 400\tAverage Score: 22.05
Episode 500\tAverage Score: 22.46
Episode 600\tAverage Score: 22.22
Episode 700\tAverage Score: 21.07
Episode 800\tAverage Score: 22.87
Episode 900\tAverage Score: 27.07


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,25.37


[34m[1mwandb[0m: Agent Starting Run: nkx7ahyl with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 18.54
Episode 200\tAverage Score: 19.06
Episode 300\tAverage Score: 20.88
Episode 400\tAverage Score: 20.25
Episode 500\tAverage Score: 20.69
Episode 600\tAverage Score: 18.04
Episode 700\tAverage Score: 20.40
Episode 800\tAverage Score: 20.08
Episode 900\tAverage Score: 20.38


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.09303737102228388, max=1.…

0,1
final_score,20.71


[34m[1mwandb[0m: Agent Starting Run: chm1ymu3 with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 20.40
Episode 200\tAverage Score: 20.37
Episode 300\tAverage Score: 22.03
Episode 400\tAverage Score: 23.06
Episode 500\tAverage Score: 21.56
Episode 600\tAverage Score: 21.11
Episode 700\tAverage Score: 20.44
Episode 800\tAverage Score: 21.34
Episode 900\tAverage Score: 21.05


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.32


[34m[1mwandb[0m: Agent Starting Run: jvz0t9jm with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 20.43
Episode 200\tAverage Score: 19.64
Episode 300\tAverage Score: 21.08
Episode 400\tAverage Score: 21.02
Episode 500\tAverage Score: 19.76
Episode 600\tAverage Score: 19.52
Episode 700\tAverage Score: 22.31
Episode 800\tAverage Score: 19.84
Episode 900\tAverage Score: 22.48


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,22.45


[34m[1mwandb[0m: Agent Starting Run: yagkkpue with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 22.85
Episode 200\tAverage Score: 30.35
Episode 300\tAverage Score: 40.71
Episode 400\tAverage Score: 63.58
Episode 500\tAverage Score: 126.69
Environment CartPole-v1 solved in 453 episodes!\tAverage Score: 197.03


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,197.03


[34m[1mwandb[0m: Agent Starting Run: jkndl53a with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 25.75
Episode 200\tAverage Score: 35.55
Episode 300\tAverage Score: 41.95
Episode 400\tAverage Score: 74.07
Episode 500\tAverage Score: 171.12
Environment CartPole-v1 solved in 420 episodes!\tAverage Score: 195.33


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.33


[34m[1mwandb[0m: Agent Starting Run: ki8dlzcg with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 22.16
Episode 200\tAverage Score: 30.92
Episode 300\tAverage Score: 49.63
Episode 400\tAverage Score: 75.88
Episode 500\tAverage Score: 162.56
Environment CartPole-v1 solved in 416 episodes!\tAverage Score: 195.29


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.29


[34m[1mwandb[0m: Agent Starting Run: 5h41fnb7 with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 20.23
Episode 200\tAverage Score: 20.67
Episode 300\tAverage Score: 25.01
Episode 400\tAverage Score: 23.95
Episode 500\tAverage Score: 24.65
Episode 600\tAverage Score: 25.58
Episode 700\tAverage Score: 29.29
Episode 800\tAverage Score: 27.47
Episode 900\tAverage Score: 29.25


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.0930454387790496, max=1.0…

0,1
final_score,32.09


[34m[1mwandb[0m: Agent Starting Run: zy13scfg with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 25.98
Episode 200\tAverage Score: 26.21
Episode 300\tAverage Score: 25.73
Episode 400\tAverage Score: 27.40
Episode 500\tAverage Score: 29.04
Episode 600\tAverage Score: 32.14
Episode 700\tAverage Score: 36.32
Episode 800\tAverage Score: 31.79
Episode 900\tAverage Score: 32.37


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,34.4


[34m[1mwandb[0m: Agent Starting Run: wb38a2j9 with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 17.91
Episode 200\tAverage Score: 20.29
Episode 300\tAverage Score: 21.49
Episode 400\tAverage Score: 21.63
Episode 500\tAverage Score: 22.66
Episode 600\tAverage Score: 24.24
Episode 700\tAverage Score: 23.94
Episode 800\tAverage Score: 25.97
Episode 900\tAverage Score: 28.41


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,29.23


[34m[1mwandb[0m: Agent Starting Run: 1gvr3knv with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 22.39
Episode 200\tAverage Score: 21.30
Episode 300\tAverage Score: 22.06
Episode 400\tAverage Score: 22.06
Episode 500\tAverage Score: 22.91
Episode 600\tAverage Score: 22.47
Episode 700\tAverage Score: 23.80
Episode 800\tAverage Score: 22.48
Episode 900\tAverage Score: 23.91


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.82


[34m[1mwandb[0m: Agent Starting Run: 0ycmeuk6 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 21.25
Episode 200\tAverage Score: 20.44
Episode 300\tAverage Score: 21.26
Episode 400\tAverage Score: 21.83
Episode 500\tAverage Score: 21.41
Episode 600\tAverage Score: 19.36
Episode 700\tAverage Score: 21.99
Episode 800\tAverage Score: 20.95
Episode 900\tAverage Score: 19.59


VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.09866543948458352, max=1.…

0,1
final_score,22.87


[34m[1mwandb[0m: Agent Starting Run: 40yxu6wv with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 22.59
Episode 200\tAverage Score: 20.59
Episode 300\tAverage Score: 19.57
Episode 400\tAverage Score: 20.21
Episode 500\tAverage Score: 19.82
Episode 600\tAverage Score: 21.77
Episode 700\tAverage Score: 21.43
Episode 800\tAverage Score: 22.27
Episode 900\tAverage Score: 21.57


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,21.09


[34m[1mwandb[0m: Agent Starting Run: 8c7ono0d with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 26.91
Episode 200\tAverage Score: 43.28
Episode 300\tAverage Score: 63.30
Episode 400\tAverage Score: 125.27
Environment CartPole-v1 solved in 343 episodes!\tAverage Score: 198.08


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,198.08


[34m[1mwandb[0m: Agent Starting Run: cip11u1b with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 24.82
Episode 200\tAverage Score: 46.09
Episode 300\tAverage Score: 101.66
Environment CartPole-v1 solved in 296 episodes!\tAverage Score: 196.39


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,196.39


[34m[1mwandb[0m: Agent Starting Run: m8gzc2pb with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 24.81
Episode 200\tAverage Score: 34.81
Episode 300\tAverage Score: 65.53
Environment CartPole-v1 solved in 297 episodes!\tAverage Score: 195.40


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.4


[34m[1mwandb[0m: Agent Starting Run: l2tkzri9 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100\tAverage Score: 21.10
Episode 200\tAverage Score: 22.89
Episode 300\tAverage Score: 22.72
Episode 400\tAverage Score: 23.74
Episode 500\tAverage Score: 27.24
Episode 600\tAverage Score: 28.88
Episode 700\tAverage Score: 28.65
Episode 800\tAverage Score: 38.22
Episode 900\tAverage Score: 33.18


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,34.39


[34m[1mwandb[0m: Agent Starting Run: qj37dvsa with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100\tAverage Score: 21.30
Episode 200\tAverage Score: 21.30
Episode 300\tAverage Score: 21.12
Episode 400\tAverage Score: 20.81
Episode 500\tAverage Score: 23.64
Episode 600\tAverage Score: 23.12
Episode 700\tAverage Score: 26.21
Episode 800\tAverage Score: 30.07
Episode 900\tAverage Score: 30.97


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,35.08


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1kz8moh0 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100\tAverage Score: 22.14
Episode 200\tAverage Score: 23.99
Episode 300\tAverage Score: 22.11
Episode 400\tAverage Score: 25.66
Episode 500\tAverage Score: 27.67
Episode 600\tAverage Score: 29.48
Episode 700\tAverage Score: 37.31
Episode 800\tAverage Score: 33.07
Episode 900\tAverage Score: 39.55


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.0934262080975185, max=1.0…

0,1
final_score,40.45


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [8]:
# wandb agent <sweep_id>