In [1]:
!pip install wandb
import wandb
# Replace with your actual API key
api_key = "8f58df9a66485e9ea9149b8b599cb14eb71832dc"

# Login to Weights & Biases
wandb.login(key=api_key)

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.44.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)
import base64
import io  # For visualization
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

env = gym.make('CartPole-v1')
env.seed(0)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

wandb.init(project="cartpole-reinforce", config={  # Initialize wandb first
    "hidden_size": 32,  # Default values
    "n_episodes": 1000,
    "max_t": 1000,
    "gamma": 0.99,
    "print_every": 100,
    "lr": 1e-2,
})

config = {
    "hidden_size": wandb.config.hidden_size,  # Access wandb.config after initialization
    "n_episodes": wandb.config.n_episodes,
    "max_t": wandb.config.max_t,
    "gamma": wandb.config.gamma,
    "print_every": wandb.config.print_every,
    "lr": wandb.config.lr,
}

class Policy(nn.Module):
    def __init__(self, state_size=4, action_size=2, hidden_size=config["hidden_size"]):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        # we just consider 1 dimensional probability of action
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)

def reinforce(policy, optimizer, n_episodes=2000, max_t=1000, gamma=1.0, print_every=100):
    scores_deque = deque(maxlen=100)
    scores = []
    for e in range(1, n_episodes):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        # Collect trajectory
        for t in range(max_t):
            # Sample the action from current policy
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        # Calculate total expected reward
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Recalculate the total reward applying discounted factor
        discounts = [gamma ** i for i in range(len(rewards) + 1)]
        R = sum([a * b for a,b in zip(discounts, rewards)])

        # Calculate the loss
        policy_loss = []
        for log_prob in saved_log_probs:
            # Note that we are using Gradient Ascent, not Descent. So we need to calculate it with negative rewards.
            policy_loss.append(-log_prob * R)
        # After that, we concatenate whole policy loss in 0th dimension
        policy_loss = torch.cat(policy_loss).sum()

        # Backpropagation
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
        if np.mean(scores_deque) >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(e - 100, np.mean(scores_deque)))
            break
    return scores

############################################For Regret minimization###################################

# def reinforce(policy, optimizer, n_episodes=config["n_episodes"], max_t=config["max_t"], gamma=config["gamma"], print_every=config["print_every"]):
#     scores_deque = deque(maxlen=100)
#     scores = []
#     for e in range(1, n_episodes):
#         # ... (the rest of the code remains the same)
#         if e % print_every == 0:
#             print('Episode {}\\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
#             regret = 500 - np.mean(scores_deque)  # Calculate regret
#             wandb.log({"Regret": regret})
#         if np.mean(scores_deque) >= 495.0:  # Adjust the threshold for early stopping
#             print('Environment solved in {:d} episodes!\\tAverage Score: {:.2f}'.format(e - 100, np.mean(scores_deque)))
#             break
#     wandb.run.summary["final_regret"] = 500 - np.mean(scores_deque)  # Log final regret
#     return scores




#############################################################################

wandb.init(project="cartpole-reinforce", config=config)  # Initialize wandb

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=config["lr"])
scores = reinforce(policy, optimizer)
wandb.finish()  # Finish wandb run

  deprecation(
  deprecation(
  deprecation(
[34m[1mwandb[0m: Currently logged in as: [33mbhavik-160990105023[0m. Use [1m`wandb login --relogin`[0m to force relogin


cpu
observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action space: Discrete(2)


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.9396085740913327, max=1.0…

  if not isinstance(terminated, (bool, np.bool8)):


Episode 100	Average Score: 40.21
Episode 200	Average Score: 97.11
Episode 300	Average Score: 44.21
Episode 400	Average Score: 40.40
Episode 500	Average Score: 39.88
Episode 600	Average Score: 46.83
Episode 700	Average Score: 69.05
Episode 800	Average Score: 86.48
Episode 900	Average Score: 118.85
Episode 1000	Average Score: 174.49
Environment solved in 908 episodes!	Average Score: 195.61


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [3]:
sweep_config = {
    'method': 'grid',
    'metric': {
      'name': 'Average Reward',
      'goal': 'maximize'
    },
    'parameters': {
        'hidden_size': {
            'values': [32,64,128]
        },
        'max_t': {
            'values': [500, 1000, 1500]
        },
        'lr': {
            'values': [1e-2, 1e-3,1e-4]
        }
    }
}

  and should_run_async(code)


In [4]:
# import wandb

# def train(config=None):
#     with wandb.init(config=config):
#         config = wandb.config

#         # Define the hyperparameters
#         hidden_size = config.hidden_size
#         n_episodes = config.n_episodes
#         max_t = config.max_t
#         lr = config.lr

#         # Initialize the policy and optimizer
#         policy = Policy(hidden_size=hidden_size).to(device)
#         optimizer = optim.Adam(policy.parameters(), lr=lr)

#         def reinforce(policy, optimizer, n_episodes=n_episodes, max_t=max_t, gamma=1.0, print_every=100):
#             scores_deque = deque(maxlen=100)
#             scores = []

#             for e in range(1, n_episodes+1):
#                 saved_log_probs = []
#                 rewards = []
#                 state = env.reset()

#                 # Collect trajectory
#                 for t in range(max_t):
#                     # Sample the action from current policy
#                     action, log_prob = policy.act(state)
#                     saved_log_probs.append(log_prob)
#                     state, reward, done, _ = env.step(action)
#                     rewards.append(reward)
#                     if done:
#                         break

#                 # Calculate the total regret
#                 scores_deque.append(sum(rewards))
#                 scores.append(sum(rewards))

#                 # Recalculate the total regret applying discounted factor
#                 discounts = [gamma ** i for i in range(len(rewards) + 1)]
#                 R = sum([a * b for a, b in zip(discounts, rewards)])

#                 # Calculate the loss
#                 policy_loss = []
#                 for log_prob in saved_log_probs:
#                     # Minimizing the regret instead of maximizing the reward
#                     policy_loss.append(log_prob * R)
#                 policy_loss = torch.cat(policy_loss).sum()

#                 # Backpropagation
#                 optimizer.zero_grad()
#                 policy_loss.backward()
#                 optimizer.step()

#                 if e % print_every == 0:
#                     print(f'Episode {e}\tAverage Regret: {np.mean(scores_deque):.2f}')

#                 wandb.log({"regret": np.mean(scores_deque)})

#             return scores

#         scores = reinforce(policy, optimizer)
#         wandb.run.summary["best_regret"] = np.min(scores)

# # Run the sweep
# sweep_id = wandb.sweep(sweep_config, entity="bhavik-160990105023", project="assignment2")
# wandb.agent(sweep_id, train, count=20)

In [5]:
# sweep_config = {
#     'method': 'random',
#     'metric': {
#       'name': 'final_regret',
#       'goal': 'minimize'
#     },
#     'parameters': {
#         'hidden_size':{
#             'values': [1000, 2000, 3000]
#         },
#         'n_episodes': {
#             'values': [0, 0.0005, 0.005]
#         },
#         'max_t': {
#             'values': [500, 1000, 1500]
#         },
#         'lr': {
#             'values': [1e-2, 1e-3, 1e-4]
#         }
#     }
# }

In [6]:
# Initialize a new sweep
sweep_id = wandb.sweep(sweep_config, entity="bhavik-160990105023", project="assignment2")

Create sweep with ID: ilvbh3p3
Sweep URL: https://wandb.ai/bhavik-160990105023/assignment2/sweeps/ilvbh3p3


In [7]:
def train(config=None):
    # Set default values for hyperparameters
    default_config = {
        "hidden_size": 32,
        "n_episodes": 1000,
        "max_t": 1000,
        "gamma": 1.0,
        "print_every": 100,
        "lr": 1e-2,
    }

    # Initialize a new wandb run
    run = wandb.init(config=config, reinit=True)

    # If config is None, use the default values
    if config is None:
        run.config.update(default_config, allow_val_change=True)
        config = run.config

    # Initialize the environment and seed
    env = gym.make('CartPole-v1')
    env.seed(0)

    # Create the policy network with the specified hyperparameters
    policy = Policy(state_size=4, action_size=2, hidden_size=config.hidden_size).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=config.lr)

    # Run the REINFORCE algorithm with the specified hyperparameters
    scores = reinforce(policy, optimizer, n_episodes=config.n_episodes, max_t=config.max_t, gamma=config.gamma, print_every=config.print_every)

    # Log the final score as a summary metric
    run.summary["final_score"] = np.mean(scores[-100:])

    # Finish the wandb run
    run.finish()

    # Return any necessary values or metrics
    return np.mean(scores[-100:])

In [8]:
sweep_id = wandb.sweep(sweep_config, project="cartpole-reinforce WB")
wandb.agent(sweep_id, train, count=50)

Create sweep with ID: e5bfvrli
Sweep URL: https://wandb.ai/bhavik-160990105023/cartpole-reinforce%20WB/sweeps/e5bfvrli


[34m[1mwandb[0m: Agent Starting Run: ai7l0bkq with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 500


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112777466666696, max=1.0…

  deprecation(
  deprecation(
  deprecation(


Episode 100	Average Score: 33.13
Episode 200	Average Score: 34.83
Episode 300	Average Score: 59.76
Episode 400	Average Score: 39.80
Episode 500	Average Score: 57.38
Episode 600	Average Score: 60.30
Episode 700	Average Score: 64.93
Episode 800	Average Score: 52.28
Episode 900	Average Score: 41.01


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,54.53


[34m[1mwandb[0m: Agent Starting Run: 7xrqgsxi with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 10.02
Episode 200	Average Score: 29.97
Episode 300	Average Score: 62.18
Episode 400	Average Score: 61.25
Episode 500	Average Score: 74.59
Environment solved in 474 episodes!	Average Score: 196.34


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,196.34


[34m[1mwandb[0m: Agent Starting Run: 3mgqekf1 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 15.39
Episode 200	Average Score: 34.76
Episode 300	Average Score: 33.48
Episode 400	Average Score: 30.43
Episode 500	Average Score: 46.52
Episode 600	Average Score: 77.46
Episode 700	Average Score: 71.66
Episode 800	Average Score: 90.55
Episode 900	Average Score: 70.74


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,69.72


[34m[1mwandb[0m: Agent Starting Run: ducmhux5 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 20.85
Episode 200	Average Score: 26.81
Episode 300	Average Score: 26.28
Episode 400	Average Score: 25.62
Episode 500	Average Score: 30.96
Episode 600	Average Score: 35.48
Episode 700	Average Score: 42.17
Episode 800	Average Score: 49.09
Episode 900	Average Score: 53.72


VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.10014768321949419, max=1.…

0,1
final_score,56.19


[34m[1mwandb[0m: Agent Starting Run: t14vsiv6 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 19.49
Episode 200	Average Score: 19.67
Episode 300	Average Score: 24.87
Episode 400	Average Score: 25.44
Episode 500	Average Score: 33.57
Episode 600	Average Score: 32.99
Episode 700	Average Score: 39.24
Episode 800	Average Score: 48.96
Episode 900	Average Score: 51.44


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,53.75


[34m[1mwandb[0m: Agent Starting Run: nz5jwsp6 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 26.46
Episode 200	Average Score: 26.14
Episode 300	Average Score: 27.82
Episode 400	Average Score: 28.79
Episode 500	Average Score: 33.11
Episode 600	Average Score: 37.44
Episode 700	Average Score: 44.53
Episode 800	Average Score: 53.87
Episode 900	Average Score: 60.36


VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded\r'), FloatProgress(value=0.10013844023996309, max=1.…

0,1
final_score,75.23


[34m[1mwandb[0m: Agent Starting Run: i48e6idm with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 21.21
Episode 200	Average Score: 25.69
Episode 300	Average Score: 21.92
Episode 400	Average Score: 25.01
Episode 500	Average Score: 23.03
Episode 600	Average Score: 22.84
Episode 700	Average Score: 23.75
Episode 800	Average Score: 24.48
Episode 900	Average Score: 23.78


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,24.11


[34m[1mwandb[0m: Agent Starting Run: 14mcsn04 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 24.86
Episode 200	Average Score: 21.45
Episode 300	Average Score: 24.01
Episode 400	Average Score: 26.40
Episode 500	Average Score: 28.41
Episode 600	Average Score: 24.95
Episode 700	Average Score: 28.15
Episode 800	Average Score: 26.64
Episode 900	Average Score: 25.25


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,28.0


[34m[1mwandb[0m: Agent Starting Run: 8g4pnnh4 with config:
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 20.71
Episode 200	Average Score: 21.57
Episode 300	Average Score: 22.60
Episode 400	Average Score: 21.64
Episode 500	Average Score: 21.54
Episode 600	Average Score: 23.75
Episode 700	Average Score: 21.61
Episode 800	Average Score: 23.76
Episode 900	Average Score: 23.85


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.26


[34m[1mwandb[0m: Agent Starting Run: ekaig85c with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 41.39
Episode 200	Average Score: 49.83
Episode 300	Average Score: 84.34
Episode 400	Average Score: 85.80
Episode 500	Average Score: 91.93
Episode 600	Average Score: 141.22
Episode 700	Average Score: 70.74
Episode 800	Average Score: 38.86
Episode 900	Average Score: 33.83


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,36.4


[34m[1mwandb[0m: Agent Starting Run: onb8pd3l with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 15.68
Episode 200	Average Score: 26.34
Episode 300	Average Score: 35.23
Episode 400	Average Score: 60.55
Episode 500	Average Score: 51.47
Episode 600	Average Score: 39.21
Episode 700	Average Score: 50.85
Episode 800	Average Score: 74.84
Episode 900	Average Score: 75.76


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,82.82


[34m[1mwandb[0m: Agent Starting Run: 486qd720 with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 10.06
Episode 200	Average Score: 10.14
Episode 300	Average Score: 10.13
Episode 400	Average Score: 9.40
Episode 500	Average Score: 9.45
Episode 600	Average Score: 9.40
Episode 700	Average Score: 9.32
Episode 800	Average Score: 9.50
Episode 900	Average Score: 9.56


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,9.36


[34m[1mwandb[0m: Agent Starting Run: xgobnypj with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 24.71
Episode 200	Average Score: 29.81
Episode 300	Average Score: 39.38
Episode 400	Average Score: 41.46
Episode 500	Average Score: 42.66
Episode 600	Average Score: 54.16
Episode 700	Average Score: 68.24
Episode 800	Average Score: 64.95
Episode 900	Average Score: 90.19


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,136.24


[34m[1mwandb[0m: Agent Starting Run: 70ulwcry with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 25.42
Episode 200	Average Score: 26.12
Episode 300	Average Score: 36.62
Episode 400	Average Score: 45.53
Episode 500	Average Score: 81.95
Episode 600	Average Score: 88.04
Episode 700	Average Score: 121.21
Episode 800	Average Score: 112.05
Episode 900	Average Score: 118.36


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,184.83


[34m[1mwandb[0m: Agent Starting Run: 4j3zzlxn with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 24.60
Episode 200	Average Score: 38.03
Episode 300	Average Score: 51.18
Episode 400	Average Score: 66.69
Episode 500	Average Score: 86.96
Episode 600	Average Score: 137.32
Episode 700	Average Score: 164.99
Environment solved in 634 episodes!	Average Score: 195.32


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.32


[34m[1mwandb[0m: Agent Starting Run: 9aur4gmv with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 21.57
Episode 200	Average Score: 24.62
Episode 300	Average Score: 21.50
Episode 400	Average Score: 21.87
Episode 500	Average Score: 25.56
Episode 600	Average Score: 24.31
Episode 700	Average Score: 27.56
Episode 800	Average Score: 26.36
Episode 900	Average Score: 24.25


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,27.72


[34m[1mwandb[0m: Agent Starting Run: oatspauy with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 19.06
Episode 200	Average Score: 19.17
Episode 300	Average Score: 19.09
Episode 400	Average Score: 18.32
Episode 500	Average Score: 18.91
Episode 600	Average Score: 19.94
Episode 700	Average Score: 19.97
Episode 800	Average Score: 22.01
Episode 900	Average Score: 20.87


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,22.28


[34m[1mwandb[0m: Agent Starting Run: 0mrxd7id with config:
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 19.34
Episode 200	Average Score: 21.40
Episode 300	Average Score: 21.27
Episode 400	Average Score: 23.56
Episode 500	Average Score: 20.22
Episode 600	Average Score: 21.41
Episode 700	Average Score: 21.79
Episode 800	Average Score: 24.03
Episode 900	Average Score: 23.51


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,23.57


[34m[1mwandb[0m: Agent Starting Run: wx0e66n2 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 13.00
Episode 200	Average Score: 29.89
Episode 300	Average Score: 48.51
Episode 400	Average Score: 120.59
Episode 500	Average Score: 105.74
Episode 600	Average Score: 62.64
Episode 700	Average Score: 58.75
Episode 800	Average Score: 62.79
Episode 900	Average Score: 79.91
Environment solved in 847 episodes!	Average Score: 195.25


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.25


[34m[1mwandb[0m: Agent Starting Run: 3to1qiik with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 33.28
Episode 200	Average Score: 35.70
Episode 300	Average Score: 16.90
Episode 400	Average Score: 11.82
Episode 500	Average Score: 9.91
Episode 600	Average Score: 9.32
Episode 700	Average Score: 9.29
Episode 800	Average Score: 9.39
Episode 900	Average Score: 9.28


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,9.35


[34m[1mwandb[0m: Agent Starting Run: z9palkrn with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 15.72
Episode 200	Average Score: 31.66
Episode 300	Average Score: 54.50
Episode 400	Average Score: 42.05
Episode 500	Average Score: 71.79
Environment solved in 496 episodes!	Average Score: 195.43


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.43


[34m[1mwandb[0m: Agent Starting Run: s3i3vuv1 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 29.21
Episode 200	Average Score: 33.69
Episode 300	Average Score: 46.78
Episode 400	Average Score: 58.74
Episode 500	Average Score: 64.73
Episode 600	Average Score: 90.82
Episode 700	Average Score: 127.93
Episode 800	Average Score: 89.89
Environment solved in 770 episodes!	Average Score: 196.62


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,196.62


[34m[1mwandb[0m: Agent Starting Run: bcntg4xo with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 19.72
Episode 200	Average Score: 41.33
Episode 300	Average Score: 49.73
Episode 400	Average Score: 72.58
Episode 500	Average Score: 74.76
Episode 600	Average Score: 51.24
Episode 700	Average Score: 67.47
Episode 800	Average Score: 83.99
Episode 900	Average Score: 154.48


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,155.13


[34m[1mwandb[0m: Agent Starting Run: 5fvp55eh with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 25.99
Episode 200	Average Score: 39.19
Episode 300	Average Score: 69.59
Episode 400	Average Score: 96.30
Episode 500	Average Score: 114.43
Episode 600	Average Score: 186.95
Environment solved in 518 episodes!	Average Score: 195.53


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,195.53


[34m[1mwandb[0m: Agent Starting Run: hzj32iar with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 500




Episode 100	Average Score: 24.27
Episode 200	Average Score: 23.19
Episode 300	Average Score: 26.37
Episode 400	Average Score: 26.05
Episode 500	Average Score: 28.37
Episode 600	Average Score: 30.67
Episode 700	Average Score: 27.80
Episode 800	Average Score: 34.23
Episode 900	Average Score: 29.68


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,33.01


[34m[1mwandb[0m: Agent Starting Run: q2j40qox with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1000




Episode 100	Average Score: 22.30
Episode 200	Average Score: 22.55
Episode 300	Average Score: 24.31
Episode 400	Average Score: 22.52
Episode 500	Average Score: 25.74
Episode 600	Average Score: 29.26
Episode 700	Average Score: 28.02
Episode 800	Average Score: 28.24
Episode 900	Average Score: 30.45


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
final_score,33.39


[34m[1mwandb[0m: Agent Starting Run: p4n634y7 with config:
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	max_t: 1500




Episode 100	Average Score: 26.79
Episode 200	Average Score: 29.19
Episode 300	Average Score: 25.27
Episode 400	Average Score: 26.25
Episode 500	Average Score: 27.32
Episode 600	Average Score: 29.61
Episode 700	Average Score: 29.04
Episode 800	Average Score: 33.03
Episode 900	Average Score: 33.62


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.09445120890589667, max=1.…

0,1
final_score,38.03


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [9]:
# wandb agent <sweep_id>