<a href="https://colab.research.google.com/github/sejal-godbole/MLOps/blob/main/wandb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Experiment Tracking Using Weights & Biases

In [1]:
# Imports used throughout the notebook (the W&B login itself happens in the next cell)
import wandb
import random
import math

In [2]:
# Authenticate this session with W&B. As the output below shows, when no
# credentials are stored yet it prompts for an API key and saves it to ~/.netrc.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msejalgodbole02[0m ([33msejalgodbole02-viit-pune[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Simulate and track a machine learning experiment with W&B

In [3]:
import random
import math

# Launch 5 simulated experiments
total_runs = 5

# Hyperparameters and run metadata tracked with every run. Defined once so the
# value logged to W&B and the value driving the simulated loop cannot drift apart.
run_config = {
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
}

for run in range(total_runs):
    # 1. Start a new run to track this script
    wandb.init(
        # Set the project where this run will be logged
        project="basic-intro",
        # We pass a run name (otherwise it'll be randomly assigned, like sunshine-lollypop-10)
        name=f"experiment_{run}",
        # Track hyperparameters and run metadata
        config=run_config,
    )

    # This simple block simulates a training loop logging metrics.
    # `offset` is a per-run random handicap so the runs end at different levels.
    epochs = run_config["epochs"]
    offset = random.random() / 5
    # Starts at 2, so only `epochs - 2` points are logged per run.
    for epoch in range(2, epochs):
        acc = 1 - 2 ** -epoch - random.random() / epoch - offset
        loss = 2 ** -epoch + random.random() / epoch + offset

        # 2. Log metrics from your script to W&B
        wandb.log({"acc": acc, "loss": loss})

    # Mark the run as finished
    wandb.finish()

0,1
acc,▁▃▃▇▇▇██
loss,█▅▃▁▂▁▁▁

0,1
acc,0.72543
loss,0.21791


0,1
acc,▁▁▇▆▅███
loss,█▃▅▅▂▁▂▁

0,1
acc,0.7654
loss,0.27176


0,1
acc,▃▁▇▅█▆▆█
loss,█▆▃▁▃▃▂▁

0,1
acc,0.80041
loss,0.17615


0,1
acc,▁▃█▇███▇
loss,█▃▄▁▂▂▂▁

0,1
acc,0.77294
loss,0.12302


0,1
acc,▁▅▆▆▇█▇█
loss,█▇▅▅▄▃▂▁

0,1
acc,0.98536
loss,0.02747


In [4]:
#@title
import torch, torchvision
import torch.nn as nn
from torchvision.datasets import MNIST
import torchvision.transforms as T

# Remove the yann.lecun.com mirror from the MNIST download list so that only the
# remaining mirrors are tried when the dataset is fetched.
MNIST.mirrors = list(filter(lambda url: "http://yann.lecun.com/" not in url, MNIST.mirrors))

# Use the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

def get_dataloader(is_train, batch_size, slice=5):
    """Build a DataLoader over an evenly strided subset of MNIST.

    Args:
        is_train: Truthy to load the training split (batches are shuffled);
            falsy to load the test split (batches keep dataset order).
        batch_size: Number of samples per batch.
        slice: Subsampling stride; every `slice`-th sample is kept, starting
            at index 0.

    Returns:
        A torch.utils.data.DataLoader over the subsampled split.
    """
    # download=True fetches the dataset into the current directory when needed.
    full_dataset = MNIST(root=".", train=is_train, transform=T.ToTensor(), download=True)
    sub_dataset = torch.utils.data.Subset(full_dataset, indices=range(0, len(full_dataset), slice))
    loader = torch.utils.data.DataLoader(dataset=sub_dataset,
                                         batch_size=batch_size,
                                         shuffle=bool(is_train),
                                         # Pinned host memory only helps host-to-GPU copies;
                                         # requesting it without CUDA is useless and can warn.
                                         pin_memory=torch.cuda.is_available(),
                                         num_workers=2)
    return loader

def get_model(dropout):
    """Build a small MLP classifier and move it to the notebook-level `device`.

    Layer stack: Flatten -> Linear(784, 256) -> BatchNorm1d(256) -> ReLU
    -> Dropout(dropout) -> Linear(256, 10).
    """
    layers = [
        nn.Flatten(),
        nn.Linear(28*28, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(256, 10),
    ]
    return nn.Sequential(*layers).to(device)

def validate_model(model, valid_dl, loss_func, log_images=False, batch_idx=0):
    """Evaluate `model` on `valid_dl` and optionally log one batch as a wandb.Table.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        valid_dl: Iterable of (images, labels) batches exposing a `.dataset`.
        loss_func: Callable `(outputs, labels) -> loss`; the result is weighted
            by the batch size, so it is presumably a per-batch mean.
        log_images: When true, batch number `batch_idx` is sent to `log_image_table`.
        batch_idx: Index of the batch to log, so the same batch is logged each time.

    Returns:
        Tuple `(loss_sum / n, n_correct / n)` with `n = len(valid_dl.dataset)`.
    """
    model.eval()
    total_loss = 0.
    with torch.inference_mode():
        n_correct = 0
        for batch_no, (images, labels) in enumerate(valid_dl):
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass; weight the batch loss by the number of samples in the batch
            logits = model(images)
            batch_loss = loss_func(logits, labels)
            total_loss += batch_loss * labels.size(0)

            # Predicted class is the index of the largest output along dim 1
            predicted = torch.max(logits.data, 1)[1]
            hits = (predicted == labels).sum().item()
            n_correct += hits

            # Log one batch of images to the dashboard, always the same batch_idx.
            if log_images and batch_no == batch_idx:
                log_image_table(images, predicted, labels, logits.softmax(dim=1))
    n_samples = len(valid_dl.dataset)
    return total_loss / n_samples, n_correct / n_samples

### Create a table to compare the predicted values versus the true value


In [5]:
def log_image_table(images, predicted, labels, probs):
    """Log a wandb.Table with one row per sample: (image, pred, target, scores).

    Args:
        images: Batch of images; index 0 of each image is what gets logged
            (assumes a channel-first layout — TODO confirm for non-MNIST data).
        predicted: Predicted class for each sample.
        labels: Target class for each sample.
        probs: Per-sample class scores; one `score_<i>` column is created for
            each entry along dim 1.
    """
    # Derive the number of score columns from the data instead of assuming 10
    # classes, so the column count always matches the row width.
    n_classes = probs.shape[1]
    columns = ["image", "pred", "target"] + [f"score_{i}" for i in range(n_classes)]
    table = wandb.Table(columns=columns)
    for img, pred, targ, prob in zip(images.to("cpu"), predicted.to("cpu"), labels.to("cpu"), probs.to("cpu")):
        # Scale by 255 before handing the array to wandb.Image
        # (presumably pixel values are in [0, 1] from ToTensor — verify against caller).
        table.add_data(wandb.Image(img[0].numpy()*255), pred, targ, *prob.numpy())
    # commit=False defers the table so it is sent together with the next wandb.log call.
    wandb.log({"predictions_table":table}, commit=False)

### Train your model and upload checkpoints


In [8]:
# Launch 3 experiments, trying different dropout rates

# Validation accuracy below this value fires a W&B alert.
# NOTE(review): 0.9 is a demo cut-off chosen for this notebook — tune as needed.
VAL_ACC_ALERT_THRESHOLD = 0.9

for _ in range(3):
    # Initialise a wandb run; dropout is sampled at random for each run
    wandb.init(
        project="pytorch-intro",
        config={
            "epochs": 5,
            "batch_size": 128,
            "lr": 1e-3,
            "dropout": random.uniform(0.01, 0.80),
            })

    # Copy your config
    config = wandb.config

    # Get the data (validation uses a batch twice as large as training)
    train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
    valid_dl = get_dataloader(is_train=False, batch_size=2*config.batch_size)
    n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

    # A simple MLP model
    model = get_model(config.dropout)

    # Make the loss and optimizer
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    # Training
    example_ct = 0
    for epoch in range(config.epochs):
        # validate_model() leaves the model in eval mode, so re-enable training mode each epoch
        model.train()
        for step, (images, labels) in enumerate(train_dl):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            train_loss = loss_func(outputs, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            example_ct += len(images)
            metrics = {"train/train_loss": train_loss,
                       "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch,
                       "train/example_ct": example_ct}

            # The last step of an epoch is held back and logged together with
            # the validation metrics below.
            if step + 1 < n_steps_per_epoch:
                # Log train metrics to wandb
                wandb.log(metrics)

        # Only the final epoch also logs a table of predictions
        val_loss, accuracy = validate_model(model, valid_dl, loss_func, log_images=(epoch==(config.epochs-1)))

        # Log train and validation metrics to wandb
        val_metrics = {"val/val_loss": val_loss,
                       "val/val_accuracy": accuracy}
        wandb.log({**metrics, **val_metrics})

        # Alert only when accuracy really is below the threshold
        # (previously the alert fired unconditionally after every epoch).
        if accuracy < VAL_ACC_ALERT_THRESHOLD:
            wandb.alert(
                title="Low accuracy",
                text=f"Accuracy {accuracy:.4f} after epoch {epoch+1} is below the acceptable threshold, {VAL_ACC_ALERT_THRESHOLD}",
            )

        # Save the model checkpoint to wandb; the alias records the epoch and dropout rate.
        # torch.save(model, ...) stores the whole module object, not just a state_dict.
        torch.save(model, "my_model.pt")
        wandb.log_model("./my_model.pt", "my_mnist_model", aliases=[f"epoch-{epoch+1}_dropout-{round(wandb.config.dropout, 4)}"])

        print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:.3f}, Accuracy: {accuracy:.2f}")

    # If you had a test set, this is how you could log it as a Summary metric
    # (0.8 is a hard-coded placeholder, not a measured value)
    wandb.summary['test_accuracy'] = 0.8

    # Close your wandb run
    wandb.finish()

Epoch: 1, Train Loss: 0.397, Valid Loss: 0.314992, Accuracy: 0.91
Epoch: 2, Train Loss: 0.288, Valid Loss: 0.252926, Accuracy: 0.93
Epoch: 3, Train Loss: 0.337, Valid Loss: 0.226035, Accuracy: 0.93
Epoch: 4, Train Loss: 0.253, Valid Loss: 0.206577, Accuracy: 0.94
Epoch: 5, Train Loss: 0.249, Valid Loss: 0.200407, Accuracy: 0.94


0,1
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇███
train/example_ct,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇▇▇▇████
train/train_loss,█▅▄▄▃▄▄▃▃▂▃▂▂▂▂▁▂▂▂▁▂▂▃▁▂▂▁▂▁▁▂▁▂▂▂▂▁▂▂▁
val/val_accuracy,▁▅▆▇█
val/val_loss,█▄▃▁▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/train_loss,0.2492
val/val_accuracy,0.939
val/val_loss,0.20041


Epoch: 1, Train Loss: 0.319, Valid Loss: 0.292666, Accuracy: 0.92
Epoch: 2, Train Loss: 0.150, Valid Loss: 0.216762, Accuracy: 0.94
Epoch: 3, Train Loss: 0.236, Valid Loss: 0.196579, Accuracy: 0.94
Epoch: 4, Train Loss: 0.143, Valid Loss: 0.180684, Accuracy: 0.95
Epoch: 5, Train Loss: 0.123, Valid Loss: 0.176554, Accuracy: 0.95


0,1
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇█
train/example_ct,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
train/train_loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁▆▇██
val/val_loss,█▃▂▁▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/train_loss,0.12294
val/val_accuracy,0.9465
val/val_loss,0.17655


Epoch: 1, Train Loss: 0.220, Valid Loss: 0.312997, Accuracy: 0.92
Epoch: 2, Train Loss: 0.355, Valid Loss: 0.252474, Accuracy: 0.93
Epoch: 3, Train Loss: 0.200, Valid Loss: 0.226625, Accuracy: 0.93
Epoch: 4, Train Loss: 0.183, Valid Loss: 0.215623, Accuracy: 0.93
Epoch: 5, Train Loss: 0.177, Valid Loss: 0.197125, Accuracy: 0.94


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇█████
train/example_ct,▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/train_loss,█▅▃▃▂▂▂▂▂▁▁▂▁▁▂▂▁▂▂▂▁▁▂▂▁▂▁▁▁▂▂▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁▅▅▆█
val/val_loss,█▄▃▂▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/train_loss,0.17686
val/val_accuracy,0.9385
val/val_loss,0.19713


You have now trained your first model using W&B. Click one of the run links above to view your metrics, and find your saved model checkpoints in the Artifacts tab of the W&B App UI.

In [7]:
# Start a wandb run
wandb.init(project="pytorch-intro")

# Simulating a model training loop
acc_threshold = 0.3
for training_step in range(1000):

    # Generate a random number for accuracy.
    # NOTE: the sum of two uniform draws spans [0, 2), so the simulated value can
    # exceed 1; summing makes values at or below the threshold comparatively rare.
    accuracy = round(random.random() + random.random(), 3)
    print(f'Accuracy is: {accuracy}, {acc_threshold}')

    # Log accuracy to wandb
    wandb.log({"Accuracy": accuracy})

    # If the accuracy is at or below the threshold, fire a W&B Alert and stop the loop
    if accuracy <= acc_threshold:
        # Send the wandb Alert
        wandb.alert(
            title='Low Accuracy',
            text=f'Accuracy {accuracy} at step {training_step} is below the acceptable threshold, {acc_threshold}',
        )
        print('Alert triggered')
        break

# Mark the run as finished (useful in Jupyter notebooks)
wandb.finish()

Accuracy is: 0.903, 0.3
Accuracy is: 0.461, 0.3
Accuracy is: 0.089, 0.3
Alert triggered


0,1
Accuracy,█▄▁

0,1
Accuracy,0.089
