In [None]:
# Install necessary libraries
!pip install datasets transformers huggingface_hub evaluate optuna
!apt-get install git-lfs

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, 

In [None]:
import os
import subprocess

import evaluate
import numpy as np
import pandas as pd
import torch
import wandb
import yaml
from datasets import Dataset
from huggingface_hub import login as hf_login
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    RobertaConfig,
    Trainer,
    TrainingArguments,
)

We set up automatic login for Hugging Face and Weights & Biases, reading the access token and API key from environment variables.

In [None]:
# Auto-login for Hugging Face and Weights & Biases.
# Both credentials are read from the environment; each lookup yields None when
# the variable is not set.
hf_token = os.getenv("HF_TOKEN")
wandb_api_key = os.getenv("WANDB_API_KEY")

# Authenticate against the Hugging Face Hub, then opt out of its telemetry.
# NOTE(review): huggingface_hub may read this flag at import time - confirm it
# still takes effect when it is set after the import.
hf_login(token=hf_token)
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Last expression of the cell, so the login result is displayed as its output.
wandb.login(key=wandb_api_key)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mso-907[0m ([33mso-907-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

Let's load the training and evaluation datasets and tokenize the comments.

In [None]:
# Read the raw CSV splits into DataFrames. They get their own names so the
# `*_dataset` variables only ever hold `Dataset` objects.
train_df = pd.read_csv("train.csv")
eval_df = pd.read_csv("test.csv")

# Turn into dataset objects
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Instantiate tokenizer.
# `num_labels` is a model-configuration argument and has no meaning for a
# tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
  """Tokenize a batch of comments with the notebook's RoBERTa tokenizer.

  Args:
    examples: Batch of rows as handed over by `Dataset.map(..., batched=True)`;
      the text is read from its "Comment" column.

  Returns:
    The tokenizer output for the batch, truncated to at most 512 tokens per
    comment and left unpadded.
  """
  # No padding here: `DataCollatorWithPadding` pads each batch to its own
  # longest sequence, which avoids storing and processing 512 tokens for every
  # short comment.
  return tokenizer(examples["Comment"],
                   truncation=True,
                   max_length=512
                   )

# Tokenize both splits, handing rows to `tokenize_function` in batches.
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
eval_dataset = eval_dataset.map(function=tokenize_function, batched=True)

# Collator that pads the examples of each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/14299 [00:00<?, ? examples/s]

Map:   0%|          | 0/3575 [00:00<?, ? examples/s]

In [None]:
# Metrics that were already loaded, keyed by name, so that repeated evaluation
# calls do not reload them.
_METRIC_CACHE = {}


def _get_metric(name):
  """Return the `evaluate` metric called `name`, loading it on first use only."""
  if name not in _METRIC_CACHE:
    _METRIC_CACHE[name] = evaluate.load(name)
  return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
  """Compute accuracy and macro-averaged F1 from an evaluation prediction.

  Args:
    eval_pred: Pair of `(logits, labels)`. `logits` holds one score per class
      along the last axis; if it is a tuple, its first element is used.

  Returns:
    Dict with the keys "accuracy" and "f1".
  """
  logits, labels = eval_pred
  # Some models return extra outputs next to the logits; in that case the
  # predictions arrive as a tuple with the logits first.
  if isinstance(logits, tuple):
    logits = logits[0]

  # The predicted class is the index of the highest score.
  predictions = np.argmax(logits, axis=-1)
  accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
  f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="macro")["f1"]

  return {"accuracy": accuracy, "f1": f1}

Now we load the configuration of the best model, which we previously saved to Weights & Biases, and instantiate a new model with the same configuration.

In [None]:
def unwrap_values(d):
    """Recursively strip single-key ``{"value": ...}`` wrappers from nested data.

    Args:
        d: Arbitrary object; dicts and lists are traversed recursively, anything
            else is returned unchanged.

    Returns:
        A copy of the dict/list structure in which every dict whose only key is
        "value" has been replaced by its (unwrapped) content.
    """
    # Lists: unwrap element by element.
    if isinstance(d, list):
        return [unwrap_values(item) for item in d]

    # Anything that is neither a list nor a dict is a leaf.
    if not isinstance(d, dict):
        return d

    # A dict consisting solely of "value" is a wrapper around its content.
    if len(d) == 1 and "value" in d:
        return unwrap_values(d["value"])

    # Regular dict: keep the keys and unwrap each entry.
    return {key: unwrap_values(entry) for key, entry in d.items()}

# Load yaml file
# NOTE(review): assumes config.yaml is the run configuration exported from
# Weights & Biases, with entries wrapped as {"value": ...} - confirm.
with open("config.yaml", "r", encoding="utf-8") as f:
    config_dict = yaml.safe_load(f)

# Clean up the dictionary
config_dict = unwrap_values(config_dict)

# Create configuration
config = RobertaConfig(**config_dict)


def model_init():
    """Build a fresh sequence-classification model from `roberta-base` and `config`.

    A model with a classification head is required here: `compute_metrics`
    takes the argmax over logits, which the bare encoder returned by
    `AutoModel` does not produce.

    Returns:
        A new model instance loaded from the "roberta-base" checkpoint using
        the notebook-level `config`.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "roberta-base",
        config=config
    )


# Create model through the same factory so both code paths stay identical.
model = model_init()

Now we save the model as an artifact on Weights & Biases.

In [None]:
# Local directory for the model and tokenizer files; the artifact uses the same name.
model_dir = "youtube-roberta-base"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Start the run that the artifact is attached to.
wandb.init(project="huggingface", name="roberta-base-replica")

# Bundle the saved directory into a model artifact and log it; the call is the
# last expression so the artifact is shown as the cell output.
artifact = wandb.Artifact(model_dir, type="model")
artifact.add_dir(model_dir)
wandb.log_artifact(artifact)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Adding directory to artifact (./youtube-roberta-base)... Done. 9.5s


<Artifact youtube-roberta-base>