# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [1]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.2 MB/s[0m eta [3

# Dataset processing

Uploading the dataset, splitting the data into train and validation sets

In [2]:
import pandas as pd
from datasets import DatasetDict, Dataset


data = pd.read_json('./pages/unreliable-sources.json', encoding='utf-8')

train_data = data.sample(frac=0.8, random_state=42)
validation_data = data.drop(train_data.index)

dataset = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(),
     'validation': Dataset.from_pandas(validation_data).shuffle()
     })

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['page_id', 'revision_id', 'title', 'content', 'has_template', '__index_level_0__'],
        num_rows: 2098
    })
    validation: Dataset({
        features: ['page_id', 'revision_id', 'title', 'content', 'has_template', '__index_level_0__'],
        num_rows: 525
    })
})

In [4]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

def builder(model_name, raw_dataset):
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def tokenize_function(sample):
      return tokenizer(sample["content"], truncation=True)

  def load_model():
    return AutoModelForSequenceClassification.from_pretrained(model_name)

  tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
  tokenized_datasets = tokenized_datasets.remove_columns(["content", "__index_level_0__"])
  tokenized_datasets = tokenized_datasets.rename_column("has_template", "labels")
  tokenized_datasets.set_format("torch")
  return load_model, tokenized_datasets, data_collator, tokenizer

load_model, tokenized_datasets, data_collator, tokenizer = builder("google/bigbird-roberta-base", dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]



Map:   0%|          | 0/2098 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

In [5]:
from dataclasses import dataclass

@dataclass
class HyperParameters:
    batch_size: int
    number_of_epochs: int
    learning_rate: float
    warm_up_steps: int

In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {
      "accuracy": acc,
      "f1": f1,
      "precision": precision,
      "recall": recall
    }

In [7]:
def get_training_results(model, hyperparameters):
    args = TrainingArguments(
        output_dir=f"./models/",
        evaluation_strategy = "epoch",
        save_strategy="epoch",
        learning_rate=hyperparameters.learning_rate,
        per_device_train_batch_size=hyperparameters.batch_size,
        per_device_eval_batch_size=hyperparameters.batch_size,
        num_train_epochs=hyperparameters.number_of_epochs,
        report_to='tensorboard',
        logging_dir=f'./logs/',
        load_best_model_at_end=True,
        weight_decay=0.01
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    for name, param in model.named_parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()
    trainer.train()
    trainer.save_model()
    results = trainer.evaluate()
    return results, model

In [8]:
def optimize(combinations, load_model, tokenized_datasets):
  batch_sizes = {hyperparameters.batch_size: hyperparameters for hyperparameters in combinations}
  max_accuracy = 0
  max_hyperparameters = None
  for batch_size in batch_sizes.keys():
    for hyperparameters in combinations:
      if batch_size != hyperparameters.batch_size:
        continue

      model = load_model()
      model.to(device)
      print(f"============== Hyperparameters: {hyperparameters} ==============")
      results, model = get_training_results(model, hyperparameters)
      if results["eval_accuracy"] > max_accuracy:
        max_accuracy = results["eval_accuracy"]
        max_hyperparameters = hyperparameters

      print(f"Validation accuracy: {results}")

  return max_hyperparameters

In [9]:
import torch
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_sizes = [1]
number_of_epochs = [5]
learning_rates = [3e-5]
warmup_steps = [2]

combinations = [
    HyperParameters(batch_size, number_of_epoch, learning_rate, warmup_step)
    for batch_size in batch_sizes
    for number_of_epoch in number_of_epochs
    for learning_rate in learning_rates
    for warmup_step in warmup_steps
]

optimal_hyperparameters = optimize(combinations, load_model, tokenized_datasets)

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Attention type 'block_sparse' is not possible if sequence_length: 446 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7506,0.698875,0.474286,0.305161,0.224947,0.474286
2,0.7777,0.70569,0.474286,0.305161,0.224947,0.474286
3,0.7004,0.714189,0.474286,0.305161,0.224947,0.474286
4,0.9272,0.756128,0.474286,0.305161,0.224947,0.474286
5,0.7851,0.784176,0.474286,0.305161,0.224947,0.474286


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation accuracy: {'eval_loss': 0.6988745331764221, 'eval_accuracy': 0.4742857142857143, 'eval_f1': 0.3051605758582503, 'eval_precision': 0.22494693877551022, 'eval_recall': 0.4742857142857143, 'eval_runtime': 21.4712, 'eval_samples_per_second': 24.451, 'eval_steps_per_second': 24.451, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
