# datasets

In [3]:
from datasets import load_dataset

In [None]:
raw_dataset = load_dataset("glue", "mrpc")
raw_dataset

In [None]:
raw_dataset['train']

In [6]:
train = raw_dataset["train"]

In [None]:
train.features

In [None]:
train[15]

In [None]:
raw_dataset['validation'][87]

In [10]:
from huggingface_hub import list_models

In [None]:
# Query the Hub for models matching the "bert" filter published by "google-bert".
bert = list_models(filter="bert", author="google-bert")
# Unpack the results so that `sep` is applied between models; printing a
# single list object would ignore `sep` and put everything on one line.
print(*bert, sep="\n")

In [12]:
CHECKPOINT = "google-bert/bert-base-uncased"

In [13]:
from transformers.models.auto import AutoTokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

In [None]:
print(train[15]['sentence1'])
print(train[15]['sentence2'])

In [None]:
# Encode training example 15 as a sentence pair: the two sentences are passed
# as the first and second positional arguments of the tokenizer call.
tokenized_pairs = tokenizer(
    train[15]['sentence1'], train[15]['sentence2'], padding=True, truncation=True
)
tokenized_pairs

In [None]:
tokenizer.convert_ids_to_tokens(tokenized_pairs['input_ids'])

In [18]:
def tokenize_function(data):
    """Tokenize the `sentence1` / `sentence2` columns of `data` as sentence pairs.

    Padding is intentionally left out here: padding per batch at collation
    time is preferred over padding against the entire dataset. Truncation is
    enabled on the tokenizer call.
    """
    first_sentences = data['sentence1']
    second_sentences = data['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

## batch the dataset

Pass `batched=True` to the `map()` function so that the tokenize function receives a batch of examples per call.

In [None]:
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets

# dynamic padding

Pad batch by batch, so each batch is padded only to its own longest sequence and long runs of padding are avoided.

In [21]:
from transformers.data import DataCollatorWithPadding

In [None]:
# First eight tokenized training examples, as a mapping of column name -> values.
samples = tokenized_datasets["train"][:8]
# Keep every column except the example index and the two raw sentence columns.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("idx", "sentence1", "sentence2")
}
# Length of each example's `input_ids` before any padding is applied.
list(map(len, samples['input_ids']))

In [34]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

# practice

use `sst-2` dataset

In [None]:
sst_2 = load_dataset("glue","sst2")

In [None]:
sst_2['train'][1]

In [None]:
# Print the `id` of every model returned for filter "distilbert" and author "distilbert".
l = list_models(filter="distilbert", author="distilbert")
for i in l:
    print(i.id)

In [48]:
SST_CHECKPOINT = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

In [None]:
sst_tokenizer = AutoTokenizer.from_pretrained(SST_CHECKPOINT)

In [53]:
def sst_tokenize(data):
    """Tokenize the `sentence` column of `data` with `sst_tokenizer`.

    Truncation is enabled; no padding is requested at this stage.
    """
    sentences = data['sentence']
    encoded = sst_tokenizer(sentences, truncation=True)
    return encoded

In [None]:
tokenized_sst = sst_2.map(sst_tokenize, batched=True)

In [55]:
sst_collator = DataCollatorWithPadding(tokenizer=sst_tokenizer)

In [None]:
# First twelve tokenized SST-2 training examples, as a mapping of column name -> values.
sst_samples = tokenized_sst['train'][:12]
# Keep every column except the raw sentence and the example index.
sst_samples = {
    column: values
    for column, values in sst_samples.items()
    if column not in ('sentence', 'idx')
}
# Length of each example's `input_ids` before the collator is applied.
list(map(len, sst_samples['input_ids']))

In [None]:
batched_sst = sst_collator(sst_samples)
[len(x) for x in batched_sst['input_ids']]

available configs: ['ax', 'cola', 'mnli', 'mnli_matched', 'mnli_mismatched', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']

In [76]:
ax = load_dataset("glue", "ax")

In [None]:
cola = load_dataset("glue", "cola")

In [None]:
train

In [None]:
sst_2['train']

In [None]:
ax['test']

In [None]:
len(cola['train'].features)

# practice 2

generalized GLUE tokenizer

In [90]:
def glue_tokenizer(data):
    """Tokenize a batch from a GLUE config, choosing the text columns by name.

    Counting columns is not enough to tell the configs apart: several
    sentence-pair configs have the same number of columns as `mrpc` but use
    different column names. The text columns are therefore looked up by name.

    Args:
        data: A mapping of column name -> values, as passed by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output. Sentence-pair batches go through `tokenizer`
        with truncation enabled; single-sentence batches go through
        `sst_tokenize`, exactly as before.

    Raises:
        KeyError: If `data` has none of the known GLUE text columns.
    """
    # Pairs are checked before the single-sentence case because qnli has both
    # a "question" and a "sentence" column.
    pair_columns = (
        ("sentence1", "sentence2"),  # mrpc, rte, stsb, wnli
        ("premise", "hypothesis"),  # mnli, mnli_matched, mnli_mismatched, ax
        ("question", "sentence"),  # qnli
        ("question1", "question2"),  # qqp
    )
    columns = set(data.keys())
    for first, second in pair_columns:
        if first in columns and second in columns:
            return tokenizer(data[first], data[second], truncation=True)
    if "sentence" in columns:  # cola, sst2
        return sst_tokenize(data)
    raise KeyError(f"no known GLUE text columns among {sorted(columns)}")

In [None]:
tokenized_cola = cola.map(glue_tokenizer, batched=True)

In [None]:
tokenized_cola

In [None]:
tokenized_mrpc = raw_dataset.map(glue_tokenizer, batched=True)

In [None]:
tokenized_mrpc

# Fine Tuning

`Trainer` API

In addition to the self-attention heads inside the pretrained model, a transformer used for a downstream task has a task-specific head layer on top of the pretrained model:

- Sequence classification head
- Token classification head
- Question-answering head, etc.

These "**heads**" are usually just feed forward network layers on top of the pretrained model

## `TrainingArguments`

contains all the hyperparams for the `Trainer` class to use

required arg: path to save trained model and intermediate checkpoints

In [None]:
# load from transformer lib
from transformers import (
    TrainingArguments, # hyperparameters
    Trainer, # training driver
    AutoTokenizer, # tokenization
    AutoModelForSequenceClassification, # model download
    DataCollatorWithPadding
)

In [2]:
# dataset downloader
from datasets import load_dataset

In [3]:
mrpc = load_dataset(
    path="glue",
    name="mrpc"
    )

In [None]:
# One checkpoint name is shared by the model and the tokenizer.
CHECKPOINT = "google-bert/bert-base-uncased"
# Sequence-classification model configured with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# The collator is built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def tokenize_mrpc(data):
    """Tokenize the `sentence1` / `sentence2` columns of `data` as sentence pairs.

    Truncation is enabled; no padding is requested here.
    """
    first_sentences = data["sentence1"]
    second_sentences = data["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [None]:
tokenized_mrpc = mrpc.map(tokenize_mrpc, batched=True)

In [None]:
# Hyperparameters for the run: outputs go under "test-trainer";
# report_to="none" is set to avoid codecarbon.
training_args = TrainingArguments(
    output_dir="test-trainer",
    report_to="none",
)

# Wire the model, hyperparameters, tokenized MRPC splits, collator and
# tokenizer into the training driver.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_mrpc["train"],
    eval_dataset=tokenized_mrpc["validation"],
    processing_class=tokenizer,
)

In [None]:
trainer.train()