In [1]:
import torch
from transformers import pipeline, AutoTokenizer, set_seed
from transformers import GPT2Tokenizer, GPT2Model

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Passing device_map="auto" is enough: 🤗 Accelerate first fills the available GPU(s),
# then places the remaining weights on the CPU, and finally offloads to disk if there
# is not enough RAM.

# Generation below samples (do_sample=True), so fix the seed to make the printed
# sequences reproducible across kernel restarts.
set_seed(42)

pipe = pipeline('text-generation', model='gpt2', device_map="auto")
output = pipe(
    "This is a cool example!",
    do_sample=True,
    top_p=0.95,
    max_length=30,
    truncation=True,
    num_return_sequences=5,  # ask for five alternative continuations of the prompt
)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [14]:
# Encode a short string with the GPT-2 tokenizer; return_tensors='pt' makes the
# tokenizer return PyTorch tensors (see the printed input_ids / attention_mask).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
sequence = "hello world"
encoded_input = tokenizer(sequence, return_tensors='pt')
print(encoded_input)

# Forward pass only: no gradients are needed here, so disable autograd tracking
# instead of building a graph that is never used.
model = GPT2Model.from_pretrained('gpt2')
with torch.no_grad():
    output = model(**encoded_input)

{'input_ids': tensor([[31373,   995]]), 'attention_mask': tensor([[1, 1]])}


In [15]:
# Take the first (and only) row of the encoded batch as a plain Python list of ids,
# then map the ids back to text to confirm the round trip.
first_row_ids = encoded_input["input_ids"][0].tolist()
decoded_sequence = tokenizer.decode(first_row_ids)
print(decoded_sequence)

hello world


# Trainer

In [6]:
import os
from getpass import getpass

from datasets import load_dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification


def tokenize_dataset(dataset):
    """Tokenize the "text" field of ``dataset`` with the notebook-level ``tokenizer``.

    Used below as the function passed to ``dataset.map(..., batched=True)``.
    No padding is applied here; the data collator passed to the Trainer pads batches.
    """
    return tokenizer(dataset["text"])


tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")

# SECURITY: never hardcode the W&B API key in the notebook. Read it from the
# environment and only prompt (without echoing) when it is missing. Any key that
# was ever committed in a notebook must be treated as leaked and revoked.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

training_args = TrainingArguments(
    output_dir="training_output",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    report_to="wandb",  # enable logging to W&B
)

dataset = load_dataset("rotten_tomatoes")
dataset = dataset.map(tokenize_dataset, batched=True)  # tokenize every split in batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Combine model, arguments, data and preprocessing into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 8530/8530 [00:00<00:00, 27576.64 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 21821.66 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 21286.93 examples/s]


In [5]:
# Run the training loop configured by the `trainer` object built in the Trainer setup cell.
# To customize the loop (e.g. the loss function, optimizer, or scheduler), subclass
# Trainer and override the corresponding methods.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnexa4ai[0m ([33mnexaai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.4507
1000,0.3932
1500,0.2678
2000,0.2822


TrainOutput(global_step=2134, training_loss=0.3402043586595101, metrics={'train_runtime': 60.7028, 'train_samples_per_second': 281.041, 'train_steps_per_second': 35.155, 'total_flos': 195974132394480.0, 'train_loss': 0.3402043586595101, 'epoch': 2.0})

# Fetch models and tokenizers to use offline

In [None]:
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

# Load the tokenizer and model from a local directory instead of the Hub.
# "./your/path/bigscience_t0" is a placeholder: the directory must already contain
# the saved tokenizer and model files before this cell is run.
tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
# Use the seq2seq auto class imported above (`AutoModel` was never imported here).
model = AutoModelForSeq2SeqLM.from_pretrained("./your/path/bigscience_t0")

# Download a single file from the Hub. hf_hub_download returns the local path of the
# downloaded file, so load the config from that path rather than from a hand-written
# path that may not match the cache layout.
config_path = hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./bigscience_t0")

config = AutoConfig.from_pretrained(config_path)

# Pipeline

In [None]:
from transformers import pipeline

# Build a speech-recognition pipeline (no model is specified, so the task's default
# is used) and transcribe a sample audio file referenced by URL.
sample_audio_url = "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"
transcriber = pipeline(task="automatic-speech-recognition")
transcriber(sample_audio_url)

# Load pretrained instances with an AutoClass
1. Text: use a tokenizer to convert text into a sequence of tokens.  
2. Speech and audio: use a feature extractor to extract sequential features from audio waveforms and convert them into tensors.  
3. Image inputs: use an image processor to convert images into tensors.  
4. Multimodal inputs: use a processor to combine a tokenizer with a feature extractor or an image processor.  

- Load a pretrained tokenizer.  
- Load a pretrained image processor.  
- Load a pretrained feature extractor.  
- Load a pretrained processor.  
- Load a pretrained model.  
- Load a model as a backbone.  

In [6]:
# tokenizer
#
# The encoding printed below contains three fields:
#   input_ids      - the indices corresponding to each token in the sentence
#   attention_mask - whether a token should be attended to or not
#   token_type_ids - which sequence a token belongs to when there is more than one sequence
from transformers import AutoTokenizer

sequence = "In a hole in the ground there lived a hobbit."
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
encoded_input = tokenizer(sequence)
print(encoded_input)

{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
# Decoding the ids back to text shows that the tokenizer added two special tokens to
# the sentence: [CLS] (classifier) at the start and [SEP] (separator) at the end.
tokenizer.decode(encoded_input["input_ids"])

'[CLS] in a hole in the ground there lived a hobbit. [SEP]'

### padding token
Padding is a strategy for ensuring tensors are rectangular by adding a special padding token to shorter sentences.


In [9]:
# Three sentences of different lengths, encoded together as one batch.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

# Pad the shorter sentences so every row has the same length, truncate anything
# longer than max_length, and return PyTorch tensors.
encoded_input = tokenizer(
    batch_sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
print(encoded_input)

{'input_ids': tensor([[  101,  2021,  2054,  2055,  2117,  6350,  1029,   102,     0,     0,
             0,     0,     0,     0],
        [  101,  2123,  1005,  1056,  2228,  2002,  4282,  2055,  2117,  6350,
          1010, 28315,  1012,   102],
        [  101,  2054,  2055,  5408, 14625,  1029,   102,     0,     0,     0,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}


# Fine-tune a pretrained model

In [None]:
# `load_dataset` is provided by the 🤗 Datasets package, not by `transformers`.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook-level ``tokenizer``.

    Every example is padded to the maximum length (``padding="max_length"``) and
    truncated if it is too long (``truncation=True``).
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)
