<a href="https://colab.research.google.com/github/saverin0/llms_workshops_files/blob/main/oxford_llms_workshop_1_Intro_to_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Pinned dependencies for this notebook. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel; `-q` keeps pip's own output short.
%pip install -q transformers==4.57.0
%pip install -q Pillow==11.1.0
%pip install -q sentence-transformers==5.1.1
%pip install -q datasets==3.2.0
%pip install -q sentencepiece==0.2.1

In [2]:
import numpy as np
import typing as tp
import transformers
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from scipy.special import softmax
from google.colab import drive
from PIL import Image

In [3]:
# Tokenizer of the uncased BERT base checkpoint, used by the tokenization exercise below.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)


def tokenize(
    text: str,
    tokenizer: tp.Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    max_length: tp.Optional[int] = None,
) -> transformers.BatchEncoding:
    """Encode ``text`` with ``tokenizer`` and return the tokenizer's full output.

    Args:
        text: The raw string to encode.
        tokenizer: A Hugging Face tokenizer; see
            https://huggingface.co/docs/transformers/main_classes/tokenizer
        max_length: Optional upper bound on the number of tokens. Choose it
            wisely: think about how it relates to the pretrained model you
            want to use together with the given tokenizer.

    Returns:
        Whatever calling the tokenizer returns (for the BERT tokenizer used in
        this notebook: ``input_ids``, ``token_type_ids`` and ``attention_mask``).
    """
    # Truncation is requested explicitly whenever a limit is given. Passing only
    # `max_length` makes transformers emit a warning and fall back to the same
    # truncation strategy implicitly.
    return tokenizer(
        text=text,
        max_length=max_length,
        truncation=max_length is not None,
    )


sample_text = "Last year I visited London and I liked it a lot."

# Exercise: test your function with the sentence above; the tokenizer of any model will do.
# Note: `input_ids` receives everything `tokenize` returns, not only the list of ids.
input_ids = tokenize(text=sample_text, tokenizer=tokenizer, max_length=100)
print(input_ids)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [101, 2197, 2095, 1045, 4716, 2414, 1998, 1045, 4669, 2009, 1037, 2843, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
def detokenize(
    encoded_text: tp.Sequence[int],
    tokenizer: tp.Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    skip_special_tokens: bool = True,
) -> str:
    """Decode token ids back into a single string.

    Args:
        encoded_text: The token ids to decode (anything ``tokenizer.decode``
            accepts as ``token_ids``; this notebook passes both a plain list
            and one row of ``model.generate`` output).
        tokenizer: The tokenizer that produced the ids.
        skip_special_tokens: Forwarded to ``tokenizer.decode``; the default
            ``True`` matches the previous hard-coded behaviour.

    Returns:
        The decoded text as returned by ``tokenizer.decode``.
    """
    return tokenizer.decode(
        token_ids=encoded_text,
        skip_special_tokens=skip_special_tokens,
    )


# Round-trip check: decode the ids of the encoded sample sentence back into text.
tokens = detokenize(encoded_text=input_ids["input_ids"], tokenizer=tokenizer)
print(tokens)

last year i visited london and i liked it a lot.


### 3) Find a model and write code to translate a sentence from English to German with that model.

In [7]:
%%capture
# Extra dependency for the translation cell below. Pinned like the other
# packages in this notebook; `%pip` installs into the running kernel's environment.
%pip install -q sacremoses==0.1.1

In [8]:
model_name = "Helsinki-NLP/opus-mt-en-de"


def translate_text(text: str, model_name: str) -> str:
    """Translate ``text`` with the seq2seq checkpoint ``model_name``.

    The model and its tokenizer are both loaded from ``model_name`` on every
    call, so repeated calls pay the loading cost again.

    Args:
        text: The sentence to translate.
        model_name: Hugging Face Hub id of a sequence-to-sequence model
            (here an English-to-German checkpoint).

    Returns:
        The decoded translation, with special tokens removed by ``detokenize``.
    """
    model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    # Encode the `text` argument. The previous version read the notebook-level
    # variable `input` here, so the parameter was silently ignored.
    encoded = tokenizer(text, return_tensors="pt")
    generated = model.generate(**encoded)

    # `generate` returns one sequence per input; only one sentence was passed in.
    return detokenize(generated[0], tokenizer)


# Not named `input`, so the builtin `input()` stays usable in this cell.
english_sentence = "London is the capital of Great Britain"

print(translate_text(english_sentence, model_name))

London ist die Hauptstadt Großbritanniens


### 4) Understanding model outputs.

In [9]:
# Checkpoint for the "understanding model outputs" exercise.
model_name = "j-hartmann/emotion-english-distilroberta-base"

# Config, tokenizer and classification model all come from the same checkpoint.
config = transformers.AutoConfig.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [10]:
# Display the checkpoint configuration; its `id2label` entry maps class indices to label names.
config

RobertaConfig {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "anger",
    "1": "disgust",
    "2": "fear",
    "3": "joy",
    "4": "neutral",
    "5": "sadness",
    "6": "surprise"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 0,
    "disgust": 1,
    "fear": 2,
    "joy": 3,
    "neutral": 4,
    "sadness": 5,
    "surprise": 6
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.57.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size"

In [11]:
def get_most_and_least_probable_labels(
    text: str,
    tokenizer: tp.Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    model: transformers.PreTrainedModel,
    config: transformers.PretrainedConfig,
) -> tp.Dict[str, str]:
    """Classify ``text`` and report the most and the least probable label.

    Args:
        text: The input sentence.
        tokenizer: Tokenizer matching ``model``.
        model: A sequence-classification model whose output exposes ``logits``.
        config: The model configuration; only its ``id2label`` mapping
            (class index -> label name) is read.

    Returns:
        A dict with the keys ``"most_probable_emotion"`` and
        ``"least probable emotion"``. The two keys are spelled inconsistently,
        but they are kept unchanged so existing callers keep working.
    """
    # Truncate so that over-long inputs cannot exceed the model's maximum length.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**encoded)

    # A single text was passed, so row 0 holds its logits; softmax turns them
    # into probabilities.
    probabilities = softmax(outputs.logits[0].detach().numpy())

    # Cast numpy integers to plain ints before using them as `id2label` keys.
    most_probable_id = int(np.argmax(probabilities))
    least_probable_id = int(np.argmin(probabilities))

    return {
        "most_probable_emotion": config.id2label[most_probable_id],
        "least probable emotion": config.id2label[least_probable_id],
    }


# Named `emotion_text` rather than `input` so this cell does not shadow the builtin `input()`.
emotion_text = "This was the worst experience ever!!!"

outputs = get_most_and_least_probable_labels(emotion_text, tokenizer, model, config)

print(outputs)

{'most_probable_emotion': 'disgust', 'least probable emotion': 'joy'}


### 5) Is anything wrong with the logic below?

In [12]:
model_name = "bert-base-uncased"
tokenizer_name = "bert-base-cased"  # cased checkpoint; not used for loading below

# The model and the tokenizer should come from the same checkpoint. If a model was
# trained with an uncased tokenizer but a cased tokenizer is used at inference time,
# the results will be poor. Always use the tokenizer the model was trained with,
# which is why both objects are loaded from `model_name` here.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
