<a href="https://colab.research.google.com/github/saverin0/llms_workshops_files/blob/main/Env_setup_and_intro_oxford_llms_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers==4.57.0
!pip install Pillow==11.1.0
!pip install sentence-transformers==5.1.1
!pip install datasets==3.2.0
!pip install sentencepiece==0.2.1

In [2]:
import torch
import transformers
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 3.1 Encoder-only models

### 3.1.1 Sentiment analysis

In [7]:
# Sentiment-classification checkpoint used throughout section 3.1.1.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Model, tokenizer and config are all resolved from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# More about configs: https://huggingface.co/docs/transformers/main_classes/configuration
config = transformers.AutoConfig.from_pretrained(model_name)

In [8]:
sample = "I like learning stuff about neural networks!"

encoded_input = tokenizer(sample, return_tensors="pt")

# Inference only: disable autograd so no graph is recorded for the forward pass
# (this also makes the explicit `.detach()` unnecessary).
with torch.no_grad():
    output = model(**encoded_input)

# Take the logits of the first (and only) input and convert them to probabilities.
scores = softmax(output.logits[0].numpy())

In [9]:
# Softmax-normalised scores for `sample`; the label for each index is looked up
# through `config.id2label` in the next cell.
scores

array([0.0196515 , 0.15005878, 0.8302897 ], dtype=float32)

In [10]:
# Label names for class indices 0, 1 and 2, in the same order as `scores`.
tuple(config.id2label[class_id] for class_id in range(3))

('negative', 'neutral', 'positive')

### 3.1.2 Sentence Similarity

In [11]:
# Checkpoint for the sentence-similarity demo; this rebinds `model_name`, which
# previously held the sentiment checkpoint.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [12]:
from sentence_transformers import SentenceTransformer

# Reference sentence, plus one unrelated sentence and one paraphrase to compare
# it against. Each is kept as a one-element list.
sample_sentence = ["I am happy with this purchase"]
dissimilar_sample = ["My friend will come soon"]
similar_sample = ["I like what I bought"]

# From here on `model` is the sentence-embedding model, not the sentiment classifier.
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Sanity check: the reference sentence compared with itself.
reference_embedding = model.encode(sample_sentence)
candidate_embedding = model.encode(sample_sentence)
cosine_similarity(reference_embedding, candidate_embedding)

array([[0.99999994]], dtype=float32)

In [15]:
# Reference sentence vs. the unrelated sentence.
reference_embedding = model.encode(sample_sentence)
candidate_embedding = model.encode(dissimilar_sample)
cosine_similarity(reference_embedding, candidate_embedding)

array([[0.14736047]], dtype=float32)

In [16]:
# Reference sentence vs. the paraphrase.
reference_embedding = model.encode(sample_sentence)
candidate_embedding = model.encode(similar_sample)
cosine_similarity(reference_embedding, candidate_embedding)

array([[0.6236479]], dtype=float32)

## 3.2 Encoder-decoder models

### 3.2.1 Summarisation

In [17]:
# A short chat dialogue between two speakers. It is passed to `process_example`
# below as the text the model is asked to summarise. The `<file_gif>` lines are
# part of the text exactly as written here.

example = """
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
"""

In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from scipy.special import softmax

def process_example(model_name: str, sample_text: str, **generate_kwargs) -> None:
    """
    Run a seq2seq checkpoint on one piece of text and print the result.

    Loads the model and tokenizer for ``model_name``, tokenises ``sample_text``,
    calls ``model.generate`` and prints the original text followed by the
    decoded generation.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        sample_text: input text / prompt given to the model.
        **generate_kwargs: optional keyword arguments forwarded unchanged to
            ``model.generate`` (for example ``max_new_tokens=60``). When none
            are given, the default generation settings apply.

    Returns:
        None. The result is printed, not returned.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Forward the whole tokenizer output rather than only `input_ids`, so that
    # generate() also receives the attention mask.
    encoded = tokenizer(sample_text, return_tensors="pt")
    outputs = model.generate(**encoded, **generate_kwargs)
    # Only the first generated sequence is decoded and shown.
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("#" * 100)
    print("original text: \n")
    print(sample_text)
    print("#" * 100)
    print(f"Generated by {model_name}: \n")
    print(decoded_output)
    print("#" * 100)

In [22]:
# Checkpoint used for the summarisation demo.
# NOTE(review): the recorded output below repeats the start of the dialogue
# instead of summarising it — presumably this checkpoint is not fine-tuned for
# summarisation. Confirm whether that is the intended demonstration.
model_name = "facebook/bart-large"

In [23]:
# Loads the checkpoint, then prints the original dialogue followed by the generated text.
process_example(model_name, example)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

####################################################################################################
original text: 


Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

####################################################################################################
Generated by facebook/bart-large: 

�Hannah: Hey, do you have Betty's number?�Amanda: Lem
####################################################################################################


### 3.2.2 Machine translation

In [24]:
# The task (translation) is stated directly inside the prompt text.
# Note: this rebinds both `model_name` and `example` from the summarisation section.
model_name = "google/flan-t5-base"
example = "Translate from English to German: how are you today?"

process_example(model_name=model_name, sample_text=example)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

####################################################################################################
original text: 

Translate from English to German: how are you today?
####################################################################################################
Generated by google/flan-t5-base: 

wie sind Sie heute?
####################################################################################################


## 3.3 Decoder-only models

In [25]:
model_name = "bigscience/bloom-560m"

In [26]:
from transformers import BloomTokenizerFast, BloomForCausalLM

# Prompt that the causal language model is asked to continue.
text = "Best way to spend weekends is"

# Tokenizer and model are loaded from the checkpoint named in `model_name`.
tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name)

# Encode as PyTorch tensors ("pt"); the result is unpacked into `model.generate` later.
encoded_input = tokenizer(text, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [27]:
# No generation arguments are passed here, so default generation settings apply.
# NOTE(review): the recorded continuation below stops mid-sentence — presumably
# the default length limit; consider passing `max_new_tokens` explicitly.
output = model.generate(**encoded_input)

In [28]:
# Decode the first generated sequence back to text, dropping special tokens.
generated_ids = output[0]
decoded_output = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(decoded_output)

Best way to spend weekends is to go to the beach. The beach is a great place to relax and enjoy the sun. The
