In [1]:
# !pip install transformers
# !pip install sentence-transformers

# Welcome to our Huggingface Demo!

- This notebook gives some ideas about how we can use Huggingface's transformers library to do some VERY COOL STUFF with VERY LITTLE CODE
- Examples are drawn from Huggingface github here: https://github.com/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb

# Sentiment Analysis

In [1]:
from transformers import pipeline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [3]:
demo_text = "Yu Chen is the best teacher ever"
classifier(demo_text)

[{'label': 'POSITIVE', 'score': 0.9998539686203003}]

In [4]:
demo_text = "Yu Chen is not the best teacher ever"
classifier(demo_text)

[{'label': 'NEGATIVE', 'score': 0.9990173578262329}]

In [5]:
demo_text = "Yu Chen is not not the best teacher ever"
classifier(demo_text)

[{'label': 'NEGATIVE', 'score': 0.999593198299408}]

# Sentiment Analysis - Reviews

In [6]:
review_text = "I did not hate anything about this movie!"
classifier(review_text)

[{'label': 'POSITIVE', 'score': 0.9978073239326477}]

In [7]:
review_text = "I did not like one thing about this product!"
classifier(review_text)

[{'label': 'NEGATIVE', 'score': 0.9987735152244568}]

# Paraphrasing

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [9]:
## PYTORCH CODE
sequence_0 = "This is a natural language processing class at Marshall Business School"
sequence_1 = "USC has great deep learning classes"
sequence_2 = "Marshall offers a language processing course"
# The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
# the sequence, as well as compute the attention masks.

print('full statement:', sequence_0)

paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")
paraphrase_classification_logits = model(**paraphrase).logits
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]


print('\nprobability "', sequence_1, '" is paraphrase:', paraphrase_results[1])

paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
paraphrase_classification_logits = model(**paraphrase).logits
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]

print('\nprobability "', sequence_2, '" is paraphrase:', paraphrase_results[1])


full statement: This is a natural language processing class at Marshall Business School

probability " USC has great deep learning classes " is paraphrase: 0.056384071707725525

probability " Marshall offers a language processing course " is paraphrase: 0.9214928150177002


# Extractive Question Answering

In [10]:
from transformers import pipeline
question_answerer = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [11]:
context = r"""
Telsa stock soared today after another positive earnings report.  
Elon Musk did some silly stuff on one of his friend's podcasts, but that only seemed to help.
"""

In [12]:
result = question_answerer(question="What did Elon Musk do?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'did some silly stuff', score: 0.3841, start: 78, end: 98


In [13]:
result = question_answerer(question="What happened to Tesla's stock today?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'soared', score: 0.784, start: 13, end: 19


# Next-Word Prediction

In [14]:
## PYTORCH CODE
from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
import torch
from torch import nn
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [15]:
sequence = f"This has been my favorite course so far during my graduate"
inputs = tokenizer(sequence, return_tensors="pt")
input_ids = inputs["input_ids"]
# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]
# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
generated = torch.cat([input_ids, next_token], dim=-1)
resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)

This has been my favorite course so far during my graduate program


# Text Generation

In [16]:
from transformers import pipeline
text_generator = pipeline("text-generation")

No model was supplied, defaulted to gpt2 (https://huggingface.co/gpt2)


In [17]:
print(text_generator("This has been my favorite class so far", max_length=50, do_sample=False))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "This has been my favorite class so far. I've been doing this for a while now and I'm really excited to get started. I'm really excited to get started with this class. I'm really excited to get started with this class. I"}]


# Summarization

In [18]:
from transformers import pipeline
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [19]:
ARTICLE = """ Tesla drivers say they have been locked out of their cars after an outage struck the carmaker's app.
Dozens of owners posted on social media about seeing an error message on the mobile app that was preventing them from connecting to their vehicles.
Tesla chief executive Elon Musk personally responded to one complaint from a driver in South Korea, saying on Twitter: "Checking."
Mr Musk later said the app was coming back online.
The Tesla app is used as a key by drivers to unlock and start their cars.
Owners posted a multitude of complaints online about not being able to use their vehicles.
"I'm stuck an hour away from home because I normally use my phone to start [my] car," one owner tweeted.
About 500 users reported an error on the app at around 16:40 ET (21:40 GMT) on Friday, according to the outage tracking site DownDetector. Five hours later, there were just over 60 reports of an error.
"Apologies, we will take measures to ensure this doesn't happen again," Mr Musk tweeted.
The app is not the only way to access the cars though, Stuart Masson, editor of The Car Expert website, told the BBC.
"""

print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

[{'summary_text': ' Drivers say they have been locked out of their cars after an outage on the Tesla app . The app is used as a key by drivers to unlock and start their cars . Elon Musk later said the app was coming back online .'}]


# Semantic Similarity

In [20]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [21]:
model = SentenceTransformer('stsb-roberta-large')

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

In [22]:
sentence1 = "Apple's earnings were affected by a recent negative outlook in the market for new headphones"
sentence2 = "That apple fell on the floor"
sentence3 = "The market for personal audio devices took a hit last week"
# encode sentences to get their embeddings
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)
embedding3 = model.encode(sentence3, convert_to_tensor=True)

In [23]:
# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Similarity score:", cosine_scores.item())

Sentence 1: Apple's earnings were affected by a recent negative outlook in the market for new headphones
Sentence 2: That apple fell on the floor
Similarity score: 0.3997766971588135


In [24]:
# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding3)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence3)
print("Similarity score:", cosine_scores.item())

Sentence 1: Apple's earnings were affected by a recent negative outlook in the market for new headphones
Sentence 2: The market for personal audio devices took a hit last week
Similarity score: 0.6564770936965942
