In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Checkpoint that supplies both the classification weights and the matching tokenizer.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the sequence-classification model from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:
# Build the sentiment pipeline from the explicitly loaded model/tokenizer pair
# rather than letting `pipeline` choose a default checkpoint.
classifier = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

Device set to use cpu


In [5]:
# Run the pipeline on a single sentence and keep the result for inspection.
res = classifier("I've been waiting for a HuggingFace course my whole life.")

In [6]:
# Display the pipeline result: a list holding one label/score dict.
res

[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

In [7]:
# Short sample sentence used for the tokenizer walkthrough.
text_seq = "Stay humble, eh."

In [None]:
# Calling the tokenizer directly returns `input_ids` plus an `attention_mask`.
# In `attention_mask`, 0 means the attention block should ignore that token.
tokenizer(text_seq)

{'input_ids': [101, 2994, 15716, 1010, 15501, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
## 101 is the beginning-of-sequence token ([CLS]); 102 is the end-of-sequence token ([SEP])

In [9]:
# `tokenize` only splits the text into token strings; the output contains
# no IDs and none of the 101/102 special tokens.
tokenizer.tokenize(text_seq)

['stay', 'humble', ',', 'eh', '.']

In [10]:
# Same conversion in two explicit steps: text -> token strings -> vocabulary IDs.
tokens = tokenizer.tokenize(text_seq)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [11]:
# These match the `input_ids` from the direct tokenizer call, minus the
# leading 101 and trailing 102.
ids

[2994, 15716, 1010, 15501, 1012]

In [None]:
# `decode` maps the IDs back to a single string (lower-cased in the output).
tokenizer.decode(ids)

'stay humble, eh.'

In [16]:
# Classify the sample sentence with the pipeline.
classifier(text_seq)

[{'label': 'POSITIVE', 'score': 0.9978874325752258}]

In [19]:
# NOTE: passing a list to `tokenize` produces one flat token list (see the
# output), not one token list per sentence.
tokenizer.tokenize(["Stay humble, you arrogant boy", "Wow, this is horrible"])

['stay',
 'humble',
 ',',
 'you',
 'arrogant',
 'boy',
 'wow',
 ',',
 'this',
 'is',
 'horrible']

In [21]:
# The pipeline accepts a list of sentences and returns one label/score dict per input.
classifier(["you arrogant boy", "Wow, this is horrible"])

[{'label': 'NEGATIVE', 'score': 0.9948498606681824},
 {'label': 'NEGATIVE', 'score': 0.998908281326294}]

In [24]:
# Rebind `text_seq` to a new example sentence for the next round of
# tokenize / convert / decode.
text_seq = "Lionel is the king"

In [25]:
# Tokenize the current `text_seq` and look up each token's vocabulary ID.
tokens = tokenizer.tokenize(text_seq)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [26]:
# Round-trip check: decode the IDs back into text.
tokenizer.decode(ids)

'lionel is the king'

In [27]:
# One ID per word here: each of the four words maps to a single token.
ids

[14377, 2003, 1996, 2332]

In [32]:
# A made-up first word, to see how the tokenizer handles a word it
# presumably does not have as a single vocabulary entry.
text_seq = "Raahfsasasfda is the king"

In [33]:
# Tokenize the made-up sentence and convert the resulting pieces to IDs.
tokens = tokenizer.tokenize(text_seq)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [36]:
# The made-up word is split into several sub-word pieces; continuation
# pieces carry a `##` prefix.
tokens

['ra', '##ah', '##fs', '##asa', '##sf', '##da', 'is', 'the', 'king']

In [34]:
# Decoding joins the `##` pieces back into the original word (lower-cased).
tokenizer.decode(ids)

'raahfsasasfda is the king'

In [35]:
# One ID per sub-word piece, followed by the IDs for "is the king".
ids

[10958, 4430, 10343, 16782, 22747, 2850, 2003, 1996, 2332]

In [37]:
import torch
import torch.nn.functional as F

In [38]:
# Two example sentences used as a small batch, both for the pipeline and for
# the manual tokenizer/model pass.
X_train = [
    "I've been waiting for a HuggingFace course my whole life.",
    "Python is great!",
]
# The original cell spelled this name `X_trian`; keep that spelling as an alias
# so cells that still reference it keep working.
X_trian = X_train

In [39]:
# Classify the whole batch through the pipeline (overwrites the earlier `res`).
res = classifier(X_trian)

In [40]:
# Display the batch result: one label/score dict per sentence.
res

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'POSITIVE', 'score': 0.9998615980148315}]

In [41]:
# Encode both sentences into one tensor batch: pad to a common length,
# truncate at 32 tokens, and return PyTorch tensors.
batch = tokenizer(
    X_trian,
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt',
)

In [42]:
# Inspect the encoded batch: the shorter sentence is right-padded with 0s in
# `input_ids`, and its `attention_mask` is 0 at those padded positions.
batch

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101, 18750,  2003,  2307,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [43]:
# Manual equivalent of the pipeline call: forward pass, softmax over the class
# dimension, then argmax to pick the predicted class index for each sentence.
with torch.no_grad():  # inference only, so gradient tracking is not needed
    outputs = model(**batch)
    predictions = F.softmax(outputs.logits, dim=1)
    labels = predictions.argmax(dim=1)

# Show each stage: raw model output, class probabilities, predicted indices.
print(outputs)
print(predictions)
print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-4.2745,  4.6111]]), hidden_states=None, attentions=None)
tensor([[4.0195e-02, 9.5980e-01],
        [1.3835e-04, 9.9986e-01]])
tensor([1, 1])


In [44]:
# Persist both the tokenizer and the model to the same local directory so they
# can be reloaded together later with `from_pretrained`.
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [45]:
# Reload the tokenizer from the local save directory.
tok = AutoTokenizer.from_pretrained(save_directory)

In [46]:
# Reload the saved model weights from the local save directory. The model
# class is required here: `AutoTokenizer.from_pretrained` would return a
# second tokenizer, not a model.
mod = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [47]:
## Reference: https://www.youtube.com/watch?v=QEaBAZQCtwE
## See the 13-minute mark for an easier way to train
## a PyTorch model.