In [3]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)
for r in result:
    print(f"label: {r['label']}, with score: {round(r['score'], 7)}")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


label: POSITIVE, with score: 0.9598051
label: NEGATIVE, with score: 0.9994559


In [4]:
# 接下来是复刻管道的多个步骤，按照三个步骤分开执行
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "glad to see you",
    "i love you",
    "glad to here that",
    "今天天气好热啊，受不了了"
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
for key, value in inputs.items():
    print(f"{key}: {value}")
# print(inputs)

input_ids: tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  5580,  2000,  2156,  2017,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  1045,  2293,  2017,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  5580,  2000,  2182,  2008,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,   100,  1811,  1811,   100,   100,   100,   100,  1989,   100,
          1744,   100,   100,   102,     0,     0]])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 

In [6]:
# explore model
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

my_model_outputs = model(**inputs)
print(my_model_outputs.last_hidden_state.shape)


torch.Size([6, 16, 768])


In [7]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, hidden_size)
print(outputs.logits)
print(outputs.loss)

torch.Size([6, 2])
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464],
        [-4.1307,  4.5048],
        [-4.2756,  4.6393],
        [-3.8654,  4.1883],
        [ 1.1069, -0.9462]], grad_fn=<AddmmBackward0>)
None


In [8]:
import torch
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
print(model.config.id2label)
for i, pred in enumerate(predictions):
    if pred[0] > pred[1]:
        print(f"Sentence {i} is negative with score {pred[0]:.4f}")
    else:
        print(f"Sentence {i} is positive with score {pred[1]:.4f}")

tensor([[4.0195e-02, 9.5981e-01],
        [9.9946e-01, 5.4418e-04],
        [1.7765e-04, 9.9982e-01],
        [1.3436e-04, 9.9987e-01],
        [3.1781e-04, 9.9968e-01],
        [8.8626e-01, 1.1374e-01]], grad_fn=<SoftmaxBackward0>)
{0: 'NEGATIVE', 1: 'POSITIVE'}
Sentence 0 is positive with score 0.9598
Sentence 1 is negative with score 0.9995
Sentence 2 is positive with score 0.9998
Sentence 3 is positive with score 0.9999
Sentence 4 is positive with score 0.9997
Sentence 5 is negative with score 0.8863


In [9]:
# use model
from transformers import BertConfig, BertModel

# 初始化 Config 类
config = BertConfig()
print(config)
# 如果想要加载预训练模型，可以使用 `from_pretrained` 方法
# config = BertConfig.from_pretrained("bert-base-uncased")
# 这里使用默认的配置参数初始化 BertModel
# model = BertModel(config)
model = BertModel.from_pretrained("bert-base-uncased") # 加载预训练模型
model.save_pretrained("my_bert_model")  # 保存模型到本地


BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.53.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [10]:
## begin tokenizer
text = "Jim Henson was a puppeteer"
tokenized_text = text.split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [12]:
result = tokenizer("hello world",padding='max_length', max_length=10, return_tensors="pt")
for key, value in result.items():
    print(f"{key}: {value}")

input_ids: tensor([[  101, 19082,  1362,   102,     0,     0,     0,     0,     0,     0]])
token_type_ids: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])


In [14]:
# 单独tokenize
sequence = "Using a Transformer network is simple"
token = tokenizer.tokenize(sequence)  # 分词
print(token)  # ['Using', 'a', 'Transformer', 'network', 'is', 'simple'
ids = tokenizer.convert_tokens_to_ids(token)  # 转换为ID
print(ids)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [15]:
decoded_str = tokenizer.decode(ids)  # 转换回token
print(decoded_str)  # Using a Transformer network is simple

Using a Transformer network is simple
