## HuggingFace tokenizer 
- Auto\*Tokenizer, Auto\*Model are generic types
- tokenization and model need to match, output of tokenization => input of model
- tokenizer
    - the returned input_ids and attention_mask always have the same shape: one mask entry per token id
    - encode_plus
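    - tokenizer(test_sentences) invokes tokenizer.\_\_call\_\_, which encodes a sentence (or a batch of sentences) and returns input_ids together with attention_mask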
    - tokenizer.encode(text) == tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)), with the [CLS] id added at the start and the [SEP] id added at the end
    - tokenizer.decode(): takes a list argument
    - tokenizer.vocab is a dictionary of token:id pairs. Encoding 1. tokenizes each word in a sentence and 2. converts the tokens to ids, while decoding converts ids back to tokens according to the vocab.
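        -  tokenizer.special_tokens_map maps each special-token role (e.g. unk_token, pad_token) to its token string (e.g. [UNK], [PAD]); pass those strings to tokenizer.convert_tokens_to_ids() to get their ids.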
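        -  padding=True: shorter sequences are padded with the [PAD] id (0) up to the longest sequence in the batch; the padded positions get attention_mask 0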
  

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
from transformers import pipeline

# Instantiate the ready-made sentiment classifier. Only the task is given,
# no model argument is passed.
task_name = "sentiment-analysis"
pipeline(task=task_name)

In [None]:
# Load the tokenizer and the classifier from the SAME checkpoint name, so the
# ids produced by the tokenizer are the ones the model expects as input.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [9]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
# Three sentences of different lengths, so the effect of padding is visible.
test_sentences = ['today is a good day', 'today is not bad', 'so good']
# padding=True pads within the batch; return_tensors='pt' asks for PyTorch
# tensors, which can be unpacked straight into the model call.
batch_input = tokenizer(test_sentences, truncation=True, padding=True, return_tensors='pt')

In [12]:
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 1037, 2204, 2154,  102],
        [ 101, 2651, 2003, 2025, 2919,  102,    0],
        [ 101, 2061, 2204,  102,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0]])}

In [14]:
tokenizer.encode(test_sentences[0],)

[101, 2651, 2003, 1037, 2204, 2154, 102]

In [15]:
# Sub-word split only: token strings, no ids and no [CLS]/[SEP].
tokenizer.tokenize(test_sentences[0])

['today', 'is', 'a', 'good', 'day']

In [20]:
# tokenize -> ids done by hand: the same ids encode() returns, but without the
# [CLS] (101) and [SEP] (102) that encode() puts at the two ends.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentences[0]))

[2651, 2003, 1037, 2204, 2154]

In [16]:
model(**batch_input)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3083,  4.6922],
        [-3.6543,  3.8481],
        [-4.1938,  4.5566]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
# Round trip: decode the ids produced by encode() instead of a hand-copied id
# list, so the cell stays correct if the sentence or the checkpoint changes.
tokenizer.decode(tokenizer.encode(test_sentences[0]))

'[CLS] today is a good day [SEP]'

In [22]:
# token -> id mapping.
# NOTE(review): this displays the whole vocabulary; showing only a few items
# would keep the output short.
tokenizer.vocab

{'vijay': 17027,
 '[unused733]': 738,
 'insane': 9577,
 'burgess': 17754,
 'debuts': 26740,
 '##recht': 28109,
 'hoping': 5327,
 '海': 1902,
 '##yna': 18279,
 'badges': 23433,
 'richly': 26502,
 'henan': 27837,
 'perhaps': 3383,
 '##ular': 7934,
 'lowers': 24950,
 'rise': 4125,
 'fare': 13258,
 'distinctly': 19517,
 '口': 1788,
 'dex': 20647,
 'creature': 6492,
 '[unused222]': 227,
 '##ations': 10708,
 'backseat': 19978,
 'pam': 14089,
 'groaned': 9655,
 'auckland': 8666,
 'fran': 23151,
 '##eros': 27360,
 'crash': 5823,
 'mantle': 16019,
 'bowman': 19298,
 '##icated': 17872,
 'spy': 8645,
 '[unused651]': 656,
 '##pre': 28139,
 'ф': 1199,
 'lil': 13451,
 'leigh': 11797,
 '##logies': 21615,
 'airlines': 7608,
 'pleasant': 8242,
 'station': 2276,
 'haynes': 21805,
 '##umble': 26607,
 'continues': 4247,
 'lancaster': 10237,
 '##gall': 22263,
 '[unused731]': 736,
 '上': 1742,
 'fulton': 17049,
 'translucent': 22897,
 'interview': 4357,
 '##opped': 27288,
 'garry': 21507,
 '##breaker': 21204,


In [25]:
len(tokenizer.vocab), tokenizer.vocab_size

(30522, 30522)

In [34]:
# The token strings registered for the special roles (unk, sep, pad, cls, mask).
list(tokenizer.special_tokens_map.values())

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [35]:
# Look up the vocabulary id of every special token string.
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

## Model and Analysis

In [36]:
model(**batch_input)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3083,  4.6922],
        [-3.6543,  3.8481],
        [-4.1938,  4.5566]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [41]:
model.config

DistilBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "vocab_size": 30522
}

In [57]:
# Class index -> label name. The keys are ints, so an index taken from argmax
# can be used for the lookup directly.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [61]:
import torch
import torch.nn.functional as F

# Inference only: run the forward pass without building an autograd graph.
with torch.no_grad():
    output = model(**batch_input)

# One row of logits per sentence; softmax over the label axis gives class probabilities.
scores = F.softmax(output.logits, dim=1)
# Keep the index tensor and the label strings under separate names.
label_ids = torch.argmax(scores, dim=1)
labels = [model.config.id2label[i] for i in label_ids.tolist()]
print(scores)
print(labels)

tensor([[1.2334e-04, 9.9988e-01],
        [5.5147e-04, 9.9945e-01],
        [1.5837e-04, 9.9984e-01]])
['POSITIVE', 'POSITIVE', 'POSITIVE']


## Supplement to Tokenizer
- encode_plus
  - can encode a pair of sentences (text, text_pair) into a single sequence; token_type_ids is 0 for the 1st sentence (including [CLS] and its [SEP]) and 1 for the 2nd sentence
- token_type_ids
  - when sentences are encoded one by one (or as a batch of single sentences), BertTokenizer.from_pretrained('bert-base-uncased') gives every token the same token_type_id: all 0s

In [89]:
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer

In [90]:
# NOTE(review): this rebinds `model_name` and `tokenizer`, replacing the
# DistilBERT objects created earlier in the notebook; the earlier cells give
# different results if they are re-run after this one.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [67]:
# Training split only; the result exposes the texts as .data and the class ids as .target.
news_train = fetch_20newsgroups(subset='train')

In [113]:
# Peek at the first raw post (headers included), using attribute access like the other cells.
news_train.data[:1]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

In [66]:
type(news_train.data)

list

In [70]:
len(news_train.data), len(news_train.target)

(11314, 11314)

In [69]:
from collections import Counter
# Number of posts per class id, to see how balanced the training split is.
Counter(news_train.target)

Counter({10: 600,
         15: 599,
         8: 598,
         9: 597,
         11: 595,
         7: 594,
         13: 594,
         14: 593,
         5: 593,
         2: 591,
         12: 591,
         3: 590,
         6: 585,
         1: 584,
         4: 578,
         17: 564,
         16: 546,
         0: 480,
         18: 465,
         19: 377})

In [71]:
news_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [73]:
list(news_train)

['data', 'filenames', 'target_names', 'target', 'DESCR']

In [78]:
test_news = news_train.data[:3]
len(test_news)

3

In [85]:
# Batch of single documents, each truncated to max_length=32 ids ([CLS]/[SEP] included).
# No return_tensors is given, so the result holds plain Python lists.
tokenizer(test_news, truncation=True, max_length=32)

{'input_ids': [[101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 102], [101, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 1006, 3124, 13970, 2080, 1007, 3395, 1024, 9033, 5119, 8554, 1011, 2345, 2655, 12654, 1024, 2345, 102], [101, 2013, 1024, 1056, 29602, 6856, 1030, 14925, 1012, 14925, 2078, 1012, 19749, 1012, 3968, 2226, 1006, 2726, 1041, 12688, 1007, 3395, 1024, 1052, 2497, 3980, 1012, 1012, 1012, 3029, 1024, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [87]:
# Pair encoding: text and text_pair are packed into ONE sequence
# ([CLS] text [SEP] text_pair [SEP]) and max_length=32 applies to the combined length.
# NOTE(review): the transformers docs appear to mark encode_plus as deprecated in
# favour of tokenizer(text, text_pair, ...) — confirm for the installed version.
tokenizer.encode_plus(text=test_news[0], text_pair=test_news[1], truncation=True, max_length=32)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [88]:
# These ids are the input_ids of the sentence-pair encoding; decoding them shows
# the two segments separated by [SEP].
tokenizer.decode([101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102])

'[CLS] from : lerxst @ wam. umd. edu [SEP] from : guykuo @ carson. u. washington. edu [SEP]'

### Inside of Tokenizer
- How can every word/number be encoded with a vocabulary of only 30522 entries?
  - a word that is not in the vocabulary is split into smaller pieces that are
  - \#\# in front of a piece indicates that it joins onto the previous piece

In [95]:
# Probe sentences: one with a long number, one with made-up words, to see how
# text that is not in the vocabulary gets split.
sent1 = 'The author sold 12763598 copies'
sent2 = 'Children specially created the dragonworrier mostic'

In [96]:
# Show how each probe sentence is broken into word pieces.
for sentence in (sent1, sent2):
    print(tokenizer.tokenize(sentence))

['the', 'author', 'sold', '127', '##6', '##35', '##9', '##8', 'copies']
['children', 'specially', 'created', 'the', 'dragon', '##wo', '##rrier', 'most', '##ic']


In [110]:
# For this BertTokenizer, vocab is an OrderedDict listed in id order, starting
# with [PAD] and the [unusedN] placeholders.
# (tokenizer.ids_to_tokens is presumably the inverse id -> token mapping — confirm.)
# NOTE(review): this displays the whole vocabulary; a slice would keep the output short.
tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[unused0]', 1),
             ('[unused1]', 2),
             ('[unused2]', 3),
             ('[unused3]', 4),
             ('[unused4]', 5),
             ('[unused5]', 6),
             ('[unused6]', 7),
             ('[unused7]', 8),
             ('[unused8]', 9),
             ('[unused9]', 10),
             ('[unused10]', 11),
             ('[unused11]', 12),
             ('[unused12]', 13),
             ('[unused13]', 14),
             ('[unused14]', 15),
             ('[unused15]', 16),
             ('[unused16]', 17),
             ('[unused17]', 18),
             ('[unused18]', 19),
             ('[unused19]', 20),
             ('[unused20]', 21),
             ('[unused21]', 22),
             ('[unused22]', 23),
             ('[unused23]', 24),
             ('[unused24]', 25),
             ('[unused25]', 26),
             ('[unused26]', 27),
             ('[unused27]', 28),
             ('[unused28]', 29),
             ('[unused29]', 30),
  

In [106]:
# NOTE(review): `input` shadows the Python builtin for the rest of the session;
# the following cells read this name, so it is left unchanged here.
# No return_tensors is passed, so the encoding holds plain Python lists.
input = tokenizer(sent1)
input

{'input_ids': [101, 1996, 3166, 2853, 13029, 2575, 19481, 2683, 2620, 4809, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [107]:
input['input_ids']

[101, 1996, 3166, 2853, 13029, 2575, 19481, 2683, 2620, 4809, 102]

In [108]:
tokenizer.decode(input['input_ids'])

'[CLS] the author sold 12763598 copies [SEP]'

In [109]:
# Unlike decode(), this keeps the individual word pieces, '##' prefixes included.
tokenizer.convert_ids_to_tokens(input['input_ids'])

['[CLS]',
 'the',
 'author',
 'sold',
 '127',
 '##6',
 '##35',
 '##9',
 '##8',
 'copies',
 '[SEP]']

In [112]:
# Count the vocabulary entries that are continuation pieces (prefixed with '##').
cnt_sharp = sum(1 for token in tokenizer.vocab if token.startswith('##'))
print(cnt_sharp)

5828
