In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-2b-it")


In [13]:
text_1 = "Original Sceptre 8142026670003C TV Remote Control E165BV(WV)-SS E168BV(WV)-SSE205BV-SMQCCE325BV-SR E328BV(WV)-SRX325BV-FSR X405BV-FSRX505BV-FSRU505CV-UMR U508CV-UMKRU550CV-UM08RU650CV-UMRU750CV-UM"
text_2 = "Metene Pulse Oximeter Fingertip with Batteries and Lanyard, OLED Blood Oxygen Saturation Monitor, 20E"
text_tokens = tokenizer_gemma(text_1)


token_text = tokenizer_gemma.convert_ids_to_tokens(text_tokens.input_ids)
vocab_gemma = list(tokenizer_gemma.vocab)

print(f"Gemma vocab len - {len(vocab_gemma)}")
print(f"Gemma vocab - {vocab_gemma[-10:]}")

print(len(text_tokens.input_ids))
print(token_text)


Gemma vocab len - 256000
Gemma vocab - ['purposes', '▁Nineteen', 'Separ', '▁Lanes', 'leuk', '▁biss', '繹', '▁fortsetter', '▁detailing', '▁Thebes']
126
['<bos>', 'Original', '▁S', 'ceptre', '▁', '8', '1', '4', '2', '0', '2', '6', '6', '7', '0', '0', '0', '3', 'C', '▁TV', '▁Remote', '▁Control', '▁E', '1', '6', '5', 'BV', '(', 'WV', ')-', 'SS', '▁E', '1', '6', '8', 'BV', '(', 'WV', ')-', 'SSE', '2', '0', '5', 'BV', '-', 'SM', 'QC', 'CE', '3', '2', '5', 'BV', '-', 'SR', '▁E', '3', '2', '8', 'BV', '(', 'WV', ')-', 'SR', 'X', '3', '2', '5', 'BV', '-', 'F', 'SR', '▁X', '4', '0', '5', 'BV', '-', 'F', 'SR', 'X', '5', '0', '5', 'BV', '-', 'F', 'SR', 'U', '5', '0', '5', 'CV', '-', 'UM', 'R', '▁U', '5', '0', '8', 'CV', '-', 'UM', 'KR', 'U', '5', '5', '0', 'CV', '-', 'UM', '0', '8', 'RU', '6', '5', '0', 'CV', '-', 'UM', 'RU', '7', '5', '0', 'CV', '-', 'UM']


In [6]:
tokenizer_bge = AutoTokenizer.from_pretrained('BAAI/bge-m3')
bge_tokens = tokenizer_bge.encode(text_1)

bge_token_text = tokenizer_bge.convert_ids_to_tokens(bge_tokens)
print(bge_token_text)
print(text_2.lower())
print(text_1.lower())


['<s>', '▁Original', '▁S', 'cept', 're', '▁81', '420', '26', '670', '003', 'C', '▁TV', '▁Remote', '▁Control', '▁E', '165', 'BV', '(', 'W', 'V', ')', '-', 'SS', '▁E', '168', 'BV', '(', 'W', 'V', ')', '-', 'SSE', '205', 'BV', '-', 'SM', 'Q', 'C', 'CE', '3', '25', 'BV', '-', 'SR', '▁E', '3', '28', 'BV', '(', 'W', 'V', ')', '-', 'SR', 'X', '3', '25', 'BV', '-', 'F', 'SR', '▁X', '40', '5', 'BV', '-', 'F', 'SR', 'X', '50', '5', 'BV', '-', 'FS', 'RU', '50', '5', 'CV', '-', 'UM', 'R', '▁U', '50', '8', 'CV', '-', 'UM', 'K', 'RU', '550', 'CV', '-', 'UM', '08', 'RU', '650', 'CV', '-', 'UM', 'RU', '750', 'CV', '-', 'UM', '</s>']
metene pulse oximeter fingertip with batteries and lanyard, oled blood oxygen saturation monitor, 20e
original sceptre 8142026670003c tv remote control e165bv(wv)-ss e168bv(wv)-sse205bv-smqcce325bv-sr e328bv(wv)-srx325bv-fsr x405bv-fsrx505bv-fsru505cv-umr u508cv-umkru550cv-um08ru650cv-umru750cv-um


In [7]:
import regex as re

# pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

pat_str = "|".join(
    [
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    ]
)

pat = re.compile(pat_str)

re_pattern = re.findall(pat, text_1)
text_1_processed = ' '.join(re_pattern)
bge_tokens = tokenizer_bge.encode(text_1_processed)

bge_token_text = tokenizer_bge.convert_ids_to_tokens(bge_tokens)
print(text_1_processed)
print(bge_token_text)
print(len(bge_tokens))


Original  Sceptre   814 202 667 000 3 C  TV  Remote  Control  E 165 BV (WV )- SS  E 168 BV (WV )- SSE 205 BV -SMQCCE 325 BV -SR  E 328 BV (WV )- SRX 325 BV -FSR  X 405 BV -FSRX 505 BV -FSRU 505 CV -UMR  U 508 CV -UMKRU 550 CV -UM 08 RU 650 CV -UMRU 750 CV -UM
['<s>', '▁Original', '▁S', 'cept', 're', '▁8', '14', '▁202', '▁', '667', '▁000', '▁3', '▁C', '▁TV', '▁Remote', '▁Control', '▁E', '▁165', '▁', 'BV', '▁(', 'W', 'V', '▁)', '-', '▁SS', '▁E', '▁168', '▁', 'BV', '▁(', 'W', 'V', '▁)', '-', '▁', 'SSE', '▁205', '▁', 'BV', '▁-', 'SM', 'Q', 'C', 'CE', '▁325', '▁', 'BV', '▁-', 'SR', '▁E', '▁3', '28', '▁', 'BV', '▁(', 'W', 'V', '▁)', '-', '▁SR', 'X', '▁325', '▁', 'BV', '▁-', 'F', 'SR', '▁X', '▁40', '5', '▁', 'BV', '▁-', 'F', 'SR', 'X', '▁50', '5', '▁', 'BV', '▁-', 'FS', 'RU', '▁50', '5', '▁CV', '▁-', 'UM', 'R', '▁U', '▁50', '8', '▁CV', '▁-', 'UM', 'K', 'RU', '▁550', '▁CV', '▁-', 'UM', '▁08', '▁RU', '▁650', '▁CV', '▁-', 'UM', 'RU', '▁750', '▁CV', '▁-', 'UM', '</s>']
114


In [23]:
from transformers import GPT2Tokenizer

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

gpt2_tokens = tokenizer_gpt2(text_1)

print(gpt2_tokens)
gpt2_token_text = [tokenizer_gpt2._convert_id_to_token(x) for x in gpt2_tokens.input_ids]
print(gpt2_token_text)


{'input_ids': [20556, 311, 984, 260, 807, 1415, 1238, 2075, 3134, 830, 18, 34, 3195, 21520, 6779, 412, 20986, 33, 53, 7, 54, 53, 13219, 5432, 412, 14656, 33, 53, 7, 54, 53, 13219, 50, 5188, 21261, 33, 53, 12, 12310, 48, 4093, 36, 26582, 33, 53, 12, 12562, 412, 34256, 33, 53, 7, 54, 53, 13219, 12562, 55, 26582, 33, 53, 12, 10652, 49, 1395, 26598, 33, 53, 12, 10652, 49, 55, 31654, 33, 53, 12, 10652, 49, 52, 31654, 33538, 12, 5883, 49, 471, 33042, 33538, 12, 5883, 30758, 52, 22730, 33538, 12, 5883, 2919, 49, 52, 17544, 33538, 12, 5883, 49, 52, 15426, 33538, 12, 5883], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['Original', 'ĠS', 'cept', 're', 'Ġ8', '14', '20', '26', '67', '000', '3', 'C', 'ĠTV', 

In [4]:
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

token_ids = enc.encode(text_2)

print(text_1)

print(len(token_ids))
print([enc.decode([x]) for x in token_ids])

# print(enc.decode(enc.encode(text_1)))


Original Sceptre 8142026670003C TV Remote Control E165BV(WV)-SS E168BV(WV)-SSE205BV-SMQCCE325BV-SR E328BV(WV)-SRX325BV-FSR X405BV-FSRX505BV-FSRU505CV-UMR U508CV-UMKRU550CV-UM08RU650CV-UMRU750CV-UM
27
['Met', 'ene', ' Pulse', ' Ox', 'imeter', ' F', 'ing', 'ert', 'ip', ' with', ' Batter', 'ies', ' and', ' L', 'any', 'ard', ',', ' OLED', ' Blood', ' Oxygen', ' Sat', 'uration', ' Monitor', ',', ' ', '20', 'E']
