In [2]:
#!pip install sentencepiece

Collecting sentencepiece
  Downloading https://files.pythonhosted.org/packages/5f/03/6cd0c8340ebcecf45f12540a852aede273263f0c757a4a8cea4042fbf715/sentencepiece-0.1.92-cp37-cp37m-win_amd64.whl (1.2MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.92


In [1]:
import os
import sentencepiece as spm

In [2]:
# Location of the training corpus. The default is the original author's local
# directory; set the NLP_DATA_DIR environment variable to point somewhere else
# without editing the notebook.
DATA_DIR = os.environ.get(
    'NLP_DATA_DIR',
    'E:/Sachin/Learning/AI_Learning/7.NLP/100DaysNLP/100-Days-of-NLP-master/100-Days-of-NLP-master/data',
)
DATAFILE = f'{DATA_DIR}/pg16457.txt'
# Directory (relative to the working directory) that holds the trained model files.
MODELDIR = 'models'


In [4]:
# Train a 500-piece unigram model on DATAFILE. The model files are written under
# the `uni` prefix inside MODELDIR (the following cell loads `uni.model`).
# Create the output directory first so training does not fail on a fresh checkout.
os.makedirs(MODELDIR, exist_ok=True)
# Keyword arguments keep each option separate instead of packing everything into
# one whitespace-delimited flag string.
spm.SentencePieceTrainer.train(
    input=DATAFILE,
    model_prefix=f'{MODELDIR}/uni',
    model_type='unigram',
    vocab_size=500,
)

In [5]:
# Build a processor and load the unigram model trained above.
# The load call is the cell's final expression, so its result is displayed.
model_path = os.path.join(MODELDIR, 'uni.model')
sp = spm.SentencePieceProcessor()
sp.load(model_path)

True

In [6]:
# Sample sentence used by the encode/decode cells below.
input_string = 'This is a test'

In [8]:
# Encode: text => subword pieces and text => ids.
# A space is represented by the "▁" (U+2581) marker; by default one is also
# added at the start of the input sentence, which is why the first piece
# below begins with it.
pieces = sp.encode_as_pieces(input_string)
piece_ids = sp.encode_as_ids(input_string)
print(pieces)       # ['▁This', '▁is', '▁a', '▁t', 'est']
print(piece_ids)    # [371, 77, 13, 101, 181]

['▁This', '▁is', '▁a', '▁t', 'est']
[371, 77, 13, 101, 181]


In [9]:
# Decode: pieces => text and ids => text (both give back the same sentence).
example_pieces = ['▁This', '▁is', '▁a', '▁t', 'est']
example_ids = [371, 77, 13, 101, 181]
print(sp.decode_pieces(example_pieces))    # This is a test
print(sp.decode_ids(example_ids))          # This is a test

This is a test
This is a test


In [10]:
# Report how many pieces the loaded model's vocabulary contains.
vocab_size = sp.get_piece_size()
print(f"vocab size: {vocab_size}")

vocab size: 500


In [11]:
# Convert in both directions: id -> piece, then piece -> id.
piece_for_id = sp.id_to_piece(371)
print(f"id 371 to piece: {piece_for_id}")
id_for_piece = sp.piece_to_id('▁This')
print(f"Piece ▁This to id: {id_for_piece}")

id 371 to piece: ▁This
Piece ▁This to id: 371


### Summary
- The round trip shown above matters because SentencePiece makes subword tokenization reversible.
- You can encode a sentence either as IDs or as subword tokens; which one you use is up to you.
- The key is that either the IDs or the tokens can be decoded back into the original sentence, including the original spaces.
- Previously this was not possible with other tokenizers, since they only provided the tokens and it was not clear exactly what encoding scheme was used (e.g. how did they deal with spaces or punctuation?).
- This reversibility is a big selling point for SentencePiece.

In [12]:

# Detokenize by hand: turn every "▁" marker into a space, glue the pieces
# together, and trim the leading space. The result must equal the input.
tokens = ['▁This', '▁is', '▁a', '▁t', 'est']
merged = "".join(piece.replace('▁', " ") for piece in tokens).strip()
assert merged == input_string, "Input string and detokenized sentence didn't match"

In [13]:
# Display the sentence rebuilt by the manual detokenization in the previous cell.
merged

'This is a test'

In [24]:
# Tokenize a new sentence; as the output below shows, the result is a list of ids.
sp.tokenize('This is demo')

[371, 77, 94, 21, 9]

In [25]:
# Tokenize a personal name (presumably absent from the training text — not verified);
# it still comes back as a list of ids.
sp.tokenize('Sachin Gupta')

[138, 11, 110, 39, 323, 272, 8, 11]

In [31]:
# An unseen word does not raise an out-of-vocabulary error, as it can with some
# other tokenizers: it is split into smaller pieces that are in the vocabulary.
# The ids come straight from the tokenizer rather than being pasted from an
# earlier output, so this cell stays correct if the model is retrained.
for piece_id in sp.tokenize('Sachin Gupta'):
    print(sp.decode_ids([piece_id]))

S
a
ch
in
G
up
t
a


In [34]:
# Tokenize a longer phrase and print the text of each id on its own line
# to show how the words are broken into subword pieces.
team_ids = sp.tokenize('Banctec Datascience Team Rocks')
for piece_id in team_ids:
    print(sp.decode_ids([piece_id]))

B
an
c
t
ec
D
at
as
ci
ence
T
e
a
m

R
o
ck
s


In [None]:
T