In [9]:
import sentencepiece as spm

In [3]:
# Load the pre-trained tokenizer that the following cells query.
sp = spm.SentencePieceProcessor(model_file="models_sp/test_model.model")

In [4]:
# Encode a single sentence; with default arguments the result is a flat list of piece ids.
sp.encode("hello world this is salim")

[39, 88, 21, 887, 65, 47, 51, 73, 437]

In [5]:
# A list of sentences is encoded as a batch: one list of ids per input sentence.
sp.encode(["hello this is salim", "sp is cool"])

[[39, 88, 21, 65, 47, 51, 73, 437], [273, 47, 134, 21, 31]]

In [6]:
# out_type=int asks for ids explicitly; for this input the result equals the call without out_type.
sp.encode(["hello this is salim", "sp is cool"], out_type=int)

[[39, 88, 21, 65, 47, 51, 73, 437], [273, 47, 134, 21, 31]]

In [7]:
# out_type=str returns the subword pieces as strings instead of ids;
# in the recorded output, pieces that start a word carry a leading '▁'.
sp.encode(["hello this is salim", "sp is cool"], out_type=str)

[['▁he', 'll', 'o', '▁this', '▁is', '▁s', 'al', 'im'],
 ['▁sp', '▁is', '▁co', 'o', 'l']]

In [9]:
# encode_as_pieces yields the same pieces as encode(..., out_type=str) for this input.
sp.encode_as_pieces(["hello this is salim", "sp is cool"])

[['▁he', 'll', 'o', '▁this', '▁is', '▁s', 'al', 'im'],
 ['▁sp', '▁is', '▁co', 'o', 'l']]

In [10]:
# Decode a list of ids back to text; these are the ids produced for "hello this is salim".
sp.decode([39, 88, 21, 65, 47, 51, 73, 437])

'hello this is salim'

In [11]:
# decode also accepts a batch of piece strings and returns one decoded string per inner list.
sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])

['This is a test', 'Hello world']

In [12]:
# Size of the loaded model's piece vocabulary (1000 in the recorded run).
sp.get_piece_size()

1000

In [16]:
# Look up the piece string for a single id, using the snake_case spelling
# that the batch lookup in this notebook already uses.
sp.id_to_piece(5)

','

In [20]:
# Batch lookup: a list of ids returns a list of pieces. In this model ids 0-2 are
# the <unk>, <s> and </s> control pieces.
sp.id_to_piece([0, 1, 2, 3, 4])

['<unk>', '<s>', '</s>', '\r', '▁']

In [19]:
# Reverse lookup (piece -> id), spelled in snake_case to match the other
# lookup calls in this notebook.
sp.piece_to_id('▁')

4

UsageError: Line magic function `%mk` not found.


### Training

In [22]:
# Train a new tokenizer on the botchan corpus.
# model_prefix="m" names the output files (later cells load "m.model");
# "foo" and "bar" are registered as user-defined symbols.
spm.SentencePieceTrainer.train(
    input="models_sp/botchan.txt",
    model_prefix="m",
    vocab_size=1000,
    user_defined_symbols=["foo", "bar"],
)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: models_sp/botchan.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: foo
  user_defined_symbols: bar
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_di

In [23]:
# Load the freshly trained tokenizer written by the training cell.
model = spm.SentencePieceProcessor(model_file="m.model")

In [24]:
# "foo" was passed as a user-defined symbol at training time; here it encodes to two ids.
model.encode("foo")

[8, 3]

In [26]:
# encode_as_ids returns the same ids as encode with its default output type.
model.encode_as_ids("foo")

[8, 3]

In [27]:
# As pieces: a standalone '▁' followed by the user-defined symbol 'foo' kept whole.
model.encode("foo", out_type=str)

['▁', 'foo']

In [28]:
# Both user-defined symbols come out as whole pieces, each preceded by its own '▁'.
model.encode("foo bar", out_type=str)

['▁', 'foo', '▁', 'bar']

In [29]:
# Mixed input: the user-defined symbols stay whole wherever they occur,
# while the other words are split into learned subword pieces.
model.encode("foo bar hello world bar", out_type=str)

['▁', 'foo', '▁', 'bar', '▁he', 'll', 'o', '▁world', '▁', 'bar']

## Add new tokens

In [2]:
from sentencepiece import sentencepiece_model_pb2 as model

In [3]:
# Load the trained model file into an editable ModelProto message.
m = model.ModelProto()
# Read the bytes inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("m.model", "rb") as model_file:
    serialized_model = model_file.read()
# Kept as the cell's last expression so its return value is still displayed.
m.ParseFromString(serialized_model)


253326

In [4]:
# Bracketed marker tokens to append to the trained vocabulary, in insertion order.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
    "[EOS]",
    "[DOMAIN]",
    "[SLOT]",
    "[ACTION]",
]

In [5]:
# Append every special token to the loaded proto as a new piece with score 0.
# Pieces already present are skipped so that re-running this cell does not
# append a second copy of each token.
# NOTE(review): a model with duplicate pieces is expected to be rejected or
# mis-indexed when loaded — confirm against the SentencePiece loader.
existing_pieces = {piece.piece for piece in m.pieces}
for token in special_tokens:
    if token in existing_pieces:
        continue
    # The nested message class is reachable from ModelProto itself; no need to
    # build a throwaway ModelProto instance per token.
    new_token = model.ModelProto.SentencePiece()
    new_token.piece = token
    new_token.score = 0
    m.pieces.append(new_token)
    existing_pieces.add(token)

In [6]:
# Serialize before opening the output: opening with 'wb' truncates the file,
# so a serialization failure would otherwise leave an empty new.model behind.
serialized_proto = m.SerializeToString()
with open('new.model', 'wb') as f:
    f.write(serialized_proto)

In [7]:
# NOTE: at this point `model` is the `sentencepiece_model_pb2` module (imported
# under that alias in this section), not a SentencePieceProcessor, so this call
# raises AttributeError. A processor must be loaded from "new.model" before encoding.
model.encode("[SEP] foo bar hello")

AttributeError: module 'sentencepiece.sentencepiece_model_pb2' has no attribute 'encode'

In [10]:
# Load the extended model; this rebinds `model` from the protobuf module alias
# to a SentencePieceProcessor.
model = spm.SentencePieceProcessor(model_file="new.model")

In [11]:
# With the extended model, "[SEP]" maps to a single id (1003 in the recorded output).
model.encode("[SEP] foo bar hello")

[8, 1003, 8, 3, 8, 4, 41, 86, 21]

In [12]:
# Same input as pieces: '[SEP]' is emitted as one whole piece by the extended model.
model.encode("[SEP] foo bar hello", out_type=str)

['▁', '[SEP]', '▁', 'foo', '▁', 'bar', '▁he', 'll', 'o']

In [13]:
# Reload the original (unextended) model to compare against the extended one.
model = spm.SentencePieceProcessor(model_file="m.model")

In [14]:
# Baseline: the original model has no '[SEP]' piece, so the marker is broken into smaller pieces.
model.encode("[SEP] foo bar hello", out_type=str)

['▁[', 'S', 'E', 'P', ']', '▁', 'foo', '▁', 'bar', '▁he', 'll', 'o']