# Augment tokenizer

In [5]:
from copy import deepcopy
from sentencepiece import sentencepiece_model_pb2 as model
from sentencepiece import SentencePieceProcessor

def augment_tokenizer(in_tok_path, fun_names, out_tok_path):
    """Add user-defined symbols to a SentencePiece model and save the result.

    The model stored at ``in_tok_path`` is parsed into a ``ModelProto``, one
    new piece is inserted for every entry of ``fun_names`` (in order, starting
    at index 3) and the modified proto is serialized to ``out_tok_path``.
    Progress is reported with ``print``.

    Args:
        in_tok_path: path of the existing SentencePiece model file.
        fun_names: iterable of piece strings to add, e.g. ``'<digit>'``.
        out_tok_path: path the augmented model file is written to.
    """
    user_defined_score = 0.0  # default score for USER_DEFINED
    user_defined_type = 4  # type value for USER_DEFINED
    first_new_index = 3  # position after default control symbols ("<unk>", "<s>", "</s>")

    mp = model.ModelProto()
    # Read through a context manager so the input handle is closed right away.
    with open(in_tok_path, 'rb') as f:
        mp.ParseFromString(f.read())

    print(f'Original model pieces: {len(mp.pieces)}')

    for offset, sym in enumerate(fun_names):
        new_sym = mp.SentencePiece()
        new_sym.piece = sym
        new_sym.score = user_defined_score
        new_sym.type = user_defined_type
        # NOTE(review): inserting near the front moves every later piece to a
        # higher index; presumably that changes their token ids as well, so
        # confirm that anything trained on the original ids accounts for it.
        mp.pieces.insert(first_new_index + offset, new_sym)
        print(f'\tadded {new_sym.piece} ...')

    print(f'New model pieces: {len(mp.pieces)}')

    # Serialize before opening the output, so a serialization error cannot
    # leave a truncated file behind.
    serialized = mp.SerializeToString()
    with open(out_tok_path, 'wb') as f:
        f.write(serialized)
    
# Source tokenizer model and destination of the augmented copy.
in_tok_path = '/home/karypisg/romer333/projects/LLM-tools/models/llama_checkpoints/tokenizer.model'
out_tok_path = '/home/karypisg/romer333/projects/LLM-tools/ToolkenGPT/augmented_llama_tokenizer.model'

# Symbols to add to the tokenizer as new pieces.
fun_names = [
    '<my-cool-function>',
    '<endtitle>',
    '<name>',
    '<url>',
    '<digit>',
    '<email>',
    '<loc>',
    '<greeting>',
    '<salutation>',
]

augment_tokenizer(in_tok_path, fun_names, out_tok_path)

# Test: reload the written model and tokenize sentences that contain new symbols.
sp_model = SentencePieceProcessor(model_file=out_tok_path)
for test_sentence in (
    "This is a test calling <digit>(lknknx) see what happens/",
    "This is a test calling <my-cool-function>(lknknx) see what happens/",
):
    print(sp_model.encode_as_pieces(test_sentence))

Original model pieces: 32000
	added <my-cool-function> ...
	added <endtitle> ...
	added <name> ...
	added <url> ...
	added <digit> ...
	added <email> ...
	added <loc> ...
	added <greeting> ...
	added <salutation> ...
New model pieces: 32009
['▁This', '▁is', '▁a', '▁test', '▁calling', '▁', '<digit>', '(', 'l', 'kn', 'kn', 'x', ')', '▁see', '▁what', '▁happens', '/']
['▁This', '▁is', '▁a', '▁test', '▁calling', '▁', '<my-cool-function>', '(', 'l', 'kn', 'kn', 'x', ')', '▁see', '▁what', '▁happens', '/']


# Reformat for cross-data

In [9]:
# Reformat the KAMEL data into the same record format as FuncQA, leaving the `func` field empty
import json
# Input files: the KAMEL sample to convert and a FuncQA file used as the format reference.
kamel_filepath = './data/kamel/test_first_20.json'
funcqa_filepath = './data/funcqa/funcqa_oh.json'

# Parse each JSON file into Python objects.
with open(kamel_filepath, 'rb') as kamel_fp:
    kamel_data = json.load(kamel_fp)

with open(funcqa_filepath, 'rb') as funcqa_fp:
    funcqa_data = json.load(funcqa_fp)

In [11]:
funcqa_data[0]

{'question': 'A car depreciates by 11% each year. In 8 years, what is the value of the car of its original price? (in decimal form)',
 'answer': 0.3936,
 'func': '<power>(0.89, 8)=0.3936'}

In [12]:
kamel_data[0]

{'question': 'How many floors above the ground has Claridge Icon?',
 'answer': [{'rdf': None, 'alternative': ['45'], 'chosen': '45'}],
 'api': 'P1101'}

In [13]:
# Convert each KAMEL record to the FuncQA layout: a scalar `answer`, an empty
# `func` string and no `api` key. The records are rewritten in place, so the
# guards below keep this cell safe to run more than once.
for record in kamel_data:
    answer = record['answer']
    if isinstance(answer, list):
        # Not converted yet: keep only the chosen value of the first entry.
        record['answer'] = answer[0]['chosen']
    record['func'] = ''
    record.pop('api', None)  # tolerate records whose `api` key is already gone

In [16]:
# Save the reformatted KAMEL records as JSON.
with open('./data/kamel/reformat_test_first_20.json', 'w') as out_fp:
    json.dump(kamel_data, out_fp)

-----------

-----------

-----------

In [None]:



import sentencepiece as spm
# vocab_file = 'sentence.bpe.model'
vocab_file = '/home/karypisg/romer333/projects/LLM-tools/models/llama_checkpoints/tokenizer.model'
test_string = '▁de'
new_vocab = ['<s>', '▁de', '-']

# Load the unmodified tokenizer.
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

# Ids of the probe string before set_vocabulary is called.
print('encoded:', sp.encode_as_ids(test_string))

# Snapshot every piece, call set_vocabulary, then snapshot again for comparison.
original_vocab = list(map(sp.IdToPiece, range(sp.GetPieceSize())))
sp.set_vocabulary(new_vocab)
v2 = list(map(sp.IdToPiece, range(sp.GetPieceSize())))

# with open(model_path, 'rb') as fp:
#     data = fp.read()
    
# data

encoded: [29871, 316]


In [6]:
# my_new_model_path = "./my_extended_tokenizer.model"

# sp_model.ResetVocabulary
# sp_model.SetVocabulary([sp.IdToPiece(id) for id in range(0, sp.GetPieceSize())])

['▁This',
 '▁is',
 '▁a',
 '▁test',
 '▁calling',
 '▁',
 '<my-cool-function>',
 '(',
 'l',
 'kn',
 'kn',
 'x',
 ')',
 '▁see',
 '▁what',
 '▁happens',
 '/']

In [12]:
# Show the piece counts captured before and after set_vocabulary and re-encode the probe string.
# The third tuple element is print's return value (None); the encoding itself goes to stdout.
len(original_vocab), len(v2), print('encoded:', sp.encode_as_ids(test_string)) 

encoded: [29871, 316]


(32000, 32000, None)

In [13]:
from sentencepiece import sentencepiece_model_pb2 as model
# Parse the tokenizer file into a ModelProto so individual pieces can be inspected.
m = model.ModelProto()
with open(vocab_file, 'rb') as model_fp:  # close the handle instead of leaking it
    model_bytes = model_fp.read()
# Kept as the last top-level expression so the notebook still displays its return value.
m.ParseFromString(model_bytes)


499723

In [15]:
m.pieces[0]

piece: "<unk>"
score: 0.0
type: UNKNOWN

In [26]:
?? m.pieces.insert

[0;31mSignature:[0m  [0mm[0m[0;34m.[0m[0mpieces[0m[0;34m.[0m[0minsert[0m[0;34m([0m[0mkey[0m[0;34m:[0m [0mint[0m[0;34m,[0m [0mvalue[0m[0;34m:[0m [0;34m~[0m[0m_T[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
  [0;32mdef[0m [0minsert[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mkey[0m[0;34m:[0m [0mint[0m[0;34m,[0m [0mvalue[0m[0;34m:[0m [0m_T[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Inserts the item at the specified position by copying."""[0m[0;34m[0m
[0;34m[0m    [0mnew_element[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_message_descriptor[0m[0;34m.[0m[0m_concrete_class[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mnew_element[0m[0;34m.[0m[0m_SetListener[0m[0;34m([0m[0mself[0m[0;34m.[0m[0m_message_listener[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mnew_element[0m[0;34m.[0m[0mCopyFrom[0m[0;34m([0m[0mva

In [138]:
from copy import deepcopy
from sentencepiece import sentencepiece_model_pb2 as model
from sentencepiece import SentencePieceProcessor

def new_piece_by_deepcopy(original_piece, token: str, score: float, piece_type=None):
    '''
    Build a new piece by copying an existing one and overriding its fields.

    Args:
        original_piece:(SentencePiece) the target of deepcopy; it is left unmodified
        token:(str) piece text of the new entry
        score:(float) priority of encoding to this token (see spm.vocab).
        piece_type:(int, optional) 1:normal, 2:<unk>, 3:control, 4:user defined, 5:unused.
            When omitted (None), the type copied from original_piece is kept.

    Return:
        a SentencePiece with given piece, score and piece_type
    '''
    new_p = deepcopy(original_piece)  # not a good way, but it does work.
    new_p.piece = token
    new_p.score = score
    # Only override the type on request, so existing three-argument calls behave as before.
    if piece_type is not None:
        new_p.type = piece_type
    return new_p

# m.pieces.insert(0, new_piece_by_deepcopy(m.pieces[0],"<my_cool_tool>",0,m.pieces[0].type))

vocab_file = '/home/karypisg/romer333/projects/LLM-tools/models/llama_checkpoints/tokenizer.model'
# Single definition of the output path, used for both writing and reloading.
my_new_model_path = "./my_extended_tokenizer.model"

m = model.ModelProto()
# Read through a context manager so the input handle is closed right away.
with open(vocab_file, 'rb') as model_fp:
    m.ParseFromString(model_fp.read())

print("Original vocab size:",len(m.pieces))

# Use the piece at index 1800 as the template to copy, then put the new token at index 0.
new_piece = new_piece_by_deepcopy(m.pieces[1800],"<my_cool_tool>",1e50)
m.pieces.insert(0, new_piece)

print("Extended vocab size:",len(m.pieces))

with open(my_new_model_path,"wb") as f:
    f.write(m.SerializeToString())

sp_model = SentencePieceProcessor(model_file=my_new_model_path)
# NOTE(review): `sp` is not created in this cell; it must already exist in the
# kernel (another cell builds a processor under that name) -- confirm that the
# pieces of that processor, not of `sp_model`, are the intended vocabulary.
sp_model.SetVocabulary([sp.IdToPiece(piece_id) for piece_id in range(sp.GetPieceSize())])


Original vocab size: 32000
Extended vocab size: 32001


True

In [139]:
m.pieces[20]

piece: "<0x10>"
score: 0.0
type: BYTE

In [140]:
sp_model.IdToPiece(3200)

']{'

In [141]:
sp_model.encode_as_pieces(sp_model.IdToPiece(0))

['▁<', 'my', '_', 'co', 'ol', '_', 'tool', '>']

In [60]:
len(m.pieces)

32001

In [61]:
m.pieces[32000]

piece: "<my_cool_tool>"
score: 0.0

In [62]:
# Serialize the ModelProto `m` and store it as the extended tokenizer model file.
with open("./my_extended_tokenizer.model", "wb") as model_out:
    model_out.write(m.SerializeToString())

In [63]:
from sentencepiece import SentencePieceProcessor
my_new_model_path = "./my_extended_tokenizer.model"
sp_model = SentencePieceProcessor(model_file=my_new_model_path)

In [64]:
len(sp_model)

32001

In [68]:
sp_model.IdToPiece(32000)

'<my_cool_tool>'

In [21]:
import sentencepiece as spm

In [24]:
m.SentencePiece

AttributeError: module 'sentencepiece._sentencepiece' has no attribute 'SentencePiece'

In [8]:
from sentencepiece import sentencepiece_model_pb2

m = sentencepiece_model_pb2.ModelProto()
m.ParseFromString(data)
# model = sentencepiece_model_pb2(model_file=model_path)

499723

In [17]:
type(m.pieces[0])

sentencepiece_model_pb2.SentencePiece

In [18]:
sentencepiece_model_pb2.SentencePiece

AttributeError: module 'sentencepiece.sentencepiece_model_pb2' has no attribute 'SentencePiece'

In [8]:
model.get_piec

<bound method SentencePieceProcessor.GetPieceSize of <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f51c809f4b0> >>

In [3]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install sentencepiece

