feat: get vocab from tokenizer is better. #31

Open · wants to merge 2 commits into base: master

49 changes: 29 additions & 20 deletions models/convert-to-ggml.py
@@ -1,9 +1,9 @@
import sys
import struct
import json
import torch
import numpy as np
import struct
import sys

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

if len(sys.argv) < 3:
@@ -22,8 +22,6 @@
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)

with open(dir_model + "/vocab.txt", "r", encoding="utf-8") as f:
Author comment: vocab.txt may not exist.
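(For context, a minimal sketch of the failure this change avoids; the model id below is only an illustrative placeholder. Repos that ship only a tokenizer.json or a sentencepiece model often contain no vocab.txt, so the removed open() call would raise FileNotFoundError, while the tokenizer can still rebuild its vocab:)

from transformers import AutoTokenizer

# Illustrative model directory / hub id; it may contain no vocab.txt at all.
dir_model = "some-model-without-vocab-txt"

tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab = tokenizer.get_vocab()  # dict: token string -> token id
print(len(vocab), "tokens recovered without reading vocab.txt")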

vocab = f.readlines()
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -42,9 +40,9 @@

tokenizer = AutoTokenizer.from_pretrained(dir_model)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
print (model)
print(model)

print(tokenizer.encode('I believe the meaning of life is'))
print(tokenizer.encode("I believe the meaning of life is"))

list_vars = model.state_dict()
for name in list_vars.keys():
@@ -54,7 +52,7 @@

print(hparams)

fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
@@ -63,35 +61,46 @@
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", ftype))

for i in range(hparams["vocab_size"]):
text = vocab[i][:-1] # strips newline at the end
#print(f"{i}:{text}")
data = bytes(text, 'utf-8')
vocab_list = []

# print(tokenizer.get_vocab())
vocab = tokenizer.get_vocab()
Author comment: The tokenizer already provides a good way to get the vocab.
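(A tiny self-contained illustration of what get_vocab() returns and why the id-keyed inversion below is needed; the token strings here are made up:)

# get_vocab() maps token string -> token id.
vocab = {"[PAD]": 0, "[UNK]": 1, "hello": 2}

# Invert to id -> token so entries can be written out in id order.
reversed_vocab = {idx: tok for tok, idx in vocab.items()}

assert [reversed_vocab[i] for i in range(len(vocab))] == ["[PAD]", "[UNK]", "hello"]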

if not isinstance(vocab, dict):
raise TypeError

# id:key
reversed_vocab = {idx: key for key, idx in vocab.items()}

# use vocab_size to confirm size
for idx in range(hparams["vocab_size"]):
text = reversed_vocab[idx]
# print(f"{i}:{text}")
data = bytes(text, "utf-8")
fout.write(struct.pack("i", len(data)))
fout.write(data)

for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()
if name in ['embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias']:
if name in ["embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"]:
continue
print("Processing variable: " + name + " with shape: ", data.shape)

n_dims = len(data.shape);
n_dims = len(data.shape)

# ftype == 0 -> float32, ftype == 1 -> float16
if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
l_type = 1
print(" Converting to float16")
data = data.astype(np.float16)
l_type = 1
else:
l_type = 0

# header
str = name.encode('utf-8')
str = name.encode("utf-8")
fout.write(struct.pack("iii", n_dims, len(str), l_type))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);
fout.write(str)

# data
data.tofile(fout)
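Each vocab entry above is written as a 4-byte int length followed by the UTF-8 bytes of the token. A hedged sketch of the symmetric read, assuming the file handle is already positioned at the start of the vocab section (this helper is illustrative, not part of the PR):

import struct

def read_vocab(fin, vocab_size):
    # Read back entries written as <int32 length><utf-8 bytes>.
    tokens = []
    for _ in range(vocab_size):
        (n,) = struct.unpack("i", fin.read(4))
        tokens.append(fin.read(n).decode("utf-8", errors="replace"))
    return tokens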