# Modify the tokenizer

In [1]:
from pathlib import Path
# Root directory of the project; every input and output path below is derived from it.
basedir = Path("./")
# Instruction-tuning sample that is loaded in the "Load data" section.
data_path = basedir.joinpath("inkuba_instruct_sample.json")

In [27]:
import os
from pathlib import Path
from collections import Counter
import json

import pandas as pd
from transformers import LlamaTokenizer, LlamaTokenizerFast, PreTrainedTokenizerFast
from tokenizers import SentencePieceBPETokenizer
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_model

## A note on reproducibility

The tokenizer is modified by removing tokens that rarely appear in the languages we are working with. The tokenizer used in model training was built by removing tokens that appeared fewer than 100 times in an earlier version of the training dataset. Applying the same methodology to a different dataset will produce a different tokenizer. In particular, the initial training dataset was smaller, mainly because it included fewer machine translation examples, so a cut-off of 100 occurrences on the bigger dataset produces a bigger vocabulary. This notebook is set up to produce a tokenizer with the same vocabulary size as the one used in model training, although the vocabulary itself will not be exactly the same. To produce the exact same tokenizer, set `reproducibility = True` to load the vocab derived from the initial dataset.

In [2]:
# When True, the vocabulary is loaded from `keep_tokens.txt` instead of being
# derived from token frequencies in the current dataset.
reproducibility = True

if reproducibility:
    # Tokens may contain non-ASCII characters (e.g. the "▁" word-boundary marker),
    # so read the file as UTF-8 rather than relying on the platform default.
    with open(basedir/"keep_tokens.txt", encoding="utf-8") as f:
        # One token per line; a set gives fast membership tests later on.
        keep_tokens = set(f.read().splitlines())

## Load data

In [3]:
import json

# The dataset holds text in several languages; decode it as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)
print("Number of entries:", len(data))
# Show the first record as an example of the schema.
data[0]

Number of entries: 350815


{'task': 'sentiment',
 'instruction_orig': 'Changanua mawazo ya matini yanayofuata na uainishe matini hayo katika mojawapo ya lebo zifuatazo. Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.',
 'input': 'habari tunaomba radhi kwa usumbufu unaojitokeza tafadhali tuandikie mita namba yako kwenye dm kwa msaada zaidi pj',
 'output': 'Wastani',
 'language': 'swahili',
 'instruction': 'Changanua mawazo ya matini yanayofuata na uainishe matini hayo katika mojawapo ya lebo zifuatazo. Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.'}

In [4]:
import os
import pandas as pd

# Append the competition CSV files to `data` using the same record layout as the
# JSON sample ("task", "instruction", "input", "output").
comp_data_dir = basedir/"zindi-inkuba-notebook/Notebooks/data"
# `os.listdir` returns entries in arbitrary order; sort so runs are deterministic.
comp_data_fns = sorted(f for f in os.listdir(comp_data_dir) if f.endswith(".csv"))
for fn in comp_data_fns:
    df = pd.read_csv(comp_data_dir/fn)[["instruction", "inputs", "targets"]]
    # Only files whose name starts with "MTTrain" contribute their targets;
    # for every other file the output is left empty.
    keep_targets = fn.startswith("MTTrain")
    # Iterate the columns in lockstep instead of the much slower `iterrows`.
    data.extend(
        {
            "task": "", "instruction": instruction, "input": inputs,
            "output": targets if keep_targets else ""
        }
        for instruction, inputs, targets in zip(df["instruction"], df["inputs"], df["targets"])
    )

In [5]:
# Prompt template with three positional `{}` slots, filled in order with the
# instruction, the input and the response. Sections are separated by blank lines.
BASE_PROMPT = "\n\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "### Instruction:\n{}",
    "### Input:\n{}",
    "### Response:\n{}",
])
# BASE_PROMPT = "Instruction: {} Input: {}, Response: {}"

In [6]:
# Render every record into its final prompt string. `data` is rebound from a list
# of dicts to a list of strings here, so entries that are already strings are
# passed through unchanged; this keeps the cell safe to re-run.
data = [
    d if isinstance(d, str) else BASE_PROMPT.format(d["instruction"], d["input"], d["output"])
    for d in data
]
print(data[0])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Changanua mawazo ya matini yanayofuata na uainishe matini hayo katika mojawapo ya lebo zifuatazo. Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.

### Input:
habari tunaomba radhi kwa usumbufu unaojitokeza tafadhali tuandikie mita namba yako kwenye dm kwa msaada zaidi pj

### Response:
Wastani


## Explore token frequency

In [7]:
from transformers import AutoTokenizer

# Load the pretrained InkubaLM tokenizer, then show its special tokens and the
# id assigned to the end-of-sequence marker "</s>".
model_name = "lelapa/InkubaLM-0.4B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
special_tokens = tokenizer.special_tokens_map
eos_id = tokenizer.convert_tokens_to_ids("</s>")
print(special_tokens)
print(eos_id)

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
2


In [8]:
# Count how often each token string occurs across all formatted prompts.
# `Counter` keeps first-seen insertion order, so the resulting frame has the same
# row index as a hand-rolled dict would give.
token_freqs = Counter()
for text in data:
    token_freqs.update(tokenizer.tokenize(text.strip()))

# One row per distinct token, most frequent first.
token_counts = pd.DataFrame(
    {"token": list(token_freqs.keys()), "count": list(token_freqs.values())}
).sort_values(by="count", ascending=False)
token_counts

Unnamed: 0,token,count
19,<0x0A>,3236335
8,.,1787554
24,:,1529828
21,#,1060221
20,##,1059347
...,...,...
37349,▁McK,1
37352,▁Even,1
37354,▁Code,1
37355,Coll,1


In [9]:
# Number of distinct tokens that occur at least 2 times.
int((token_counts["count"] >= 2).sum())

35170

In [10]:
# Number of distinct tokens that occur at least 5 times.
int((token_counts["count"] >= 5).sum())

31182

In [11]:
# Number of distinct tokens that occur at least 10 times.
int((token_counts["count"] >= 10).sum())

27536

In [12]:
# Number of distinct tokens that occur at least 50 times.
int((token_counts["count"] >= 50).sum())

16396

In [13]:
# Number of distinct tokens that occur at least 100 times.
int((token_counts["count"] >= 100).sum())

11527

In [14]:
# Number of distinct tokens that occur at least 150 times.
int((token_counts["count"] >= 150).sum())

8992

In [15]:
# Number of distinct tokens that occur at least 200 times.
int((token_counts["count"] >= 200).sum())

7383

## Update the tokenizer by only keeping frequently occurring tokens

We will use InkubaLM's tokenizer artifacts and just update the vocabulary. Let's see what the tokenizer's files look like.

In [16]:
# Locate the directory that holds the tokenizer's vocab file and list its contents.
vocab_file = Path(tokenizer.vocab_file)
# get parent directory
vocab_dir = vocab_file.parent
# Displayed as the cell output: names of the files stored next to the vocab file.
os.listdir(vocab_dir)


['pytorch_model.bin',
 'tokenizer.json',
 'special_tokens_map.json',
 'config.json',
 'tokenizer_config.json',
 'tokenizer.model',
 'vulavulaslm.py',
 'generation_config.json']

In [17]:
# Inspect the tokenizer definition file. It contains non-ASCII tokens (e.g. "▁"),
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open(vocab_dir/"tokenizer.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
# NOTE(review): this displays the whole parsed file, which is very large.
vocab

{'version': '1.0',
 'truncation': {'direction': 'Right',
  'max_length': 1024,
  'strategy': 'LongestFirst',
  'stride': 0},
 'padding': None,
 'added_tokens': [{'id': 0,
   'content': '<unk>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': False,
   'special': True},
  {'id': 1,
   'content': '<s>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': False,
   'special': True},
  {'id': 2,
   'content': '</s>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': False,
   'special': True}],
 'normalizer': {'type': 'Sequence',
  'normalizers': [{'type': 'Prepend', 'prepend': '▁'},
   {'type': 'Replace', 'pattern': {'String': ' '}, 'content': '▁'}]},
 'pre_tokenizer': None,
 'post_processor': {'type': 'TemplateProcessing',
  'single': [{'SpecialToken': {'id': '<s>', 'type_id': 0}},
   {'Sequence': {'id': 'A', 'type_id': 0}}],
  'pair': [{'SpecialToken': {'id': '<s>', 'type_id': 0}},
   {'Seque

In [18]:
# Minimum number of occurrences a token needs in order to be kept.
# NOTE(review): presumably tuned so the resulting vocabulary size matches the
# tokenizer used in model training - confirm.
min_token_freq = 183

# Fresh copy of the original tokenizer; it stays untouched as the reference.
model_name = "lelapa/InkubaLM-0.4B"
tokenizer_old = AutoTokenizer.from_pretrained(model_name)

Get the token frequencies again, but this time keep the indices.

In [19]:
# Get a count of how frequently each token appears in the dataset
token_counts = {}
for text in data:
    tokens = tokenizer_old.encode(text.strip())
    for t in tokens:
        token_counts[t] = token_counts.get(t, 0) + 1
token_ids = sorted(list(token_counts.keys()))

# Covert to a Pandas `DataFrame`:
token_counts = pd.DataFrame(
    {"token": token_counts.keys(), "count": token_counts.values()}
).sort_values(by="count", ascending=False)

Limit the vocabulary to tokens with at least `min_token_freq` occurrences:

In [21]:
# Keep only the ids whose frequency reaches the cut-off, in ascending id order.
frequent_ids = token_counts.loc[token_counts["count"] >= min_token_freq, "token"]
token_ids = sorted(frequent_ids.tolist())
print("Number of tokens used:", len(token_ids))

Number of tokens used: 7886


Make sure all byte-level tokens are included in the new vocab:

In [22]:
# Force ids 0..258 into the vocabulary regardless of frequency.
# NOTE(review): assumes these ids are the 3 special tokens followed by the 256
# byte-level tokens - confirm against the sentencepiece model.
token_ids = sorted(set(token_ids).union(range(259)))
print("Updated number of tokens used:", len(token_ids))

Updated number of tokens used: 8076


Make sure single letter tokens are included:

In [23]:
import string
# Every ASCII letter, both bare and with the "▁" word-start prefix.
letters = list(string.ascii_letters)
letters = letters + ["▁" + letter for letter in letters]

# `convert_tokens_to_ids` maps unknown tokens to the unknown-token id; compare
# against the tokenizer's own id rather than a hard-coded 0.
unk_id = tokenizer_old.unk_token_id
for letter in letters:
    letter_id = tokenizer_old.convert_tokens_to_ids(letter)
    if letter_id == unk_id:
        print(f"Token {letter} not found in old tokenizer.")
    else:
        token_ids.append(letter_id)

# De-duplicate and restore ascending order after the appends.
token_ids = sorted(set(token_ids))
print("Updated number of tokens used:", len(token_ids))

Updated number of tokens used: 8106


Save the old tokenizer's configuration files. We will then load the json files and update them as required for our new tokenizer.

In [None]:
# Write the old tokenizer's files to /tmp; later cells read /tmp/tokenizer.model,
# /tmp/tokenizer.json, /tmp/tokenizer_config.json and /tmp/special_tokens_map.json.
# NOTE(review): /tmp is hard-coded and shared; other cells depend on this exact path.
tokenizer_old.save_pretrained("/tmp")

In [25]:
# Final set of token strings to keep: either the stored vocabulary (reproducible
# mode) or the tokens behind the ids selected above.
new_vocab = (
    set(keep_tokens)
    if reproducibility
    else {tokenizer_old.convert_ids_to_tokens(i) for i in token_ids}
)
print("New vocabulary size:", len(new_vocab))

New vocabulary size: 8064


### Update the model using the `sentencepiece` library

Begin by loading the original model:

In [28]:
# Parse the saved sentencepiece model into an editable protobuf message.
m = sp_model.ModelProto()
# `Path.read_bytes` closes the file once it has been read, unlike a bare
# `open(...).read()`. The cell output is the value returned by `ParseFromString`.
m.ParseFromString(Path("/tmp/tokenizer.model").read_bytes())

991189

In [29]:
# Tally the pieces by their `type` field and print every piece whose type is
# neither 1 nor 6 (6 marks the byte-level pieces).
piece_types = {}
for idx, sp_piece in enumerate(m.pieces):
    kind = sp_piece.type
    piece_types[kind] = piece_types.get(kind, 0) + 1
    if kind in (1, 6):
        continue
    print(idx, sp_piece.piece, kind)
piece_types

0 <unk> 2
1 <s> 3
2 </s> 3


{2: 1, 3: 2, 6: 256, 1: 61529}

In [30]:
# Sanity check: the sentencepiece model holds exactly as many pieces as the old
# tokenizer reports in `vocab_size`.
assert len(m.pieces) == tokenizer_old.vocab_size

Iterate over `m.pieces` and keep only the tokens that are in the new vocab. Pieces whose type is not 1 (such as the special and byte-level pieces) are always kept.

In [31]:
import copy
# Snapshot the current pieces, empty the repeated field, then re-add the
# survivors in their original order.
pieces_old = copy.deepcopy(m.pieces)
m.ClearField("pieces")
# A piece survives if its type is not 1, or if its text is in the new vocab.
kept_pieces = [p for p in pieces_old if p.type != 1 or p.piece in new_vocab]
# `extend` copies each message into the repeated field.
m.pieces.extend(kept_pieces)
print(len(m.pieces))

8064


In [32]:
# Iterate over `m.pieces` and keep only keep the tokens that are in the new vocab.
# Also keep all byte pieces (`piece.type == 6`)
# seen = set()
# while True:
#     if m.pieces[0].piece in seen:
#         break
#     x = m.pieces.pop(0)
#     seen.add(x.piece)
#     if x.piece in new_vocab or x.type == 6:
#         m.pieces.append(x)

Save the updated model:

In [33]:
import shutil

# Start from an empty output directory so no stale files from an earlier run remain.
new_tokenizer_dir = basedir/"tokenizer_new"
if new_tokenizer_dir.exists():
    shutil.rmtree(new_tokenizer_dir)
new_tokenizer_dir.mkdir(parents=True, exist_ok=True)

# Serialize the sentencepiece model
model_bytes = m.SerializeToString()
with (new_tokenizer_dir/'tokenizer.model').open('wb') as f:
    f.write(model_bytes)

These tokenizer files remain the same:

In [34]:
# These two files are reused unchanged from the old tokenizer saved in /tmp.
for src in ("/tmp/tokenizer_config.json", "/tmp/special_tokens_map.json"):
    shutil.copy(src, new_tokenizer_dir)

Test loading the updated model:

In [35]:
sp = spm.SentencePieceProcessor()
# Call `load` outside the `assert` so the model is still loaded when assertions
# are disabled (`python -O`).
sp_loaded = sp.load(str(new_tokenizer_dir/'tokenizer.model'))
assert sp_loaded, "sentencepiece could not load the trimmed tokenizer.model"

# Test loading the new tokenizer with `transformers`:
try:
    tokenizer = LlamaTokenizerFast.from_pretrained(new_tokenizer_dir)
except Exception:
    print("Cannot load tokenizer with `transformers`")
    # Re-raise: if the error were swallowed, `tokenizer` would still be bound to
    # the ORIGINAL tokenizer loaded earlier, and the following cells would
    # silently write the old vocabulary into the new tokenizer.json.
    raise

In [36]:
# Sanity check: encode a sample word with the trimmed sentencepiece model and
# return the pieces as strings.
sp.encode("Wastani", out_type=str)

['Wa', 'stani']

In [37]:
# Vocabulary size reported by `tokenizer`; it should match the number of pieces
# kept in the trimmed sentencepiece model.
tokenizer.vocab_size

8064

Update the `tokenizer.json` vocab and drop the merges that refer to removed tokens:

In [38]:
# Start from the old tokenizer.json and replace its vocab with the trimmed one.
# The file holds non-ASCII tokens (e.g. "▁"), so decode it as UTF-8 explicitly.
with open("/tmp/tokenizer.json", "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)
tokenizer_json["model"]["vocab"] = {tokenizer.convert_ids_to_tokens(i): i for i in range(tokenizer.vocab_size)}

# Keep a merge only if both of its parts and the merged result are still in the vocab.
new_tokens = set(tokenizer_json["model"]["vocab"].keys())
new_merges = []
# The loop variable is deliberately not called `m`, so the sentencepiece
# `ModelProto` bound to `m` is not overwritten.
for merge in tokenizer_json["model"]["merges"]:
    # Merges are stored as [left, right] pairs; older `tokenizers` files store
    # them as a single "left right" string, so accept both forms.
    left, right = merge.split(" ") if isinstance(merge, str) else merge
    if left in new_tokens and right in new_tokens and left + right in new_tokens:
        new_merges.append(merge)
tokenizer_json["model"]["merges"] = new_merges

with open(new_tokenizer_dir/"tokenizer.json", "w", encoding="utf-8") as f:
    json.dump(tokenizer_json, f)

Test the updated tokenizer:

In [39]:
# Load the tokenizer from the new directory and tokenize a sample word as a
# quick smoke test.
tokenizer_new = AutoTokenizer.from_pretrained(new_tokenizer_dir)
tokenizer_new.tokenize("Wastani")

['▁Wa', 'stani']

In [40]:
# Baseline: tokenize the first formatted prompt with the original tokenizer.
tokenizer_old.tokenize(data[0])

['▁Below',
 '▁is',
 '▁an',
 '▁instruction',
 '▁that',
 '▁describes',
 '▁a',
 '▁task',
 '.',
 '▁Write',
 '▁a',
 '▁response',
 '▁that',
 '▁appro',
 'p',
 'riat',
 'ely',
 '▁comple',
 'tes',
 '▁the',
 '▁request',
 '.',
 '<0x0A>',
 '<0x0A>',
 '##',
 '#',
 '▁Instru',
 'ction',
 ':',
 '<0x0A>',
 'Ch',
 'ang',
 'anua',
 '▁mawazo',
 '▁ya',
 '▁ma',
 'tini',
 '▁yana',
 'y',
 'of',
 'u',
 'ata',
 '▁na',
 '▁ua',
 'in',
 'ishe',
 '▁ma',
 'tini',
 '▁hayo',
 '▁katika',
 '▁mo',
 'j',
 'aw',
 'ap',
 'o',
 '▁ya',
 '▁lebo',
 '▁z',
 'ifu',
 'ata',
 'zo',
 '.',
 '▁Ch',
 'anya',
 ':',
 '▁iwapo',
 '▁ma',
 'tini',
 '▁yan',
 'ado',
 'keza',
 '▁mawazo',
 ',',
 '▁mtazamo',
 '▁na',
 '▁hali',
 '▁chanya',
 '▁ya',
 '▁k',
 'ih',
 'isia',
 '.',
 '▁H',
 'asi',
 ':',
 '▁iwapo',
 '▁ma',
 'tini',
 '▁yan',
 'ado',
 'keza',
 '▁mawazo',
 '▁au',
 '▁hisia',
 '▁hasi',
 '.',
 '▁Wa',
 'stani',
 ':',
 '▁iwapo',
 '▁ma',
 'tini',
 '▁hay',
 'ado',
 'kezi',
 '▁lugha',
 '▁chanya',
 '▁au',
 '▁hasi',
 '▁kwa',
 '▁njia',
 '▁ya',
 '▁moja',


In [41]:
# Tokenize the first formatted prompt with the trimmed tokenizer, for comparison
# with the original tokenizer's output.
tokenizer_new.tokenize(data[0])

['▁B',
 'el',
 'ow',
 '▁is',
 '▁an',
 '▁in',
 'st',
 'ru',
 'ction',
 '▁that',
 '▁des',
 'c',
 'ri',
 'bes',
 '▁a',
 '▁task',
 '.',
 '▁W',
 'r',
 'ite',
 '▁a',
 '▁res',
 'p',
 'on',
 'se',
 '▁that',
 '▁appro',
 'p',
 'riat',
 'ely',
 '▁comple',
 'tes',
 '▁the',
 '▁re',
 'qu',
 'est',
 '.',
 '<0x0A>',
 '<0x0A>',
 '##',
 '#',
 '▁In',
 'st',
 'ru',
 'ction',
 ':',
 '<0x0A>',
 'Ch',
 'ang',
 'anua',
 '▁mawazo',
 '▁ya',
 '▁ma',
 'tini',
 '▁yana',
 'y',
 'of',
 'u',
 'ata',
 '▁na',
 '▁ua',
 'in',
 'ishe',
 '▁ma',
 'tini',
 '▁hayo',
 '▁katika',
 '▁mo',
 'j',
 'aw',
 'ap',
 'o',
 '▁ya',
 '▁lebo',
 '▁z',
 'ifu',
 'ata',
 'zo',
 '.',
 '▁Ch',
 'anya',
 ':',
 '▁i',
 'w',
 'ap',
 'o',
 '▁ma',
 'tini',
 '▁yan',
 'ado',
 'keza',
 '▁mawazo',
 ',',
 '▁mta',
 'zam',
 'o',
 '▁na',
 '▁hali',
 '▁chanya',
 '▁ya',
 '▁k',
 'ih',
 'isia',
 '.',
 '▁H',
 'asi',
 ':',
 '▁i',
 'w',
 'ap',
 'o',
 '▁ma',
 'tini',
 '▁yan',
 'ado',
 'keza',
 '▁mawazo',
 '▁au',
 '▁hisia',
 '▁hasi',
 '.',
 '▁Wa',
 'stani',
 ':',
 '▁i',
