In [16]:
import transformers
import os, re, json
import torch, numpy
from collections import defaultdict
from tqdm import tqdm
import jsonlines
from dsets import KnownsDataset
from util.globals import DATA_DIR

In [17]:
!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rseetharaman_umass_edu/.cache/huggingface/token
Login successful


In [18]:
from transformers import AutoTokenizer

phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
llama_tokenizer = AutoTokenizer.from_pretrained("/work/pi_dhruveshpate_umass_edu/rseetharaman_umass_edu/llama-2-7b-hf")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

In [20]:
knowns = KnownsDataset(data_dir=DATA_DIR)  # Dataset of known facts

Loaded dataset with 1209 elements


In [21]:
def is_present_in_vocab(vocab, space_token, attr):
    for v,idx in vocab.items():

        # b'\\u0120' - phi-2's and falcon's space token
        # b'\\u2581' - mistral's and gemma's space token

        # Condition to check if the attribute is an atomic token, first unicode character denotes beginning of word.
        if v[0].encode("unicode_escape")== space_token and v[1:]==attr:
            return (True, idx)
    return (False, -1)

In [22]:
def get_entries_with_atomic_tokens(tokenizer, space_token, knowns):
    data = []
    for i,k in enumerate(tqdm(knowns)):
        is_present, idx = is_present_in_vocab(tokenizer.vocab, space_token, k['attribute'])
        if is_present:
            k.update({"token_id": idx})
            data.append(k)
    return data

In [23]:
def save_records(records, model_name):
    with jsonlines.open(f"data_{model_name}.jsonl", "w") as writer:
        for d in records:
            writer.write(d)

In [24]:
phi_entries = get_entries_with_atomic_tokens(phi_tokenizer, b'\\u0120', knowns)

100%|██████████| 1209/1209 [02:02<00:00,  9.83it/s]


In [32]:
len(phi_entries)

1209

In [25]:
save_records(phi_entries, 'phi')

In [26]:
s="\u2581Rome"
byte=b'\\u2581'
s[0].encode("unicode_escape") == byte

True

In [27]:
mistral_entries = get_entries_with_atomic_tokens(mistral_tokenizer, b'\\u2581', knowns)

100%|██████████| 1209/1209 [01:22<00:00, 14.59it/s]


In [29]:
len(mistral_entries)

922

In [30]:
save_records(mistral_entries, 'mistral')

In [31]:
llama_entries = get_entries_with_atomic_tokens(llama_tokenizer, b'\\u2581', knowns)

100%|██████████| 1209/1209 [01:20<00:00, 14.94it/s]


In [33]:
len(llama_entries)

902

In [34]:
save_records(llama_entries, 'llama')

In [35]:
len(llama_tokenizer.vocab)

32000

In [36]:
len(mistral_tokenizer.vocab)

32000

In [37]:
len(phi_tokenizer.vocab)

50295

In [38]:
len(gemma_tokenizer.vocab)

256000

In [39]:
gemma_entries = get_entries_with_atomic_tokens(gemma_tokenizer, b'\\u2581', knowns)

100%|██████████| 1209/1209 [10:29<00:00,  1.92it/s]


In [40]:
len(gemma_entries)

1208

In [41]:
save_records(gemma_entries, 'gemma')

In [42]:
falcon_entries = get_entries_with_atomic_tokens(falcon_tokenizer, b'\\u0120', knowns)

100%|██████████| 1209/1209 [02:28<00:00,  8.13it/s]


In [43]:
save_records(falcon_entries, 'falcon')

In [44]:
len(falcon_entries)

1201