In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration,BartTokenizer, BartForConditionalGeneration
from pathlib import Path
import torch

In [3]:
# Root directory holding the local model checkpoints used by the cells below.
model_path = Path("./models")

In [4]:
# Announce the load before it starts, so the message precedes the work it
# describes instead of appearing once both objects are already in memory.
print("loading model...")

# Tokenizer and weights are both read from the same local checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained(model_path / "flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained(model_path / "flan-t5-base")

# Tokenizer size as loaded, shown for comparison with the size after add_tokens.
print(len(tokenizer))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loading model...
32100


In [5]:
# Shape of the shared input-embedding matrix as loaded, before any resizing.
print("Embedding matrix shape before resizing:", model.shared.weight.shape)

# Tokens that should be registered with the tokenizer.
# In the recorded run the tokenizer length grew by 4 (32100 -> 32104) even though
# 9 strings are listed here, so not every entry received a fresh ID.
new_token = [
    "egamma", "pow", "INT+", "INT-",
    "add", "mul", "z", "t", "s",
]

# Print the tokenizer length on both sides of the registration.
print(len(tokenizer))
tokenizer.add_tokens(new_token)
print(len(tokenizer))

Embedding matrix shape before resizing: torch.Size([32128, 768])
32100
32104


In [6]:
# Make the embedding matrix row count equal to the tokenizer length.
# In the recorded run this goes from 32128 rows to 32104 rows.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
print(model.shared.weight.shape)

torch.Size([32104, 768])


In [7]:
# Look up the ID of every string in `new_token`; these rows get zeroed later.
# NOTE(review): the recorded output lists five IDs below 32100 (13039, 4115,
# 172, 17, 7), i.e. rows that existed before add_tokens - confirm that
# overwriting those pretrained rows is intended.
token_ids_to_zero = tokenizer.convert_tokens_to_ids(new_token)
print("Token IDs to zero: {}".format(token_ids_to_zero))

Token IDs to zero: [32100, 32101, 32102, 32103, 13039, 4115, 172, 17, 7]


In [8]:
# Work directly on the shared embedding weight so the edits land in the model.
embeddings = model.shared.weight
print("token embedding before modification:\n", embeddings[token_ids_to_zero])

# Zero each selected row in place. The writes run under no_grad so autograd
# does not track them.
# NOTE(review): every ID in token_ids_to_zero is zeroed, including any that
# refer to rows present before the new tokens were added - confirm intended.
with torch.no_grad():
    for token_id in token_ids_to_zero:
        embeddings[token_id].zero_()

print("New token embedding after modification:\n", embeddings[token_ids_to_zero])

token embedding before modification:
 tensor([[ 1.8652e-01,  8.3984e-01,  3.5156e-01,  ..., -6.1719e-01,
          1.8516e+00, -1.4941e-01],
        [ 1.3550e-02, -4.7656e-01, -7.1484e-01,  ...,  1.4941e-01,
         -4.2969e-02, -4.0039e-01],
        [-1.0547e+00,  1.5391e+00, -1.5000e+00,  ..., -2.3535e-01,
          5.7422e-01, -4.2188e-01],
        ...,
        [ 1.3830e+00, -3.7923e+00, -1.9865e+01,  ..., -1.2344e+01,
          1.2631e+01,  1.1584e-01],
        [ 7.4835e-02, -2.2754e+00, -4.2526e+00,  ...,  4.3933e+00,
          3.1035e+00,  6.2516e-01],
        [-4.8828e+00, -2.9265e+00, -6.1769e+00,  ..., -3.1741e+00,
         -1.6623e+00,  5.9026e-01]], grad_fn=<IndexBackward0>)
New token embedding after modification:
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<

In [9]:
# Write the modified model and the extended tokenizer into the same directory.
# The tokenizer call stays last so its return value (the list of written
# files) is what the cell displays.
model.save_pretrained(model_path.joinpath("flan-t5-base-new"))
tokenizer.save_pretrained(model_path.joinpath("flan-t5-base-new"))

[2024-09-27 13:45:49,945] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


('models/flan-t5-base-new/tokenizer_config.json',
 'models/flan-t5-base-new/special_tokens_map.json',
 'models/flan-t5-base-new/spiece.model',
 'models/flan-t5-base-new/added_tokens.json')

In [10]:
# Sanity check: encode a pre-split word list mixing registered tokens with an
# ordinary word ("apple") and display the resulting IDs and attention mask.
tokenizer(
    ["egamma", "mul", "add", "z", "t", "apple"],
    return_tensors="pt",
    is_split_into_words=True,
    padding=True,
    truncation=True,
)

{'input_ids': tensor([[32100,  4115, 13039,   172,    17,  8947,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# Length statistics for ns=5, nt=1: mean length of the evaluated lines and the
# number of lines whose length is at least 1024.
with open('./egamma_data/ns=5_nt=1_sets/origin_prefix_dup.txt', 'r') as fl1:
    count = 0    # lines with length >= 1024
    tot = 0      # summed length over all lines
    n_lines = 0  # lines actually read; used as the denominator of the mean
    for line in fl1:
        # NOTE(review): eval() executes whatever the file contains. If every
        # line is a plain Python literal, ast.literal_eval is the safer choice.
        seq_len = len(eval(line))  # evaluate once and reuse for both checks
        if seq_len >= 1024:
            count += 1
        tot += seq_len
        n_lines += 1

# Divide by the number of lines read instead of assuming exactly 100;
# an empty file yields 0.0, not a ZeroDivisionError.
print(tot / n_lines if n_lines else 0.0)
print(count)

969.25
43


In [6]:
# Length statistics for ns=3, nt=1: mean length of the evaluated lines and the
# number of lines whose length is at least 1024.
with open('./egamma_data/ns=3_nt=1_sets/origin_prefix_dup.txt', 'r') as fl1:
    count = 0    # lines with length >= 1024
    tot = 0      # summed length over all lines
    n_lines = 0  # lines actually read; used as the denominator of the mean
    for line in fl1:
        # NOTE(review): eval() executes whatever the file contains. If every
        # line is a plain Python literal, ast.literal_eval is the safer choice.
        seq_len = len(eval(line))  # evaluate once and reuse for both checks
        if seq_len >= 1024:
            count += 1
        tot += seq_len
        n_lines += 1

# Divide by the number of lines read instead of assuming exactly 100;
# an empty file yields 0.0, not a ZeroDivisionError.
print(tot / n_lines if n_lines else 0.0)
print(count)

626.94
15


In [16]:
# Length statistics for ns=4, nt=1: mean length of the evaluated lines and the
# number of lines whose length is at least 1024.
with open('./egamma_data/ns=4_nt=1_sets/origin_prefix_dup.txt', 'r') as fl1:
    count = 0    # lines with length >= 1024
    tot = 0      # summed length over all lines
    n_lines = 0  # lines actually read; used as the denominator of the mean
    for line in fl1:
        # NOTE(review): eval() executes whatever the file contains. If every
        # line is a plain Python literal, ast.literal_eval is the safer choice.
        seq_len = len(eval(line))  # evaluate once and reuse for both checks
        if seq_len >= 1024:
            count += 1
        tot += seq_len
        n_lines += 1

# Divide by the number of lines read instead of assuming exactly 100;
# an empty file yields 0.0, not a ZeroDivisionError.
print(tot / n_lines if n_lines else 0.0)
print(count)

731.97
27


In [20]:
from tqdm import tqdm

# Expected number of line pairs; only used to size the progress bar.
total_lines = 100

# Walk the two input files in lockstep and copy a pair to the output files
# only when the evaluated line from the first file has length <= 1024.
# NOTE(review): the statistics cells count a line as long when its length is
# >= 1024, while this filter keeps length == 1024 - confirm which boundary is
# intended.
# NOTE(review): zip() stops at the shorter file, so a length mismatch between
# the two inputs goes unnoticed.
with open('./egamma_data/ns=4_nt=1_sets/origin_prefix_dup.txt', 'r') as f1, \
     open('egamma_data/ns=4_nt=1_sets/simple_prefix_dup.txt', 'r') as f2, \
     open('file1_filtered.txt', 'w') as f1_out, \
     open('file2_filtered.txt', 'w') as f2_out:

    paired_lines = zip(f1, f2)
    for line1, line2 in tqdm(paired_lines, total=total_lines, desc="Processing", unit=" lines"):
        # NOTE(review): eval() executes whatever the file contains.
        if len(eval(line1)) > 1024:
            continue
        f1_out.write(line1)
        f2_out.write(line2)

print("处理完成，生成新的 file1_filtered.txt 和 file2_filtered.txt")



Processing: 100%|██████████| 100/100 [00:00<00:00, 2390.31 lines/s]

处理完成，生成新的 file1_filtered.txt 和 file2_filtered.txt





In [10]:
# Length statistics for ns=4, nt=2 over at most the first 100 lines: mean
# length of the evaluated lines and the number with length at least 1024.
with open('./egamma_data/ns=4_nt=2/origin_prefix.txt', 'r') as fl1:
    n = 0      # lines read so far (capped at 100)
    count = 0  # lines with length >= 1024
    tot = 0    # summed length over the lines read
    for line in fl1:
        # NOTE(review): eval() executes whatever the file contains. If every
        # line is a plain Python literal, ast.literal_eval is the safer choice.
        seq_len = len(eval(line))  # evaluate once and reuse for both checks
        if seq_len >= 1024:
            count += 1
        tot += seq_len
        n += 1
        if n == 100:
            break

# Average over the lines actually read: the file may hold fewer than 100
# lines, in which case dividing by a fixed 100 would understate the mean.
print(tot / n if n else 0.0)
print(count)

759.67
9


In [9]:
# Length statistics for ns=5, nt=0: mean length of the evaluated lines and the
# number of lines whose length is at least 1024.
with open('./egamma_data/ns=5_nt=0_sets/origin_prefix_dup.txt', 'r') as fl1:
    count = 0    # lines with length >= 1024
    tot = 0      # summed length over all lines
    n_lines = 0  # lines actually read; used as the denominator of the mean
    for line in fl1:
        # NOTE(review): eval() executes whatever the file contains. If every
        # line is a plain Python literal, ast.literal_eval is the safer choice.
        seq_len = len(eval(line))  # evaluate once and reuse for both checks
        if seq_len >= 1024:
            count += 1
        tot += seq_len
        n_lines += 1

# Divide by the number of lines read instead of assuming exactly 100;
# an empty file yields 0.0, not a ZeroDivisionError.
print(tot / n_lines if n_lines else 0.0)
print(count)

534.73
14
