In [1]:
from datasets import load_dataset

# Load the "abisee/cnn_dailymail" dataset; the second argument ("3.0.0") selects the dataset configuration.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Peek at the "highlights" field (the reference summary used later as the label) of the first training row.
ds["train"][0]['highlights']

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Checkpoint identifier shared by the tokenizer and the model.
model_path = "Qwen/Qwen2.5-3B-Instruct"

# Tokenizer first; padding is switched to the left side straight away.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'

# float16 weights, with device placement delegated to device_map="auto".
model_load_kwargs = dict(device_map="auto", torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]


In [3]:
# Marker tokens to register with the tokenizer, in the mapping shape that
# tokenizer.add_special_tokens() is called with.
extra_marker_tokens = ["<|mrc|>", "<|summary|>"]
new_special_tokens = {"additional_special_tokens": extra_marker_tokens}

In [4]:
# Register the marker tokens with the tokenizer; the displayed value is the call's return (2 in the recorded run).
tokenizer.add_special_tokens(new_special_tokens)

2

In [5]:
# Resize the model's token embeddings to the tokenizer's current length so the newly added tokens have rows.
model.resize_token_embeddings(len(tokenizer))

Embedding(151667, 2048)

In [6]:
import nltk
nltk.download('punkt_tab')  # Download the necessary tokenizer data
from nltk.tokenize import sent_tokenize

def split_into_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The string to segment.

    Returns:
        The result of ``sent_tokenize(text)`` — the sentences, in order.
    """
    return sent_tokenize(text)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rbqlsquf2/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# Display the dataset's splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [8]:
from tqdm import tqdm

def create_example(all_data):
    """Turn dataset rows into chat-formatted summarization examples.

    Each article is split into sentences, every sentence is prefixed with a
    1-based ``[n]`` index, and the numbered document becomes the user turn.
    The system turn carries the task flags and the newline-flattened
    highlights become the assistant turn.

    Args:
        all_data: Iterable of mappings that provide string values under
            ``"article"`` and ``"highlights"``.

    Returns:
        A list with one dict per input row, holding:
            ``"text"``     - chat template applied to the system + user turns,
            ``"all_text"`` - chat template applied to system + user + assistant,
            ``"label"``    - the assistant response string.
    """
    instruction = "<|mrc|>False\n<|summary|>True"
    all_result = []
    for data in tqdm(all_data):
        summary = data["highlights"].replace("\n", " ")
        context = split_into_sentences(data["article"])

        # Every numbered sentence ends with "\n", the last one included.
        # The previous code called rstrip on the per-sentence loop variable
        # instead of the accumulated document, which never changed the prompt
        # and raised UnboundLocalError when the first article had no
        # sentences. The trailing newline is kept so the emitted prompts stay
        # exactly as before.
        sentences = "".join(
            "[{}] {}\n".format(number, sent)
            for number, sent in enumerate(context, start=1)
        )

        prompt = "**Document**\n{}".format(sentences)
        response = "**Answer**\n**Summary**\n{}\n".format(summary)

        # Prompt-only conversation (no assistant turn) ...
        prompt_messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt},
        ]
        # ... and the same conversation with the reference answer appended.
        full_messages = prompt_messages + [
            {"role": "assistant", "content": response},
        ]

        # Key order matches the order in which the JSON dumps list the fields.
        all_result.append(
            {
                "text": tokenizer.apply_chat_template(prompt_messages, tokenize=False),
                "all_text": tokenizer.apply_chat_template(full_messages, tokenize=False),
                "label": response,
            }
        )
    return all_result
    

In [9]:
import json

# Build chat-formatted examples from the validation split.
input_data = create_example(ds['validation'])

100%|██████████| 13368/13368 [00:08<00:00, 1642.50it/s]


In [15]:
# Print the label (assistant response) of the first prepared example.
print(input_data[0]['label'])

**Answer**
**Summary**
Zully Broussard decided to give a kidney to a stranger . A new computer program helped her donation spur transplants for six kidney patients .



In [10]:
# Remove the space that precedes each full stop (" .") in the labels.
# NOTE(review): only "label" is rewritten here; "all_text" still contains the
# original spacing — confirm that is intended.
for data in input_data:
    data.update(label=data['label'].replace(" .", "."))

In [17]:
from tqdm import tqdm
all_len = []
all_result = []

# Keep only the examples whose prompt ("text") tokenizes to at most 2048 ids.
for input_data_ in tqdm(input_data):
    text = input_data_["text"]
    token_ids = tokenizer(text)["input_ids"]
    if len(token_ids) > 2048:
        continue
    all_result.append(input_data_)

# NOTE(review): the recorded run of this cell iterated 13368 items (the size
# of the validation split), yet the target is named *_test_data and is the
# same path another cell writes the filtered test split to — confirm the
# intended file name.
output_path = "../data/qwen_cnn_test_data.json"
with open(output_path, mode="w", encoding="utf-8") as f:
    json.dump(all_result, f, indent=4, ensure_ascii=False)

100%|██████████| 13368/13368 [00:22<00:00, 582.69it/s]


In [11]:
# Build chat-formatted examples from the test split.
input_data = create_example(ds['test'])


all_len = []
all_result = []
over_num = 0
# Keep the examples whose full conversation ("all_text": prompt + reference
# answer) tokenizes to at most 2048 ids, and count the ones that do not.
for input_data_ in tqdm(input_data):
    text = input_data_["all_text"]
    count = len(tokenizer(text)["input_ids"])
    if count <= 2048:
        all_result.append(input_data_)
    else:
        # Over-length examples are skipped. The earlier version also sliced
        # input_data_['text'] to its first (count - 2048) *characters*, which
        # mixed token and character units, mangled the entry inside
        # input_data, and never reached all_result; that mutation is removed.
        over_num += 1
print(over_num)

100%|██████████| 11490/11490 [00:07<00:00, 1619.94it/s]
100%|██████████| 11490/11490 [00:23<00:00, 498.68it/s]

759





In [13]:
# Write the filtered examples as pretty-printed UTF-8 JSON (non-ASCII characters kept as-is).
output_path = "../data/qwen_cnn_test_data.json"
with open(output_path, mode="w", encoding="utf-8") as f:
    json.dump(all_result, f, indent=4, ensure_ascii=False)

In [38]:
# Prepend the literal "assistant\n" to every label.
# The startswith guard keeps the cell safe to re-run: without it each
# re-execution would stack another "assistant\n" in front of the label.
for result in all_result:
    if not result["label"].startswith("assistant\n"):
        result["label"] = "assistant\n" + result["label"]
    

In [39]:
# Write all_result as pretty-printed UTF-8 JSON.
# NOTE(review): this path is relative to "data/", whereas other cells write to "../data/" — confirm the working directory.
output_path = "data/qwen_dev_data.json"
with open(output_path, mode="w", encoding="utf-8") as f:
    json.dump(all_result, f, indent=4, ensure_ascii=False)

In [14]:
from tqdm import tqdm
all_len = []
all_result = []

# NOTE(review): dev_data is not defined in any cell visible in this notebook;
# this cell only runs if it is still alive in the kernel — confirm where it
# comes from.
# Keep the dev_data item whenever its paired prompt is at most 2048 token ids long.
for data, input_data_ in tqdm(zip(dev_data, input_data)):
    text = input_data_["text"]
    prompt_length = len(tokenizer(text)["input_ids"])
    if prompt_length > 2048:
        continue
    all_result.append(data)

7405it [00:17, 426.88it/s]


In [16]:
# Write input_data as pretty-printed UTF-8 JSON.
# NOTE(review): this dumps the unfiltered input_data, not the all_result list
# built by the length filter — confirm which one is meant to be saved.
output_path = "data/teddst_dev.json"
with open(output_path, mode="w", encoding="utf-8") as f:
    json.dump(input_data, f, indent=4, ensure_ascii=False)

In [7]:
# Print the label of the first prepared example.
# NOTE(review): the output recorded under this cell shows an "**Answer**: ... / **Supporting Sentences**: ..." label,
# which the create_example shown in this notebook does not generate — presumably left over from an earlier
# version of the notebook; re-run to confirm.
print(input_data[0]["label"])

**Answer**: yes
**Supporting Sentences**: [4] Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer.
[17] Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.



In [16]:
# Count the recorded token lengths that are strictly below 2048.
# NOTE(review): every all_len.append(...) in the visible cells is commented
# out, so on a fresh run all_len is empty and this prints 0; the recorded
# value presumably comes from an earlier session. The filters elsewhere use
# "<= 2048" while this uses "< 2048" — confirm which bound is intended.
count = sum(1 for length in all_len if length < 2048)
print(count)

6539
