## CNN/DAILY MAIL DATASET

In [6]:
import config
from datasets import load_dataset
import os

# Create the local data directory. exist_ok=True keeps this cell re-runnable:
# without it a second execution raises FileExistsError.
os.makedirs("data", exist_ok=True)

In [9]:
def preprocess_function_cnn_dailymail(
        examples,
        tokenizer,
        max_input_length: int = config.MAX_INPUT_LENGTH,
        max_target_length: int = config.MAX_TARGET_LENGTH,
    ):
    """Tokenize a batch of CNN/DailyMail examples for summarization.

    Args:
        examples: Batch mapping with an "article" sequence (source texts)
            and a "highlights" sequence (reference summaries).
        tokenizer: Callable tokenizer; called once with the prefixed
            articles and once with ``text_target=`` for the summaries.
        max_input_length: ``max_length`` passed when encoding the articles.
        max_target_length: ``max_length`` passed when encoding the summaries.

    Returns:
        The tokenizer output for the articles, with an extra "labels" entry
        holding the "input_ids" of the tokenized summaries.
    """
    # Every article gets the "summarize: " task prefix before encoding.
    prompts = ["summarize: " + article for article in examples["article"]]
    encoded = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )

    # Targets are encoded separately; only their token ids are kept.
    encoded_targets = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# data_dir = "data"

def _load_cnn_dailymail_split(split_name):
    """Load the leading config.PERCENT_DATA percent of one cnn_dailymail 3.0.0 split."""
    return load_dataset(
        "cnn_dailymail",
        '3.0.0',
        split=f"{split_name}[:{config.PERCENT_DATA}%]",
    )


# Same load order as before: train, then test, then validation.
cnn_data_train = _load_cnn_dailymail_split("train")
cnn_data_test = _load_cnn_dailymail_split("test")
cnn_data_val = _load_cnn_dailymail_split("validation")

Downloading data: 100%|██████████| 257M/257M [00:07<00:00, 32.7MB/s] 
Downloading data: 100%|██████████| 257M/257M [00:06<00:00, 40.6MB/s] 
Downloading data: 100%|██████████| 259M/259M [00:07<00:00, 35.2MB/s] 
Downloading data: 100%|██████████| 34.7M/34.7M [00:01<00:00, 22.7MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:00<00:00, 37.9MB/s]
Generating train split: 100%|██████████| 287113/287113 [00:03<00:00, 81410.03 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 91425.24 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 81273.17 examples/s]


In [11]:
import torch
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration

# Checkpoint whose tokenizer is used by the preprocessing functions below.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    legacy=False,
)
def preprocess_function_summary(examples,max_input_length:int=config.MAX_INPUT_LENGTH,max_target_length:int=config.MAX_TARGET_LENGTH):
    """Tokenize a batch of CNN/DailyMail examples with the module-level tokenizer.

    Thin wrapper around ``preprocess_function_cnn_dailymail`` so the
    tokenization logic lives in one place; this variant only binds the
    notebook's global ``tokenizer`` so it can be passed directly to
    ``Dataset.map``.

    Args:
        examples: Batch mapping with "article" and "highlights" sequences.
        max_input_length: ``max_length`` used when encoding the articles.
        max_target_length: ``max_length`` used when encoding the summaries.

    Returns:
        The tokenized articles with a "labels" entry holding the token ids
        of the tokenized summaries.
    """
    return preprocess_function_cnn_dailymail(
        examples,
        tokenizer,
        max_input_length=max_input_length,
        max_target_length=max_target_length,
    )

def _tokenize_cnn_split(split):
    """Run preprocess_function_summary over one split in batches, dropping the raw columns."""
    return split.map(
        preprocess_function_summary,
        batched=True,
        remove_columns=['article','highlights','id'],
        batch_size=config.TOKENIZE_BATCH_SIZE,
    )


# Same processing order as before: train, validation, test.
tokenized_datasets_train = _tokenize_cnn_split(cnn_data_train)
tokenized_datasets_val = _tokenize_cnn_split(cnn_data_val)
tokenized_datasets_test = _tokenize_cnn_split(cnn_data_test)

Map: 100%|██████████| 287113/287113 [02:20<00:00, 2039.67 examples/s]
Map: 100%|██████████| 13368/13368 [00:06<00:00, 2051.93 examples/s]
Map: 100%|██████████| 11490/11490 [00:05<00:00, 2004.15 examples/s]


## ARXIV DATASET (TOO BIG; PLANNING TO DISCARD)

In [20]:
# from datasets import load_dataset
# Repository page kept as the reference for the arXiv data.
arxiv_link = (
    "https://github.com/armancohan/long-summarization/tree/master"
    "?tab=readme-ov-file"
)
# dataset = load_dataset("arxiv_dataset",trust_remote_code=True)

## PubMed DATASET

In [23]:
# import gdown
# pubmed_url = "https://archive.org/download/armancohan-long-summarization-paper-code/pubmed-dataset.zip"
# https://huggingface.co/datasets/scientific_papers?row=0
# output = 'pubmed.zip'
# gdown.download(pubmed_url, output, quiet=False)

In [3]:
import os
# Directory expected to hold the extracted PubMed dump.
pubmed_dir = "pubmed-dataset"

# Walk the directory entries; after the loop `txt_file` holds the path of the
# last ".txt" entry seen (each match overwrites the previous one).
for file in os.listdir(pubmed_dir):
    if not file.endswith(".txt"):
        continue
    txt_file = os.path.join(pubmed_dir, file)

test.txt
vocab
val.txt
train.txt


In [4]:
from datasets import load_dataset

# "scientific_papers" is loaded through a dataset builder script. The loader's
# own warning states that trust_remote_code=True becomes mandatory in the next
# major `datasets` release, so opt in explicitly.
# NOTE(review): this executes the dataset's loading script locally — only keep
# it for sources you trust.
pubmed_dataset = load_dataset(
    "scientific_papers",
    "pubmed",
    trust_remote_code=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 5.35k/5.35k [00:00<00:00, 21.5MB/s]
Downloading readme: 100%|██████████| 8.27k/8.27k [00:00<00:00, 16.4MB/s]
Downloading data: 100%|██████████| 3.62G/3.62G [01:48<00:00, 33.4MB/s] 
Downloading data: 100%|██████████| 880M/880M [00:22<00:00, 39.6MB/s] 
Generating train split: 100%|██████████| 119924/119924 [00:32<00:00, 3697.11 examples/s]
Generating validation split: 100%|██████████| 6633/6633 [00:02<00:00, 2674.18 examples/s]
Generating test split: 100%|██████████| 6658/6658 [00:01<00:00, 5007.96 examples/s]


In [5]:
# Bare last expression: the notebook renders the DatasetDict summary
# (splits, feature names and row counts).
pubmed_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6658
    })
})

## MEDIASUM DATASET (TOO LARGE; PLANNING TO DISCARD)

Link to the [paper describing the dataset](https://aclanthology.org/2021.naacl-main.474/)

## MULTI-NEWS DATASET

In [1]:
from datasets import load_dataset

# Fetch every available split of the "multi_news" dataset.
multi_news_dataset = load_dataset(path="multi_news")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Bare last expression: the notebook renders the DatasetDict summary
# (splits, feature names and row counts).
multi_news_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

## WMT DATASET

In [4]:
# train_en_link = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en"
# train_de_link = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de"

# test_en_link = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.en"
# test_de_link = "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.de"

# Fetch every available split of the pre-processed WMT14 en-de dataset.
wmt_dataset = load_dataset(path="stas/wmt14-en-de-pre-processed")

In [7]:
# Peek at the English side of the first test pair. Index the row first and the
# column second: selecting the 'translation' column before the row reads the
# whole column just to show one item.
wmt_dataset['test'][0]['translation']['en']

'Obama receives Netanyahu'

## TRIVIA-QA DATASET

## FINAL FUNCTIONS

In [None]:

# Dataset identifiers considered in this notebook.
dataset_type = [
    "cnn_dailymail",
    "pubmed",
    "multi_news",
    "wmt14",
    "triviaqa",
]

def preprocess_function(examples,dataset_name:str,max_input_length:int=config.MAX_INPUT_LENGTH,max_target_length:int=config.MAX_TARGET_LENGTH):
    """Tokenize one batch of examples for the dataset named by ``dataset_name``.

    Uses the module-level ``tokenizer``. Intended for batched use, so every
    column in ``examples`` is a sequence with one entry per example.

    Args:
        examples: Batch mapping of column name to a sequence of values.
            Columns read per dataset:
              * "cnn_dailymail": "article" (source), "highlights" (target)
              * "multi_news": "document" (source), "summary" (target)
              * "wmt14": "translation", a sequence of dicts with "en"
                (source) and "de" (target) entries
        dataset_name: Which dataset layout/prefix to apply.
        max_input_length: ``max_length`` used when encoding the sources.
        max_target_length: ``max_length`` used when encoding the targets.

    Returns:
        The tokenizer output for the prefixed sources, with an added
        "labels" entry holding the "input_ids" of the tokenized targets.

    Raises:
        NotImplementedError: For "pubmed" and "triviaqa", which have no
            preprocessing defined yet.
        ValueError: For any other unrecognized ``dataset_name``.
    """
    if dataset_name == "cnn_dailymail":
        prefix = "summarize: "
        sources = examples["article"]
        targets = examples["highlights"]
    elif dataset_name == "multi_news":
        prefix = "summarize: "
        sources = examples["document"]
        targets = examples["summary"]
    elif dataset_name == "wmt14":
        # Sources are the English side and labels the German side, so the
        # prefix must name that direction.
        prefix = "translate English to German: "
        # In a batch, "translation" is a list of {"en": ..., "de": ...} dicts,
        # so each language has to be pulled out per example.
        pairs = examples["translation"]
        sources = [pair["en"] for pair in pairs]
        targets = [pair["de"] for pair in pairs]
    elif dataset_name in ("pubmed", "triviaqa"):
        # Raise instead of silently returning None.
        raise NotImplementedError(
            f"preprocessing for dataset {dataset_name!r} is not implemented yet"
        )
    else:
        raise ValueError(
            f"unknown dataset_name {dataset_name!r}; expected one of "
            "'cnn_dailymail', 'pubmed', 'multi_news', 'wmt14', 'triviaqa'"
        )

    inputs = [prefix + text for text in sources]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Targets are tokenized separately; only their token ids become labels.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# preprocess_function requires `dataset_name`; Dataset.map only supplies the
# batch, so the name is forwarded through fn_kwargs. Without it every call
# fails with a missing-argument TypeError.
tokenized_datasets_train = cnn_data_train.map(
    preprocess_function,
    batched=True,
    remove_columns=['article','highlights','id'],
    batch_size=config.TOKENIZE_BATCH_SIZE,
    fn_kwargs={"dataset_name": "cnn_dailymail"},
)
tokenized_datasets_val = cnn_data_val.map(
    preprocess_function,
    batched=True,
    remove_columns=['article','highlights','id'],
    batch_size=config.TOKENIZE_BATCH_SIZE,
    fn_kwargs={"dataset_name": "cnn_dailymail"},
)
tokenized_datasets_test = cnn_data_test.map(
    preprocess_function,
    batched=True,
    remove_columns=['article','highlights','id'],
    batch_size=config.TOKENIZE_BATCH_SIZE,
    fn_kwargs={"dataset_name": "cnn_dailymail"},
)