In [53]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from transformers import GPT2Tokenizer
from tqdm import notebook
from datasets import Dataset

In [54]:
train_dataset = pd.read_csv('/Users/srivatsasinha/Desktop/assignment/anlp-monsoon-24/dataset/cnn_dailymail/train_sampled.csv')
train_dataset.head()

Unnamed: 0.1,Unnamed: 0,id,article,highlights
0,55639,9dbe7a1f26fe037fec2a51676139a2c891614538,"By . Mia De Graaf . PUBLISHED: . 08:10 EST, 24...","Glenda Hull, 21, jumped off a building after s..."
1,244690,c8b0f45685916794fed29ad38cd9fd77af6392de,"(CNN) -- A Lumpkin, Georgia police officer who...",NEW: A city attorney says an officer violated ...
2,158191,58899f01c61608bad234fb1600f85162d6a88c22,(CNN) -- Jose Mourinho likes to keep them gues...,Jose Mourinho says he hasn't decided where he'...
3,69691,c58bea5298d6e751f47948e8d2607bd30a5d01ca,(CNN) -- Novak Djokovic has plenty of reasons ...,Novak Djokovic defeats Stan Wawrinka in straig...
4,8546,181bca3997c4d84bbd6f5da705d5cd7da4a83057,(CNN) -- Dozens of affiliates of the American ...,ACLU: Travel alerts inform people of their rig...


In [8]:
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [21]:
for x in notebook.tqdm(range(0,10)):
    pass

  0%|          | 0/10 [00:00<?, ?it/s]

In [63]:
def preprocess_function(examples, tokenizer, max_length=1000, set_type='train', padding_token_id=0):
    batch_size = len(examples["article"])  # Use the "article" column for input size
    inputs = [f"{x} Summary : " for x in notebook.tqdm(examples["article"])]  # Format the input with "Article" and "Summary"
    targets = examples["highlights"].to_list()  # Use the "highlights" column for targets

    model_inputs = tokenizer(inputs)  # Tokenize the inputs (articles with "Summary" prompt)
    labels = tokenizer(targets, add_special_tokens=False)  # Tokenize the summaries without special tokens

    for i in notebook.tqdm(range(batch_size)):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]  # Add EOS token at the end of the summary
        
        # Concatenate input (article) with the summary and prepare input IDs for the model
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids  # Mask the article part with -100

        # Set the attention mask for the concatenated sequence
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    for i in notebook.tqdm(range(batch_size)):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        
        # Pad the input and labels to the maximum length with right padding
        input_padding_length = max_length - len(sample_input_ids)
        label_padding_length = max_length - len(label_input_ids)

        # Update input IDs and attention masks with right padding
        model_inputs["input_ids"][i] = sample_input_ids + [padding_token_id] * input_padding_length
        model_inputs["attention_mask"][i] = [1] * len(sample_input_ids) + [0] * input_padding_length
        
        # For labels, keep the summary and pad with -100
        labels["input_ids"][i] = labels["input_ids"][i] + [-100] * label_padding_length

        # Convert to tensors and truncate if necessary
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    
    model_inputs["labels"] = labels["input_ids"]  # Add the labels (tokenized summaries) to the model inputs
    return Dataset.from_dict(model_inputs)


In [64]:
%time model_inputs = preprocess_function(train_dataset,tokenizer,max_length=2048)

  0%|          | 0/80000 [00:00<?, ?it/s]

  0%|          | 0/80000 [00:00<?, ?it/s]

  0%|          | 0/80000 [00:00<?, ?it/s]

CPU times: user 2min 48s, sys: 13.4 s, total: 3min 2s
Wall time: 3min 6s


In [69]:
model_inputs.save_to_disk("/Users/srivatsasinha/Desktop/assignment/anlp-monsoon-24/dataset/cnn_dailymail/hf_test.data")

Saving the dataset (0/5 shards):   0%|          | 0/80000 [00:00<?, ? examples/s]

In [68]:
model_inputs['labels'][0]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [73]:
tokenizer.decode(model_inputs['input_ids'][0])

'By. Mia De Graaf. PUBLISHED:. 08:10 EST, 24 November 2013. |. UPDATED:. 10:13 EST, 24 November 2013. \'Shame\': Glenda Hull \'could not live with the shame\' after being sexually assaulted, brother Roy claims. A young bride-to-be was driven to suicide after being sexually assaulted by a BBC presenter she idolised, her brother claims. Glenda Hull, 21, died in 1971 when she jumped off a 29-storey building \'completely out of the blue\'. It was just weeks after she went to see the BBC star at a conference in Manchester - before going for drinks and accepting a lift home, when she was assaulted in a car. Last night, her brother, Roy Hull, 57, appealed for witnesses to confirm his suspicion about the presenter who is still alive. He claims his sister could not live with the shame following her attack on a high road in the Peak District. He told the Sunday Mirror: \'As a family we have always been concerned that this man was responsible for abusing my sister – an act which led to her taking