### Data load

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the pretrained PEGASUS (CNN/DailyMail) checkpoint: the tokenizer and
# the seq2seq model are both resolved from the same checkpoint name.
model_name = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_name
)



In [3]:
# Show the kernel's current working directory.
%pwd

'/DATA/pranta_2411ai09/DialogueSummarization/src'

In [6]:
# Download the zipped SAMSum data archive; -O writes it to the given path in the project's data folder.
!wget -O /DATA/pranta_2411ai09/DialogueSummarization/data/summarizer-data.zip https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip

--2025-09-26 09:57:04--  https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
Resolving github.com (github.com)... 20.207.73.82
Connecting to github.com (github.com)|20.207.73.82|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/entbappy/Branching-tutorial/master/summarizer-data.zip [following]
--2025-09-26 09:57:04--  https://raw.githubusercontent.com/entbappy/Branching-tutorial/master/summarizer-data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘/DATA/pranta_2411ai09/DialogueSummarization/data/summarizer-data.zip’


2025-09-26 09:57:05 (41.7 MB/s) - ‘/DATA/pranta_2411ai09/DialogueSummarization/data/summarizer-data.zip’ 

In [7]:
!unzip /DATA/pranta_2411ai09/DialogueSummarization/data/summarizer-data.zip -d /DATA/pranta_2411ai09/DialogueSummarization/data/

Archive:  /DATA/pranta_2411ai09/DialogueSummarization/data/summarizer-data.zip
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum-test.csv  
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum-train.csv  
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum-validation.csv  
   creating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/
 extracting: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/dataset_dict.json  
   creating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/test/
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/test/data-00000-of-00001.arrow  
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/test/dataset_info.json  
  inflating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/test/state.json  
   creating: /DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset/train/
  inflating: /DATA/pranta

In [8]:
from datasets import load_from_disk

# Read the extracted Arrow dataset directory back in.
dataset_samsum = load_from_disk(
    dataset_path='/DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset'
)

In [9]:
# Display the loaded dataset to inspect its splits, columns and row counts.
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [10]:
# Row count of every split, in the order the splits are stored in the dataset dict.
split_lengths = [len(split_ds) for split_ds in dataset_samsum.values()]

In [11]:
# Summarise the dataset, then show one test example (index 1) with its reference summary.
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
# sep="\n" puts the heading and the text on separate lines, exactly as two print() calls would.
print("\nDialogue:", dataset_samsum["test"][1]["dialogue"], sep="\n")
print("\nSummary:", dataset_samsum["test"][1]["summary"], sep="\n")

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [12]:
def convert_examples_to_features(example_batch, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogues and summaries into seq2seq training features.

    Relies on the notebook-level ``tokenizer``. Dialogues are encoded as the
    model inputs and summaries as the labels; each side is truncated to its
    own maximum length.

    Args:
        example_batch: Mapping with a 'dialogue' entry and a 'summary' entry,
            as supplied by ``dataset.map(..., batched=True)``.
        max_input_length: Token limit applied to each dialogue (default 1024).
        max_target_length: Token limit applied to each summary (default 128).

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the dialogue
        encoding, and 'labels' holding the summary token ids.
    """
    input_encodings = tokenizer(
        example_batch['dialogue'],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager. It is a separate call (rather than passing both texts
    # at once) so the summaries keep their own, shorter max_length.
    target_encodings = tokenizer(
        text_target=example_batch['summary'],
        max_length=max_target_length,
        truncation=True,
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }
    

#### Convert the raw dataset into the tokenized format the model expects for supervised learning.

In [13]:
# Tokenize every split; batched=True hands the function a batch of examples per call.
dataset_samsum_pt = dataset_samsum.map(
    function=convert_examples_to_features,
    batched=True,
)

Map: 100%|██████████| 14732/14732 [00:02<00:00, 5423.77 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 6606.93 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 6900.04 examples/s]


In [14]:
# Inspect the train split after mapping to check which columns it now holds.
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [15]:
# Look at one processed training record (index 1), including its token ids and labels.
dataset_samsum_pt['train'][1]

{'id': '13728867',
 'dialogue': 'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great',
 'summary': 'Olivia and Olivier are voting for liberals in this election. ',
 'input_ids': [18038,
  151,
  2632,
  127,
  119,
  6228,
  118,
  115,
  136,
  2974,
  152,
  10463,
  151,
  35884,
  130,
  329,
  107,
  18038,
  151,
  2587,
  314,
  1242,
  10463,
  151,
  1509,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [18038, 111, 34296, 127, 6228, 118, 33195, 115, 136, 2974, 107, 1]}

In [16]:
# Look at one processed test record (index 1), the same example printed earlier as raw text.
dataset_samsum_pt['test'][1]

{'id': '13729565',
 'dialogue': "Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric: I know! And shows how Americans see Russian ;)\r\nRob: And it's really funny!\r\nEric: I know! I especially like the train part!\r\nRob: Hahaha! No one talks to the machine like that!\r\nEric: Is this his only stand-up?\r\nRob: Idk. I'll check.\r\nEric: Sure.\r\nRob: Turns out no! There are some of his stand-ups on youtube.\r\nEric: Gr8! I'll watch them now!\r\nRob: Me too!\r\nEric: MACHINE!\r\nRob: MACHINE!\r\nEric: TTYL?\r\nRob: Sure :)",
 'summary': 'Eric and Rob are going to watch a stand-up on youtube.',
 'input_ids': [6303,
  151,
  60662,
  147,
  7374,
  151,
  485,
  131,
  116,
  167,
  17050,
  2000,
  147,
  6303,
  151,
  125,
  235,
  147,
  325,
  939,
  199,
  3361,
  236,
  3058,
  26408,
  7374,
  151,
  325,
  126,
  131,
  116,
  288,
  3765,
  147,
  6303,
  151,
  125,
  235,
  147,
  125,
  704,
  172,
  109,
  1976,
  297,
  147,
  7374,
  151,
  110,
  52228,
  147,
  566,
  156,
  

In [17]:
# Persist the tokenized splits next to the raw data so later steps can
# reload them from disk instead of re-running the mapping.
save_path = '/DATA/pranta_2411ai09/DialogueSummarization/data/samsum_pt_dataset'
dataset_samsum_pt.save_to_disk(dataset_dict_path=save_path)

Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 285489.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 89349.61 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 85956.17 examples/s]
