In [1]:
import json
from transformers import pipeline

# Load test data from test.json.
# The file is read as JSON Lines: one JSON object per line, each providing the
# "dialogue" and "summary" keys used below.
with open("test.json", "r", encoding="utf-8") as f:
    # Strip each line and drop the empty ones so that a blank line (for example
    # a trailing one) is skipped instead of making json.loads raise.
    data = [json.loads(record) for record in map(str.strip, f) if record]

# Use only the first 10 dialogues
samples = data[:10]

# Load summarization pipeline using Facebook's BART model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize and print results
for i, sample in enumerate(samples):
    dialogue = sample["dialogue"]
    # truncation=True clips an over-long dialogue to the tokenizer's maximum
    # input length rather than letting the model fail on it; this matches the
    # Pegasus pipeline cell further down, which already passes it.
    summary = summarizer(
        dialogue,
        max_length=60,
        min_length=10,
        do_sample=False,  # deterministic decoding, no sampling
        truncation=True,
    )[0]["summary_text"]
    print(f"\nSample {i+1}:")
    print("Original Dialogue:", dialogue)
    print("Model Summary   :", summary)
    print("Reference Summary:", sample["summary"])


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(



Sample 1:
Original Dialogue: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Model Summary   : Hannah asks Amanda for Betty's number. Amanda can't find it. Hannah asks Larry. Amanda asks Larry to call Betty.
Reference Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Sample 2:
Original Dialogue: Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out 

# Using the SAMSum Dataset with the Pegasus Model

In [11]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model (Pegasus checkpoint named for the SAMSum dataset)
tokenizer = AutoTokenizer.from_pretrained("transformersbook/pegasus-samsum")
model = AutoModelForSeq2SeqLM.from_pretrained("transformersbook/pegasus-samsum")

# Load test data from test.json.
# The file is read as JSON Lines: one JSON object per line, each providing the
# "dialogue" and "summary" keys used below.
with open("test.json", "r", encoding="utf-8") as f:
    # Strip each line and drop the empty ones so that a blank line (for example
    # a trailing one) is skipped instead of making json.loads raise.
    data = [json.loads(record) for record in map(str.strip, f) if record]

# Use only the first 10 dialogues
samples = data[:10]

# Generate and print summaries
for i, sample in enumerate(samples):
    dialogue = sample["dialogue"]

    # Tokenize input; truncation=True clips over-long dialogues to the
    # tokenizer's maximum input length.
    inputs = tokenizer(dialogue, truncation=True, padding="longest", return_tensors="pt")

    # Generate summary with beam search
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=60,
        min_length=10,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode summary: only one dialogue was passed in, so take sequence 0
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print results
    print(f"\nSample {i+1}:")
    print("Original Dialogue:", dialogue)
    print("Model Summary    :", summary)
    print("Reference Summary:", sample["summary"])



Sample 1:
Original Dialogue: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Model Summary    : Amanda can't find Betty's number. Larry called her last time they were at the park together.
Reference Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Sample 2:
Original Dialogue: Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some 

# Using the Summarization Pipeline with the Pegasus SAMSum Model

In [9]:
from transformers import pipeline
import json

# Load the summarization pipeline for pegasus-samsum
summarizer = pipeline("summarization", model="transformersbook/pegasus-samsum")

# Load the JSON file (test.json).
# The file is read as JSON Lines: one JSON object per line, each providing the
# "dialogue" and "summary" keys used below.
with open("test.json", "r", encoding="utf-8") as f:
    # Strip each line and drop the empty ones so that a blank line (for example
    # a trailing one) is skipped instead of making json.loads raise.
    data = [json.loads(record) for record in map(str.strip, f) if record]

# Take first 10 samples
samples = data[:10]

# Run summarization
# NOTE(review): the cell output repeats "Your max_length is set to 128 ..." for
# each call. Presumably the pipeline's length check reads max_length from the
# model's generation config, which max_new_tokens does not override -- confirm
# against the installed transformers version before changing the arguments.
for i, sample in enumerate(samples):
    dialogue = sample["dialogue"]
    summary = summarizer(
        dialogue,
        max_new_tokens=60,    # Use max_new_tokens instead of max_length
        min_length=15,
        do_sample=False,      # deterministic decoding, no sampling
        truncation=True       # clip over-long dialogues to the tokenizer's maximum input length
    )[0]["summary_text"]

    print(f"\nSample {i+1}:")
    print("Original Dialogue:", dialogue)
    print("Model Summary    :", summary)
    print("Reference Summary:", sample["summary"])

Device set to use cuda:0
Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)



Sample 1:
Original Dialogue: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Model Summary    : Amanda can't find Betty's number. Larry called her last time they were at the park together.
Reference Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Sample 2:
Original Dialogue: Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some 

Your max_length is set to 128, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)



Sample 7:
Original Dialogue: Max: Know any good sites to buy clothes from?
Payton: Sure :) <file_other> <file_other> <file_other> <file_other> <file_other> <file_other> <file_other>
Max: That's a lot of them!
Payton: Yeah, but they have different things so I usually buy things from 2 or 3 of them.
Max: I'll check them out. Thanks. 
Payton: No problem :)
Max: How about u?
Payton: What about me?
Max: Do u like shopping?
Payton: Yes and no.
Max: How come?
Payton: I like browsing, trying on, looking in the mirror and seeing how I look, but not always buying.
Max: Y not?
Payton: Isn't it obvious? ;)
Max: Sry ;)
Payton: If I bought everything I liked, I'd have nothing left to live on ;)
Max: Same here, but probably different category ;)
Payton: Lol
Max: So what do u usually buy?
Payton: Well, I have 2 things I must struggle to resist!
Max: Which are?
Payton: Clothes, ofc ;)
Max: Right. And the second one?
Payton: Books. I absolutely love reading!
Max: Gr8! What books do u read?
Payton: Ever

Your max_length is set to 128, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)



Sample 9:
Original Dialogue: Beatrice: I am in town, shopping. They have nice scarfs in the shop next to the church. Do you want one?
Leo: No, thanks
Beatrice: But you don't have a scarf.
Leo: Because I don't need it.
Beatrice: Last winter you had a cold all the time. A scarf could help.
Leo: I don't like them.
Beatrice: Actually, I don't care. You will get a scarf.
Leo: How understanding of you!
Beatrice: You were complaining the whole winter that you're going to die. I've had enough.
Leo: Eh.
Model Summary    : Leo doesn't need a scarf because he had a cold all the time last winter. Beatrice will buy him one.
Reference Summary: Beatrice wants to buy Leo a scarf, but he doesn't like scarves. She cares about his health and will buy him a scarf no matter his opinion.


KeyboardInterrupt: 