# Setup

In [6]:
%load_ext lab_black

<IPython.core.display.Javascript object>

In [9]:
# wrap output text
from IPython.display import HTML, display


def set_css(info=None):
    """Display a ``<style>`` block that makes ``<pre>`` text wrap.

    Args:
        info: Ignored. IPython passes an execution-info object to
            ``pre_run_cell`` callbacks; the parameter is optional so the
            function can still be called directly with no arguments.
    """
    display(
        HTML(
            """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
        )
    )


# Run set_css before every cell execution.
get_ipython().events.register("pre_run_cell", set_css)

<IPython.core.display.Javascript object>

In [10]:
import torch

# Record the library version and whether a CUDA device can be used.
pytorch_version = torch.__version__
cuda_available = torch.cuda.is_available()

# gpu_info is only filled in when CUDA is available; otherwise it stays None.
gpu_info = None
if cuda_available:
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    gpu_info = f"Using GPU: {device_name} (Device {current_device})"

# Report what was found.
print(f"PyTorch Version: {pytorch_version}")
if not cuda_available:
    print("CUDA (GPU) is not available. Running on CPU.")
else:
    print("CUDA (GPU) is available.")
    print(gpu_info)

PyTorch Version: 2.0.1
CUDA (GPU) is not available. Running on CPU.


<IPython.core.display.Javascript object>

In [15]:
import os
import textwrap
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

# load data

In [18]:
# Alternative dataset, currently disabled:
# huggingface_dataset_name = "knkarthick/dialogsum"
# dataset = load_dataset(huggingface_dataset_name)

# Load the "samsum" dataset. Later cells read its "train", "validation" and
# "test" splits and the "dialogue" / "summary" fields of individual rows.
dataset = load_dataset("samsum")

<IPython.core.display.Javascript object>

# view data


In [19]:
# Report the number of rows in each split of the loaded dataset.
for split_name in ("train", "validation", "test"):
    print(f"{split_name} num rows : {len(dataset[split_name])}")

train num rows : 14732
validation num rows : 818
test num rows : 819


<IPython.core.display.Javascript object>

In [20]:
def print_dashes():
    """Print a horizontal rule made of 80 dashes."""
    print("-" * 80)


def txtwrap(text):
    """Return ``text`` wrapped by ``textwrap.fill`` to a width of 80 columns."""
    return textwrap.fill(text, width=80)


def print_dialogue(x):
    """Print every newline-separated line of ``x``, each wrapped to 80 columns.

    Args:
        x: String to print; it is split on ``"\\n"`` and each piece is wrapped
            and printed separately.

    Returns:
        None. A plain loop is used so that no list of ``None`` values is built
        (and displayed when the call is the last expression of a cell).
    """
    for line in x.split("\n"):
        print(txtwrap(text=line))

<IPython.core.display.Javascript object>

In [21]:
# Training-set row indices to display.
view_data_ix = [10, 12]

print_dashes()
for i in view_data_ix:
    example = dataset["train"][i]
    print(f"index__{i}")
    print_dashes()
    # Show the dialogue first, then its reference summary.
    for heading, field in (("DIALOGUE:", "dialogue"), ("\nSUMMARY:", "summary")):
        print(heading)
        print_dialogue(example[field])
    print_dashes()

--------------------------------------------------------------------------------
index__10
--------------------------------------------------------------------------------
DIALOGUE:
Lucas: Hey! How was your day?
Demi: Hey there!
Demi: It was pretty fine, actually, thank you!
Demi: I just got promoted! :D
Lucas: Whoa! Great news!
Lucas: Congratulations!
Lucas: Such a success has to be celebrated.
Demi: I agree! :D
Demi: Tonight at Death & Co.?
Lucas: Sure!
Lucas: See you there at 10pm?
Demi: Yeah! See you there! :D

SUMMARY:
Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.
--------------------------------------------------------------------------------
index__12
--------------------------------------------------------------------------------
DIALOGUE:
Anita: I'm at the station in Bologna
Jenny: No problems so far?
Anita: no, everything's going smoothly
Tomy: good!

SUMMARY:
Anita is at Bologna station.
--------------------------------------------------------

<IPython.core.display.Javascript object>

# load model

In [22]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_id = "google/flan-t5-base"

# Load the tokenizer and the seq2seq model for `model_id` (FLAN-T5 base)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

In [23]:
def baseline_summary_predictions(dialogue, summary, tokenizer, model):
    """Print a dialogue, its reference summary and the model's own output.

    The raw dialogue text is encoded as-is (no instruction is added), passed
    to ``model.generate`` with ``max_new_tokens=100``, and the first returned
    sequence is decoded with special tokens skipped.

    Args:
        dialogue: Dialogue text to encode and print.
        summary: Reference summary printed for comparison.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``.
    """
    input_ids = tokenizer.encode(dialogue, return_tensors="pt")

    # Inputs: the dialogue followed by the reference summary.
    print_dashes()
    print("DIALOGUE:")
    print_dialogue(dialogue)
    print()
    print("ACTUAL SUMMARY:")
    print(txtwrap(summary))
    print()

    generated = model.generate(input_ids, max_new_tokens=100)
    model_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Output: what the model produced.
    print("BASELINE MODEL SUMMARY")
    print(txtwrap(model_summary))
    print_dashes()


# Training-set rows to run through the model without any prompt.
view_data_ix = [140, 100]


for i in view_data_ix:
    example = dataset["train"][i]
    baseline_summary_predictions(
        example["dialogue"],
        example["summary"],
        tokenizer=tokenizer,
        model=model,
    )

--------------------------------------------------------------------------------
DIALOGUE:
Liz: we're going to Madrid!
Katherine: say whaaaat gurl
Liz: yeah, me and Jerry
Liz: first to Berlin
Liz: we're there for like two nights
Liz: and then we fly to Spain! <file_gif>
Katherine: niceeee
Liz: end of March, beginning of April
Liz: im so exciteeeeed
Katherine: happy for u :)

ACTUAL SUMMARY:
Liz and Jerry will go to Madrid at the end of March. They will spend two nights
in Berlin first.

BASELINE MODEL SUMMARY
Liz and Jerry are going to Madrid. They're going to Berlin for two nights and
then they're going to Spain.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
DIALOGUE:
Gabby: How is you? Settling into the new house OK?
Sandra: Good. The kids and the rest of the menagerie are doing fine. The dogs
absolutely love the new garden. Plenty of room to dig and run around.
Gabby: 

<IPython.core.display.Javascript object>

In [None]:
# dummy_sent = "hello bob how are you doing today, how is the weather hellloo helloo theree how are you"
# dummy_sent_enc = tokenizer.encode(dummy_sent, return_tensors="pt")
# print(dummy_sent_enc)
# print(tokenizer.decode(dummy_sent_enc[0], skip_special_tokens=True))
# model.generate(dummy_sent_enc, max_new_tokens=100)

# Prompts and outputs


## Zero shot inference with instruction

In [24]:
def zero_shot(dialogue, summary, prompt, tokenizer, model):
    """Wrap a dialogue in a prompt, generate from it and print the comparison.

    Args:
        dialogue: Dialogue text handed to ``prompt``.
        summary: Reference summary printed after the model output.
        prompt: Callable that takes the dialogue and returns the prompt text.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``; called with ``max_new_tokens=100``.
    """
    print_dashes()

    print("PROMPT")
    prompt_text = prompt(dialogue)
    print_dialogue(prompt_text)
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")
    print()

    generated = model.generate(input_ids, max_new_tokens=100)
    model_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Model output first, then the reference, each followed by a blank line.
    sections = (
        ("BASELINE MODEL SUMMARY", model_summary),
        ("ACTUAL SUMMARY:", summary),
    )
    for heading, body in sections:
        print(heading)
        print(txtwrap(body))
        print()
    print_dashes()


# Zero-shot prompt templates keyed by a short name. Each value is a callable
# that takes the dialogue text and returns the complete prompt string.
prompt_map = {
    # Bare "Dialogue / Summary" framing with no explicit instruction.
    "p1": lambda d: f"\nDialogue -\n{d}\n\nSummary - ?\n",
    # Instruction placed before the dialogue.
    "p2": lambda d: f"\nSummarize the conversation below:\n{d}\n",
    "p3": lambda d: f"\nExplain the conversation:\n{d}\n",
    # Question placed after the dialogue.
    "p4": lambda d: f"\nDialogue:\n{d}\n\nWhat is going on in the conversation here?\n",
}


# Training-set rows to summarize with the "p4" zero-shot prompt.
query_ix = [140, 100]


for i in query_ix:
    example = dataset["train"][i]
    zero_shot(
        dialogue=example["dialogue"],
        summary=example["summary"],
        prompt=prompt_map["p4"],
        tokenizer=tokenizer,
        model=model,
    )

--------------------------------------------------------------------------------
PROMPT

Dialogue:
Liz: we're going to Madrid!
Katherine: say whaaaat gurl
Liz: yeah, me and Jerry
Liz: first to Berlin
Liz: we're there for like two nights
Liz: and then we fly to Spain! <file_gif>
Katherine: niceeee
Liz: end of March, beginning of April
Liz: im so exciteeeeed
Katherine: happy for u :)

What is going on in the conversation here?


BASELINE MODEL SUMMARY
Liz and Jerry are going to Madrid. They're going to Berlin for two nights and
then to Spain.

ACTUAL SUMMARY:
Liz and Jerry will go to Madrid at the end of March. They will spend two nights
in Berlin first.

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
PROMPT

Dialogue:
Gabby: How is you? Settling into the new house OK?
Sandra: Good. The kids and the rest of the menagerie are doing fine. The dogs
absolutely love the new garde

<IPython.core.display.Javascript object>

## Few Shot Inference

In [25]:
def few_shot(query, summary, tokenizer, model):
    """Generate from a ready-made few-shot prompt and print the comparison.

    Args:
        query: Complete prompt text; it is printed and encoded unchanged.
        summary: Reference summary printed after the model output.
        tokenizer: Object providing ``encode`` and ``decode``.
        model: Object providing ``generate``; called with ``max_new_tokens=200``.
    """
    banner = "#" * 30

    print_dashes()

    print("PROMPT")
    print(query)
    input_ids = tokenizer.encode(query, return_tensors="pt")
    print()

    generated = model.generate(input_ids, max_new_tokens=200)
    model_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    # The results section is fenced by a line of '#' above and below.
    print(banner)
    print("BASELINE MODEL SUMMARY")
    print(txtwrap(model_summary))
    print()
    print("ACTUAL SUMMARY:")
    print(txtwrap(summary))
    print(banner)
    print()
    print_dashes()


def make_prompt(query, example_ix, dataset):
    """Build a few-shot prompt from solved examples plus an unanswered query.

    Args:
        query: Dialogue text to place in the final, unanswered block.
        example_ix: Iterable of indices into ``dataset["train"]``; each
            selected row contributes one dialogue/summary block, in order.
        dataset: Mapping with a ``"train"`` entry whose rows expose
            ``"dialogue"`` and ``"summary"`` fields. It is not read when
            ``example_ix`` is empty.

    Returns:
        The example blocks followed by the query block, whose summary slot is
        left as ``"?"``.
    """
    shots = []
    for i in example_ix:
        # Fetch the row once and reuse it for both fields.
        example = dataset["train"][i]
        shots.append(
            f"DIALOGUE:\n{example['dialogue']}\n\n"
            f"SUMMARY OF DIALOGUE:\n{example['summary']}\n\n"
        )

    return "".join(shots) + f"DIALOGUE:\n{query}\n\nSUMMARY OF DIALOGUE:?\n\n"


# make_prompt(dataset['train'][16]['dialogue'],[30], dataset)

<IPython.core.display.Javascript object>

### single shot inference

In [26]:
# Training-set rows to summarize, each preceded by one solved example (row 100).
query_ix = [160, 210, 340]

for i in query_ix:
    row = dataset["train"][i]
    q, s = row["dialogue"], row["summary"]
    few_shot_prompt = make_prompt(q, [100], dataset)

    few_shot(query=few_shot_prompt, summary=s, tokenizer=tokenizer, model=model)

--------------------------------------------------------------------------------
PROMPT
DIALOGUE:
Gabby: How is you? Settling into the new house OK?
Sandra: Good. The kids and the rest of the menagerie are doing fine. The dogs absolutely love the new garden. Plenty of room to dig and run around.
Gabby: What about the hubby?
Sandra: Well, apart from being his usual grumpy self I guess he's doing OK.
Gabby: :-D yeah sounds about right for Jim.
Sandra: He's a man of few words. No surprises there. Give him a backyard shed and that's the last you'll see of him for months.
Gabby: LOL that describes most men I know.
Sandra: Ain't that the truth! 
Gabby: Sure is. :-) My one might as well move into the garage. Always tinkering and building something in there.
Sandra: Ever wondered what he's doing in there?
Gabby: All the time. But he keeps the place locked.
Sandra: Prolly building a portable teleporter or something. ;-)
Gabby: Or a time machine... LOL
Sandra: Or a new greatly improved Rabbit :-

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


##############################
BASELINE MODEL SUMMARY
Ann Marie is eligible for a reimbursement of the travel costs from her
university. Maria will find the receipt and forward it to Ann Marie.

ACTUAL SUMMARY:
Maria will send Ann Marie the receipt for the airbnb in Boston, because Ann
Marie wants the university to reimburse her travel costs.
##############################

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
PROMPT
DIALOGUE:
Gabby: How is you? Settling into the new house OK?
Sandra: Good. The kids and the rest of the menagerie are doing fine. The dogs absolutely love the new garden. Plenty of room to dig and run around.
Gabby: What about the hubby?
Sandra: Well, apart from being his usual grumpy self I guess he's doing OK.
Gabby: :-D yeah sounds about right for Jim.
Sandra: He's a man of few words. No surprises there. Give him a backyard shed and that's the las

<IPython.core.display.Javascript object>

## two shot

In [27]:
# Training-set rows to summarize, each preceded by two solved examples
# (rows 100 and 700).
query_ix = [160, 210, 340]

for i in query_ix:
    row = dataset["train"][i]
    q, s = row["dialogue"], row["summary"]
    few_shot_prompt = make_prompt(q, [100, 700], dataset)

    few_shot(query=few_shot_prompt, summary=s, tokenizer=tokenizer, model=model)

--------------------------------------------------------------------------------
PROMPT
DIALOGUE:
Gabby: How is you? Settling into the new house OK?
Sandra: Good. The kids and the rest of the menagerie are doing fine. The dogs absolutely love the new garden. Plenty of room to dig and run around.
Gabby: What about the hubby?
Sandra: Well, apart from being his usual grumpy self I guess he's doing OK.
Gabby: :-D yeah sounds about right for Jim.
Sandra: He's a man of few words. No surprises there. Give him a backyard shed and that's the last you'll see of him for months.
Gabby: LOL that describes most men I know.
Sandra: Ain't that the truth! 
Gabby: Sure is. :-) My one might as well move into the garage. Always tinkering and building something in there.
Sandra: Ever wondered what he's doing in there?
Gabby: All the time. But he keeps the place locked.
Sandra: Prolly building a portable teleporter or something. ;-)
Gabby: Or a time machine... LOL
Sandra: Or a new greatly improved Rabbit :-

<IPython.core.display.Javascript object>

- We can see that even with two-shot inference, the results did not improve.
- This model needs further fine-tuning if we want it to summarize these conversations.