In [5]:
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    GenerationConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from transformers.generation.utils import EncoderDecoderCache
import evaluate
import torch
import numpy as np
import random
from collections import OrderedDict
from scipy.stats import wilcoxon, ttest_rel


In [7]:
seed = 413
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
inputs = {"raw": "dialogue", "resolved": "resolved_text"}
data = load_from_disk("samsum_new")
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [10]:
def preprocess(data, input):
    inputs = data[input]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
rouge = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.array(predictions)
    predictions = predictions.astype(np.int64)
    predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)
    predictions = predictions.tolist()
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    if isinstance(labels, tuple):
        labels = labels[0]
    labels = np.array(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = labels.astype(np.int64).tolist()
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}


In [None]:
all_metrics = OrderedDict()

trainer_raw = None
trainer_resolved = None


for version, input_ in inputs.items():
    tokenized_data = data.map(
        lambda x: preprocess(x, input_),
        batched=True,
        remove_columns=data["train"].column_names
    )

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
    model.config.use_cache = False
    for name, param in model.named_parameters():
        if any(f"encoder.block.{i}." in name for i in range(3)):
            param.requires_grad = False

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"/content/t5-base-{version}-finetuned",
        eval_strategy="epoch",
        learning_rate=0.0001,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=3,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=False,
        logging_dir=f"/content/logs-{version}",
        logging_strategy="epoch",
        logging_steps=500,
        report_to="none",
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    if version == "raw":
        trainer_raw = trainer
    else:
        trainer_resolved = trainer
    trainer.train()
    trainer.save_model(f"/content/t5-base-{version}-finetuned")

    all_metrics[version] = trainer.evaluate()


In [13]:
metrics_map = {
    "eval_loss":  "Loss",
    "eval_rouge1": "ROUGE1",
    "eval_rouge2": "ROUGE2",
    "eval_rougeL": "ROUGEL",
}

for version, metrics in all_metrics.items():
    print(f"\nMetrics for {version} input:")
    for key in metrics_map:
        if key in metrics:
            print(f"{metrics_map[key]:<10}: {metrics[key]:.4f}")



Metrics for raw input:
Loss      : 2.8259
ROUGE1    : 27.3721
ROUGE2    : 9.4496
ROUGEL    : 23.9784

Metrics for resolved input:
Loss      : 1.6585
ROUGE1    : 41.3422
ROUGE2    : 18.2445
ROUGEL    : 34.6729


In [None]:
raw_output = trainer_raw.predict(tokenized_data["validation"])
resolved_output = trainer_resolved.predict(tokenized_data["validation"])

raw_ids = raw_output.predictions
if isinstance(raw_ids, tuple):
    raw_ids = raw_ids[0]
resolved_ids = resolved_output.predictions
if isinstance(resolved_ids, tuple):
    resolved_ids = resolved_ids[0]
raw_ids = np.clip(raw_ids, 0, tokenizer.vocab_size - 1).astype(np.int64)
resolved_ids = np.clip(resolved_ids, 0, tokenizer.vocab_size - 1).astype(np.int64)

raw_outputs = tokenizer.batch_decode(raw_ids, skip_special_tokens=True)
resolved_outputs = tokenizer.batch_decode(resolved_ids, skip_special_tokens=True)
labels_ids = np.where(raw_output.label_ids!=-100, raw_output.label_ids, tokenizer.pad_token_id)
refs = tokenizer.batch_decode(labels_ids,skip_special_tokens=True)

rouge = evaluate.load("rouge")
rouge1_raw, rouge2_raw, rougeL_raw = [], [], []
rouge1_resolved, rouge2_resolved, rougeL_resolved = [], [], []
for pred_raw, pred_resolved, ref in zip(raw_outputs, resolved_outputs, refs):
    scores_raw = rouge.compute(predictions=[pred_raw], references=[ref])
    scores_resolved = rouge.compute(predictions=[pred_resolved], references=[ref])
    rouge1_raw.append(scores_raw["rouge1"])
    rouge2_raw.append(scores_raw["rouge2"])
    rougeL_raw.append(scores_raw["rougeL"])
    rouge1_resolved.append(scores_resolved["rouge1"])
    rouge2_resolved.append(scores_resolved["rouge2"])
    rougeL_resolved.append(scores_resolved["rougeL"])
rouge1_raw = np.array(rouge1_raw)
rouge1_resolved = np.array(rouge1_resolved)
rouge2_raw = np.array(rouge2_raw)
rouge2_resolved = np.array(rouge2_resolved)
rougeL_raw = np.array(rougeL_raw)
rougeL_resolved = np.array(rougeL_resolved)


In [22]:
for name, raw_, resolved_ in [("ROUGE-1", rouge1_raw, rouge1_resolved), ("ROUGE-2", rouge2_raw, rouge2_resolved), ("ROUGE-L",rougeL_raw,rougeL_resolved)]:
    wilcoxon_statistic, wilcoxon_pvalue = wilcoxon(resolved_, raw_, alternative="two-sided")
    t_statistic, t_pvalue = ttest_rel(resolved_, raw_, alternative="two-sided")
    print(f"{name} Wilcoxon statistic = {wilcoxon_statistic:.2f}  p-value = {wilcoxon_pvalue:.4f}")
    print(f"{name} Paired t-test statistic = {t_statistic:.2f}  p-value = {t_pvalue:.4f}")


ROUGE-1 Wilcoxon statistic = 32264.50  p-value = 0.0000
ROUGE-1 Paired t-test statistic = 23.06  p-value = 0.0000
ROUGE-2 Wilcoxon statistic = 37346.00  p-value = 0.0000
ROUGE-2 Paired t-test statistic = 15.58  p-value = 0.0000
ROUGE-L Wilcoxon statistic = 44085.00  p-value = 0.0000
ROUGE-L Paired t-test statistic = 19.10  p-value = 0.0000


In [19]:
def summary(sample_idx, data, model, tokenizer, model_resolved, tokenizer_resolved, device):
    text = data["test"][sample_idx]["dialogue"]
    resolved_text = data["test"][sample_idx]["resolved_text"]
    reference = data["test"][sample_idx]["summary"]

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    output = model.generate(**inputs, max_length=128)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    inputs_resolved = tokenizer_resolved(resolved_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    output_resolved = model_resolved.generate(**inputs_resolved, max_length=128)
    summary_resolved = tokenizer_resolved.decode(output_resolved[0], skip_special_tokens=True)

    print("Input")
    print(text)
    print()
    print("Summary from Raw Model")
    print(summary)
    print()
    print("Summary from Anaphora Resolution Model")
    print(summary_resolved)
    print()
    print("Reference Summary")
    print(reference)
    print()


In [20]:
tokenizer = AutoTokenizer.from_pretrained("/content/t5-base-raw-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/t5-base-raw-finetuned").to(device)

tokenizer_resolved = AutoTokenizer.from_pretrained("/content/t5-base-resolved-finetuned")
model_resolved = AutoModelForSeq2SeqLM.from_pretrained("/content/t5-base-resolved-finetuned").to(device)


In [21]:
summary(
    sample_idx=17,
    data=data,
    model=model,
    tokenizer=tokenizer,
    model_resolved=model_resolved,
    tokenizer_resolved=tokenizer_resolved,
    device=device
)


Input
Igor: Shit, I've got so much to do at work and I'm so demotivated. 
John: It's pretty irresponsible to give that much work to someone on their notice period.
Igor: Yeah, exactly! Should I even care?
John: It's up to you, but you know what they say...
Igor: What do you mean?
John: Well, they say how you end things shows how you really are...
Igor: And now how you start, right?
John: Gotcha! 
Igor: So what shall I do then? 
John: It's only two weeks left, so grit your teeth and do what you have to do. 
Igor: Easy to say, hard to perform.
John: Come on, stop thinking, start doing! 
Igor: That's so typical of you!  ;)  

Summary from Raw Model
John has been working for his job.

Summary from Anaphora Resolution Model
Igor has two weeks left to finish work.

Reference Summary
Igor has a lot of work on his notice period and he feels demotivated. John thinks he should do what he has to do nevertheless. 

