# BART Baseline

---

## Setup

---

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

%cd drive/MyDrive/CS\ 224N\ Project
%ls # verify that you are in the right directory

In [None]:
import torch
import numpy as np
import datasets
import pandas as pd
import os

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime
from datasets import Dataset

## Model and tokenizer

---

Download model and tokenizer. Use default parameters or try custom values (see [HF Bart configuration](https://huggingface.co/transformers/_modules/transformers/configuration_bart.html) and [Fairseq Bart](https://github.com/pytorch/fairseq/tree/master/examples/bart)).

In [None]:
# Maximum sequence lengths (in tokens) used when tokenizing sources and targets.
encoder_max_length = 256
decoder_max_length = 64

# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "facebook/bart-base"

# Tokenizer and seq2seq model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

## Data

---

### Load

In [None]:
# Read each split from its CSV file, keeping only the source ('text') and
# target ('comments') columns.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)[['text', 'comments']]
    for csv_path in ('aita_train_set.csv', 'aita_valid_set.csv', 'aita_test_set.csv')
)

In [None]:
# Wrap each pandas frame in a `datasets.Dataset` so it can be mapped/tokenized.
train_data_txt, validation_data_txt, test_data_txt = (
    Dataset.from_pandas(frame) for frame in (train_df, valid_df, test_df)
)

# Show the schema and row count of every split, one summary per line.
print(train_data_txt, validation_data_txt, test_data_txt, sep="\n")

Dataset({
    features: ['text', 'comments'],
    num_rows: 81614
})
Dataset({
    features: ['text', 'comments'],
    num_rows: 998
})
Dataset({
    features: ['text', 'comments'],
    num_rows: 998
})


### Prepare

**Preprocess and tokenize**

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of (text, comments) pairs into model-ready features.

    Args:
        batch: mapping with a "text" entry (sources) and a "comments" entry (targets).
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_source_length: length every source sequence is padded/truncated to.
        max_target_length: length every target sequence is padded/truncated to.

    Returns:
        dict holding every field produced by tokenizing the sources, plus
        "labels": the target token ids with padding positions replaced by -100.
    """
    sources, targets = batch["text"], batch["comments"]

    encoded_sources = tokenizer(
        sources,
        padding="max_length",
        truncation=True,
        max_length=max_source_length,
    )
    encoded_targets = tokenizer(
        targets,
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    features = dict(encoded_sources.items())

    # -100 marks positions the loss should ignore, so padding does not contribute.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in target_ids]
        for target_ids in encoded_targets["input_ids"]
    ]
    return features


# Tokenize the training and validation splits with identical settings. The raw
# text columns are dropped so only the tokenized features remain.
train_data, validation_data = (
    split.map(
        lambda batch: batch_tokenize_preprocess(
            batch, tokenizer, encoder_max_length, decoder_max_length
        ),
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_data_txt, validation_data_txt)
)

Map:   0%|          | 0/81614 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

## Training

---

### Metrics

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# ROUGE scorer consumed by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets` releases in
# favour of the `evaluate` package — confirm the installed version still provides it.
metric = datasets.load_metric("rouge")

# Punkt sentence-tokenizer data, needed by nltk.sent_tokenize in postprocess_text.
nltk.download("punkt", quiet=True)


def postprocess_text(preds, labels):
    """Normalize decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and then re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        ``(preds, labels)`` as two new lists of newline-separated sentences.
    """

    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text))

    # Strip both lists first, then split into sentences.
    stripped_preds, stripped_labels = (
        [text.strip() for text in texts] for texts in (preds, labels)
    )

    return (
        [_one_sentence_per_line(text) for text in stripped_preds],
        [_one_sentence_per_line(text) for text in stripped_labels],
    )


def compute_metrics(eval_preds):
    """Compute ROUGE F-measures and the mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. ``predictions``
            may itself be a tuple, in which case its first element is used.

    Returns:
        dict mapping every key returned by the ROUGE ``metric`` to its mid F-measure
        expressed as a percentage, plus ``"gen_len"`` (mean number of non-padding
        tokens per prediction). All values are rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is the ignore-index sentinel, not a vocabulary id, so it cannot be decoded.
    # Labels contain it by construction; predictions may contain it as well when
    # generated batches are padded to a common length. Map both back to the pad id
    # so decoding succeeds and the sentinel is not counted as a generated token.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {key: round(value, 4) for key, value in result.items()}

### Training arguments

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- run control and output locations ---
    output_dir="results",
    logging_dir="logs",
    do_train=True,
    do_eval=True,
    num_train_epochs=1,  # demo
    # --- batching ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- optimisation ---
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # --- evaluation-time decoding ---
    predict_with_generate=True,
    # --- logging and checkpointing ---
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    # --- currently disabled, kept for reference ---
    # evaluation_strategy="steps",
    # eval_steps=100,
    # metric_for_best_model="rouge1",
    # load_best_model_at_end=True
)

# Collator that batches the tokenized seq2seq examples for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Train

Train the model

In [None]:
# Start fine-tuning with the `trainer` built from `training_args`. As the last
# expression in the cell, the return value of `train()` is displayed.
trainer.train()

In [None]:
# Longest input (in tokens) accepted by the inference cells; longer texts are truncated.
max_input_length = 512

# Checkpoint to reload for inference — presumably a local directory next to this
# notebook; confirm it exists before running.
model_name = "bart-base-checkpoint-204000"
model_dir = model_name

# Replace the in-memory model and tokenizer with the ones stored in the checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
from pprint import pprint

In [None]:
# Row of the test set to inspect. Good samples: 368, 400
sample = 200

# Pull the post and its reference comment out of the selected row in one step.
test_text, test_comment = test_df.iloc[sample][['text', 'comments']]

pprint('text:')
pprint(test_text)
pprint('comment:')
pprint(test_comment)

In [None]:
# Tokenize the selected post as a batch of one, padded/truncated to max_input_length.
inputs = tokenizer(
    [test_text],
    max_length=max_input_length,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)

# Generate a comment of 10 to 64 tokens with beam search (8 beams) plus sampling.
output = model.generate(
    **inputs,
    num_beams=8,
    do_sample=True,
    min_length=10,
    max_length=64,
)

decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
print(decoded_output)

In [None]:
!pip install evaluate

In [None]:
"""
Calculate rouge score
"""
import evaluate

rouge = evaluate.load('rouge')

# Score only the first `num_eval_examples` rows of the test set. Slicing (rather than
# indexing row by row) also tolerates a test set with fewer rows than requested.
num_eval_examples = 100
eval_rows = test_df.iloc[:num_eval_examples]

inputs = tokenizer(
    eval_rows['text'].tolist(),
    max_length=max_input_length,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Generation samples (do_sample=True), so seed torch's RNG first; otherwise the
# reported ROUGE score changes from run to run.
torch.manual_seed(0)
preds = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

targets = eval_rows['comments'].tolist()
results = rouge.compute(predictions=decoded_preds, references=targets)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]