In [3]:
import datasets
import evaluate

rouge = evaluate.load("rouge")

# Long-form question answering dataset, nicely preprocessed already.
# Similar to ELI5: https://facebookresearch.github.io/ELI5/index.html (which is unavailable now)
# I use my filtered version
dataset_lfqa = datasets.load_dataset("stefanbschneider/lfqa-max-answer-length-512")
dataset_lfqa

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context'],
        num_rows: 202767
    })
    validation: Dataset({
        features: ['question', 'answer', 'context'],
        num_rows: 2646
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
base_model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384")
tuned_model = AutoModelForSeq2SeqLM.from_pretrained("stefanbschneider/led-base-16384-lfqa-ans-len-512", revision="main")

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

In [5]:
example = dataset_lfqa["train"][0]
example

{'question': "what's the difference between a forest and a wood?",
 'answer': "They're used interchangeably a lot. You'll get different answers from different resources, but the general consensus seems to be that woods are smaller than forests.\n\n >  A wood is an area covered in trees, larger than a grove or a copse. A forest is also an area covered in trees, but it is larger than a wood\n\n >  The U.S. National Vegetation Classification system differentiates them according to their densities: 25 to 60 percent of a a wood is covered by tree canopies, while 60 to 100 percent of a forest is canopied.",
 'context': ['Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus"

In [6]:
input = f"question: {example['question']}, context: {' '.join(example['context'])}"    
tokens = tokenizer(input, return_tensors="pt")

# base model
base_model_output = base_model.generate(**tokens, max_length=512)
base_model_answer = tokenizer.decode(base_model_output[0], skip_special_tokens=True)
base_model_answer

Input ids are automatically padded from 727 to 1024 to be a multiple of `config.attention_window`: 1024


'question: what\'s the difference between a forest and a wood?, context: Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus" "spp"). Woodland is defined by Chambers English dictionary as "land covered with wood" i.e. dominated by tree species. Forestry is defined as "1. the science and art of planting, tending and managing forests; 2. Forest country". This implies that forests have been planted by mankind for a variety of purposes, but mostly for exploitation for timber and pulp for the paper industry. The majority of Forests in Wales were planted by the British Forestry Commission, a UK government agency. Since 2016 the Forestry Commission in Wales has been taken o

In [7]:
rouge.compute(predictions=[base_model_answer], references=[example['answer']])

{'rouge1': 0.18969072164948456,
 'rouge2': 0.033126293995859216,
 'rougeL': 0.11958762886597939,
 'rougeLsum': 0.15257731958762888}

In [None]:
# same for fine-tuned model
tuned_model_output = tuned_model.generate(**tokens, max_length=1024)
tuned_model_answer = tokenizer.decode(tuned_model_output[0], skip_special_tokens=True)
tuned_model_answer

'A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest is a forest that is not a forest. A forest i

In [9]:
rouge.compute(predictions=[tuned_model_answer], references=[example['answer']])

{'rouge1': 0.06093189964157705,
 'rouge2': 0.014388489208633094,
 'rougeL': 0.06093189964157705,
 'rougeLsum': 0.06093189964157705}

In [13]:
from transformers import pipeline

lfqa_pipeline = pipeline(task="text2text-generation", model="stefanbschneider/led-base-16384-lfqa-ans-len-512")


Device set to use mps:0


In [14]:
lfqa_pipeline("question: What is the capital of Germany?, context: Germany is a country in Europe. Its capital is Berlin.")

Input ids are automatically padded from 25 to 1024 to be a multiple of `config.attention_window`: 1024


[{'generated_text': 'Germany is a country in Europe. Germany is a country in Europe. Germany is a country in'}]

In [15]:
print(input)
lfqa_pipeline(input)

question: what's the difference between a forest and a wood?, context: Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus" "spp").
 Woodland is defined by Chambers English dictionary as "land covered with wood" i.e. dominated by tree species. Forestry is defined as "1. the science and art of planting, tending and managing forests; 2. Forest country". This implies that forests have been planted by mankind for a variety of purposes, but mostly for exploitation for timber and pulp for the paper industry. The majority of Forests in Wales were planted by the British Forestry Commission, a UK government agency. Since 2016 the Forestry Commission in Wales has been taken ov

[{'generated_text': 'Woodland is a forest, but wood is a forest. Woodland is a forest, but'}]