In [1]:
import datasets
import evaluate

# ROUGE scorer; later cells use it to compare generated answers with the reference answer.
rouge = evaluate.load("rouge")

# Long-form question answering (LFQA) dataset that is already preprocessed.
# It is similar to ELI5 (https://facebookresearch.github.io/ELI5/index.html),
# which is no longer available, so the author's filtered version is loaded instead.
dataset_lfqa = datasets.load_dataset(
    "stefanbschneider/lfqa-max-answer-length-512",
)
dataset_lfqa

README.md:   0%|          | 0.00/3.31k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/7.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/202767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2646 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context'],
        num_rows: 202767
    })
    validation: Dataset({
        features: ['question', 'answer', 'context'],
        num_rows: 2646
    })
})

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# One tokenizer (from the base checkpoint) is shared by both models in the cells below.
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

# Base checkpoint, used as the comparison point.
base_model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384")

# Fine-tuned LFQA checkpoint, pinned to the "main" revision.
tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    "stefanbschneider/led-base-16384-lfqa-ans-len-512",
    revision="main",
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/231 [00:00<?, ?B/s]

In [3]:
# Take the first training record as the running example for the cells below;
# it is a dict with the keys 'question', 'answer' and 'context'.
example = dataset_lfqa["train"][0]
example

{'question': "what's the difference between a forest and a wood?",
 'answer': "They're used interchangeably a lot. You'll get different answers from different resources, but the general consensus seems to be that woods are smaller than forests.\n\n >  A wood is an area covered in trees, larger than a grove or a copse. A forest is also an area covered in trees, but it is larger than a wood\n\n >  The U.S. National Vegetation Classification system differentiates them according to their densities: 25 to 60 percent of a a wood is covered by tree canopies, while 60 to 100 percent of a forest is canopied.",
 'context': ['Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus"

In [4]:
# Prompt layout: "question: <question>, context: <context passages joined by single spaces>".
# NOTE(review): the name `input` shadows the Python builtin; it is kept because a later cell reads it.
context_text = " ".join(example["context"])
input = f"question: {example['question']}, context: {context_text}"
tokens = tokenizer(input, return_tensors="pt")

# Base model: generate with max_length=512, then decode the first returned
# sequence to text with special tokens stripped.
base_model_output = base_model.generate(**tokens, max_length=512)
base_model_answer = tokenizer.decode(base_model_output[0], skip_special_tokens=True)
base_model_answer

Input ids are automatically padded from 727 to 1024 to be a multiple of `config.attention_window`: 1024


'question: what\'s the difference between a forest and a wood?, context: Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus" "spp"). Woodland is defined by Chambers English dictionary as "land covered with wood" i.e. dominated by tree species. Forestry is defined as "1. the science and art of planting, tending and managing forests; 2. Forest country". This implies that forests have been planted by mankind for a variety of purposes, but mostly for exploitation for timber and pulp for the paper industry. The majority of Forests in Wales were planted by the British Forestry Commission, a UK government agency. Since 2016 the Forestry Commission in Wales has been taken o

In [5]:
# ROUGE scores of the base model's answer against the reference answer of the example.
rouge.compute(
    predictions=[base_model_answer],
    references=[example["answer"]],
)

{'rouge1': 0.18969072164948456,
 'rouge2': 0.033126293995859216,
 'rougeL': 0.11958762886597939,
 'rougeLsum': 0.15257731958762888}

In [6]:
# Same generation and decoding steps for the fine-tuned model, reusing the
# tokenized prompt (`tokens`) and the same max_length=512 as for the base model.
tuned_model_output = tuned_model.generate(**tokens, max_length=512)
# Only the first returned sequence is decoded, with special tokens stripped.
tuned_model_answer = tokenizer.decode(tuned_model_output[0], skip_special_tokens=True)
tuned_model_answer

'A forest is an area of land covered with trees. A wood is a piece of land that is covered by trees.\n\nA tree is a part of a forest.\n\n_URL_0_\n\nThe difference between a forest and a wood is that a forest is more dense than a tree, and a tree is more complex than a wood. \n\nSo, a forest means that a tree grows in a way that allows it to grow in a more dense area. \nA wood means that it grows in an area where it can grow in more dense areas.  A forest is a place where a tree can grow and grow in less dense areas than a forest can grow. \n\n\nA forest means a place that grows in such dense areas that it can be used for a variety of purposes.  For example, a tree that grows on a tree will grow in such a way to support a tree.  The tree will be able to support the tree, but the tree will not be strong enough to support it.  \n\n\n\n\n\n"A forest" means a forest where a forest grows in the same way as a tree growing in a different area.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n > \n\n >\n\n\n >

In [7]:
# ROUGE scores of the fine-tuned model's answer against the same reference answer.
rouge.compute(
    predictions=[tuned_model_answer],
    references=[example["answer"]],
)

{'rouge1': 0.28070175438596495,
 'rouge2': 0.11586901763224182,
 'rougeL': 0.14035087719298248,
 'rougeLsum': 0.2606516290726817}

In [11]:
from transformers import pipeline

# Text2text-generation pipeline for the base checkpoint. The checkpoint is given
# by name, so the pipeline loads it itself rather than reusing `base_model`.
base_pipeline = pipeline(task="text2text-generation", model="allenai/led-base-16384")

Device set to use mps:0


In [12]:
# Quick check of the base pipeline on a short hand-written prompt that follows the
# same "question: ..., context: ..." layout as the prompt built from the dataset example.
base_pipeline("question: What is the capital of Germany?, context: Germany is a country in Europe. Its capital is Berlin.")

[{'generated_text': 'question: What is the capital of Germany?, context: Germany is a country in Europe. Its'}]

In [None]:
# Text2text-generation pipeline for the fine-tuned LFQA checkpoint. The checkpoint is
# given by name, so the pipeline loads it itself rather than reusing `tuned_model`.
lfqa_pipeline = pipeline(task="text2text-generation", model="stefanbschneider/led-base-16384-lfqa-ans-len-512")


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Device set to use mps:0


In [9]:
# Run the fine-tuned pipeline on the same short hand-written prompt that was given
# to the base pipeline, so the two outputs can be compared directly.
lfqa_pipeline("question: What is the capital of Germany?, context: Germany is a country in Europe. Its capital is Berlin.")

Input ids are automatically padded from 25 to 1024 to be a multiple of `config.attention_window`: 1024


[{'generated_text': 'The capital of Germany is Berlin.\n\nThe capital is Berlin, which is the capital of the German state of Brandenburg.\n\n_URL_0_\n\nGermany is the largest city in Europe, with a population of around 10 million.\n \n\nIt\'s the capital, Berlin. \n\n_\n_\n\n\n\n"Berlin" is the name of the city in Germany. "Berliner" is a German name for the city of Berlin.\n\n\n\n\n" Berlin" is an abbreviation of "Berlin", which means "German capital".\n\n\n\n\n\nThe name of Berlin is "Berliner", which is a name for Berlin.\n\n\n\nThe title of the "German Capital of Germany" is German for "Germany" and "German City of Berlin".\n\nBerlin is a city in the German capital of Berlin, Berlin is a country in Europe. \n\n\nGermany has a capital called Berlin, and a city called Berlin is Germany\'s capital.  \n\n\n\n\n\n_\n \n \n\n\n\n\n\n\n >\n\n\n >\n\n\n_\n\n\n >\n >\n\n\n >\r\n\n \n\n > \n > \n\n >\n\n\n\n \n  \n_ >\n > >\n_\r\n >_\n >;\n\n-\n\n--\n\n\n--\n\n*\n\n\\-\n\n---\n\nHope this is

In [10]:
# Show the full prompt built from the dataset example, then run it through the
# fine-tuned pipeline.
# NOTE: `input` here is the prompt string assigned in an earlier cell; it shadows
# the Python builtin of the same name.
print(input)
lfqa_pipeline(input)

question: what's the difference between a forest and a wood?, context: Wood is divided, according to its botanical origin, into two kinds: softwoods, from coniferous trees, and hardwoods, from broad-leaved trees. Softwoods are lighter and generally simple in structure, whereas hardwoods are harder and more complex. However, in Australia, "softwood" generally describes rain forest trees, and "hardwood" describes Sclerophyll species ("Eucalyptus" "spp").
 Woodland is defined by Chambers English dictionary as "land covered with wood" i.e. dominated by tree species. Forestry is defined as "1. the science and art of planting, tending and managing forests; 2. Forest country". This implies that forests have been planted by mankind for a variety of purposes, but mostly for exploitation for timber and pulp for the paper industry. The majority of Forests in Wales were planted by the British Forestry Commission, a UK government agency. Since 2016 the Forestry Commission in Wales has been taken ov

[{'generated_text': 'A forest is an area of land covered with trees. A wood is a piece of land that is covered by trees.\n\nA tree is a part of a forest.\n\n_URL_0_\n\nThe difference between a forest and a wood is that a forest is more dense than a tree, and a tree is more complex than a wood. \n\nSo, a forest means that a tree grows in a way that allows it to grow in a more dense area. \nA wood means that it grows in an area where it can grow in more dense areas.  A forest is a place where a tree can grow and grow in less dense areas than a forest can grow. \n\n\nA forest means a place that grows in such dense areas that it can be used for a variety of purposes.  For example, a tree that grows on a tree will grow in such a way to support a tree.  The tree will be able to support the tree, but the tree will not be strong enough to support it.  \n\n\n\n\n\n"A forest" means a forest where a forest grows in the same way as a tree growing in a different area.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\