In [1]:
import os
import pandas as pd
import numpy as np
from dynaconf import Dynaconf

from elqm import ELQMPipeline
from elqm.utils import get_configs_dir, get_data_dir, get_cache_dir

In [2]:
# Build the ELQM pipeline from the YAML settings file located in the directory returned by get_configs_dir().
# In the recorded run below no cache was found, so this call preprocessed and loaded the documents and
# created the FAISS vector stores, which took several minutes.
elqm = ELQMPipeline(config=Dynaconf(settings_files=os.path.join(get_configs_dir(), "256_nlc_bge_meta_fn_enrich.yaml")))

Cache key: 256_bge_fn_meta_enrich
No cache found for 256_bge_fn_meta_enrich


Loading documents: 100%|██████████| 550/550 [00:00<00:00, 1557.61it/s]
Transform Footnotes in HTML Docs: 100%|██████████| 550/550 [00:15<00:00, 35.12it/s]
Removing HTML tags: 100%|██████████| 550/550 [00:10<00:00, 51.27it/s]
Enriching Text with Metadata: 100%|██████████| 68126/68126 [00:00<00:00, 794206.36it/s]




Saving documents: 100%|██████████| 68126/68126 [00:02<00:00, 30568.89it/s]


Loading documents:


100%|██████████| 68126/68126 [02:12<00:00, 515.71it/s]


Created Document Loader DirectoryLoader
Loaded 68126 documents


  from .autonotebook import tqdm as notebook_tqdm


Created Embedding HuggingFaceEmbeddings


Creating FAISS vectorstores: 100%|██████████| 682/682 [02:55<00:00,  3.88it/s]


Created Retriever VectorStoreRetriever


In [3]:
# Load the documents through the pipeline's loader; the cells below use them as input for generation.
# (About two minutes for 68126 documents in the recorded run.)
documents = elqm.loader.load()

100%|██████████| 68126/68126 [02:06<00:00, 538.82it/s]


In [4]:
from elqm.eval.oracle import generate_question_answer_pairs

In [5]:
# Inspect the text stored under the 'text' metadata key of one document; the next cell passes
# exactly this text as the generation context.
print(documents[2].metadata['text'])

Community market. The Member States shall notify those provisions to the Commission by 20 November 2010 and shall notify it without delay of any subsequent amendment affecting them.
Article 21
Review
Not later than 2012, the Commission shall review the


In [6]:
# Smoke test on a single chunk. At this point in the notebook the name refers to the function imported
# from elqm.eval.oracle; the notebook-local re-definition only happens in a later cell.
generate_question_answer_pairs(
    context=documents[2].metadata['text'],
    prompt=None,  # default
    question_type=None,  # default
    n=1,
    verbose=True)

{QUESTION 1}: What is the deadline for notifying the Commission of any amendments to the provisions related to the Community market? {ANSWER 1}: Not later than 2012, the Commission shall review the provisions.

[('What is the deadline for notifying the Commission of any amendments to the provisions related to the Community market? ',
  'Not later than 2012, the Commission shall review the provisions.')]

In [7]:
from langchain_core.documents import Document
from tqdm import tqdm

import csv
import json
import os
import re
import shutil
import tempfile
import textwrap

import numpy as np
from bs4 import BeautifulSoup
from langchain.document_loaders import DirectoryLoader, JSONLoader
from langchain.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

from elqm.utils import get_data_dir

In [8]:
# Maps a question-type name to the extra instruction sentence that generate_question_answer_pairs
# inserts into its default prompt. "default" maps to an empty string (no extra instruction) and is
# also the fallback used for names that are not in this mapping.
# NOTE(review): the constant name is misspelled ("QUSTION"); it is left unchanged here because the
# functions below reference it by this name.
QUSTION_TYPES = {
    "confirmation": "Focus only on confirmation questions, i.e. questions that can be answered with a yes or no.",
    "factoid": "Focus only on factoid questions, that usually begin with a who, what, where, when, why, or how.",
    "list": "Focus only on list questions, i.e. questions that are answered with a list of items.",
    "causal": "Focus only on causal questions, i.e. questions that begin with why or how.",
    "hypothetical": "Focus only on hypothetical questions, i.e. questions that ask what if.",
    "complex": "Focus only on complex questions, i.e. questions that require multi-step reasoning and comparisons.",
    "default": ""
}


def generate_question_answer_pairs(context: str, prompt: str | None = None, question_type: str | None = None, n: int = 1, verbose: bool = False) -> list[tuple[str, str]]:
    """
    Generate question-answer pairs from a given context using the Ollama model.

    Parameters
    ----------
    context : str
        The context from which to generate the question-answer pairs.
    prompt : str, optional
        The prompt to use for the Ollama model. If None, a default prompt is used.
    question_type : str, optional
        The type of questions to generate. If None, any type of question is generated.
    n : int, optional
        The number of question-answer pairs to generate.
    verbose : bool, optional
        Whether to print the output of the Ollama model.

    Returns
    -------
    list[tuple[str, str]]
        A list of question-answer pairs.
    """
    if prompt is None:

        question_type_prompt = QUSTION_TYPES.get(question_type, QUSTION_TYPES["default"])  # type: ignore

        prompt = textwrap.dedent(f'''You are an oracle system or a Retrieval Augmented Generation System, that guesses questions that would answered by a particular exerpt of text.
            Fiven the following exerpt of text, generate {n} question{"s" if n > 1 else ""} that can be answered by the exerpt of text.
            ```
            {context}
            ```
             {question_type_prompt} \
            Format the pairs as follows:
            ```
            {{QUESTION i}}: <question i> {{ANSWER i}}: <answer i>
            ```
            Do not add any additional newlines between the pairs. Directly continue with the answer after the question.
            Only add a newline between the pairs (after the answer) if you want to add more pairs.
            Do not deviate from this format, since it will be used to extract the questions with the following regex: `{{QUESTION \\d+}}: .+ {{ANSWER \\d+}}: .+`
            ''')

    # Clear the message history by initializing a new Ollama instance
    ollama = Ollama(
        base_url="http://localhost:11434",
        model="llama2",
        verbose=True,
        stop=["<|im_end|>"]
    )

    if verbose:
        # Stream the output
        response = ""
        for token in ollama.stream(prompt):
            response += token
            print(token, end="")
    else:
        # Generate the question-answer pairs
        response = ollama.invoke(prompt)

    # Filter out the question-answer pairs
    qa_strings = re.findall(r'{QUESTION \d+}: .+ {ANSWER \d+}: .+', response)

    # Extract the question and answer from the question-answer pairs
    qa_pairs = []
    for qa_pair in qa_strings:
        question, answer = qa_pair.split("{ANSWER")[0].split("}: ")[1], qa_pair.split("{ANSWER")[1].split("}: ")[1]

        # Skip empty questions or answers
        if question != "" and answer != "":
            qa_pairs.append((question, answer))
        elif verbose:
            print(f"Question or answer is empty: {qa_pair}")

    # Check if any question-answer pairs were generated
    if len(qa_pairs) == 0 and verbose:
        print(f"No question-answer pairs were generated: {qa_pairs}")

    return qa_pairs

In [9]:
def generate_oracle_dataset(documents: list[Document], question_type: str | list[str] | None = None, n_questions_per_type: int = 1, verbose: bool = False) -> list[Document]:
    """
    Attach generated question-answer pairs to every document of a list.

    For each document and each requested question type, question-answer pairs are generated
    and collected under the 'oracle_pairs' key of the document's metadata. An existing
    'oracle_pairs' entry is replaced.

    Parameters
    ----------
    documents : list[Document]
        The documents to generate question-answer pairs for. They are modified in place.
    question_type : str | list[str], optional
        A single question type or a list of question types. If None, every key of
        `QUSTION_TYPES` is used, by default None
    n_questions_per_type : int, optional
        How many questions to request for each question type, by default 1
    verbose : bool, optional
        Whether to show a progress bar, by default False

    Returns
    -------
    list[Document]
        The same list that was passed in, with 'oracle_pairs' set in each document's metadata.
    """
    # Normalise the requested question types to a list
    if question_type is None:
        selected_types = list(QUSTION_TYPES.keys())
    elif isinstance(question_type, str):
        selected_types = [question_type]
    else:
        selected_types = question_type

    progress = tqdm(documents, desc="Generating question-answer pairs", disable=not verbose)
    for doc in progress:
        # Prefer the raw text kept in the metadata (present for enriched documents) over the page content
        source_text = doc.metadata['text'] if 'text' in doc.metadata else doc.page_content

        # Register the (initially empty) result list first, then fill it type by type
        collected = []
        doc.metadata['oracle_pairs'] = collected

        for current_type in selected_types:
            generated = generate_question_answer_pairs(source_text, question_type=current_type, n=n_questions_per_type, verbose=False)
            collected.extend(
                {'question': question, 'answer': answer, 'type': current_type}
                for question, answer in generated
            )

    return documents

In [10]:
# Generate oracle pairs for the first 10 documents only (about 43 s in the recorded run).
# The slice holds the same Document objects as `documents`, and generate_oracle_dataset writes
# 'oracle_pairs' into their metadata, so the first 10 entries of `documents` are modified as well.
oracle_documents = generate_oracle_dataset(documents[:10], n_questions_per_type=1, verbose=True)

Generating question-answer pairs: 100%|██████████| 10/10 [00:43<00:00,  4.32s/it]


In [11]:
# Display the first processed document (page content and metadata) to inspect the result.
oracle_documents[0]

Document(page_content='Document content:\nPortuguese Republic.\nDone at Lisbon on the seventeenth day of December in the year one thousand nine hundred and ninety-four.\nFait à Lisbonne, le dix-sept décembre mil neuf cent quatre-vingt-quatorze.\nGeschehen zu Lissabon am siebzehnten Dezember\nEUROVOC descriptor:environmental policy,energy policy,ECSC,EAEC,accession to an agreement,European charter\nDate of document:23/09/1997', metadata={'source': '/home/psaegert/Projects/elqm-INLPT-WS2023/cache/256_bge_fn_meta_enrich/preprocessed_documents/31998D0181_612.json', 'seq_num': 1, 'date_of_document': '1997-09-23', 'date_of_effect': '1997-09-23', 'date_of_signature': '', 'date_of_end_of_validity': 'No end date', 'author': 'Council of the European Union, European Commission', 'form': 'Decision', 'internal_comment': '', 'depositary': '', 'CELEX_ID': '31998D0181', 'text': 'Portuguese Republic.\nDone at Lisbon on the seventeenth day of December in the year one thousand nine hundred and ninety-fou