### Import the `retrigen` library functions

In [18]:
from retrigen import (
    filter_text_by_td,
    q_eval_system_prompt,
    q_eval_user_prompt,
    json_api_call,
    filter_text_by_llm,
    generate_question_template,
    question_api_call,
    QuestionContextManager,
    generate_questions,
    initialize_chroma_collection,
    add_documents_to_chroma,
    filter_context_candidates,
    c_eval_system_prompt,
    c_eval_user_prompt,
    context_question_llm_assesment,
)

### Code for testing and running all functions

In [9]:
from dotenv import load_dotenv
from datasets import load_dataset

# Load environment variables from the .env file, which contains the OpenAI API key.
# override=True lets values from .env replace variables that are already set in the environment.
load_dotenv(override=True)

True

In [6]:
# Load the "retsinformation" configuration (train split) of the benchmark dataset from the Hugging Face Hub
ds_vejledninger = load_dataset(
    "jealk/dk_retrieval_benchmark",
    "retsinformation",
    split="train",
    # download_mode="force_redownload",  # uncomment to ignore the local cache and download again
)

# Convert the Hugging Face dataset to a pandas DataFrame and preview the first rows
df_vejledninger = ds_vejledninger.to_pandas()
df_vejledninger.head()

Unnamed: 0,url,title,html_content,text_content
0,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om regulering af satser fra 1. janu...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om regulering af satser fra 1. janu...
1,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om satser i 2024 for betaling af ud...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om satser i 2024 for betaling af ud...
2,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om obligatorisk selvbooking af jobs...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om obligatorisk selvbooking af jobs...
3,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning til bekendtgørelse om tilskud til s...,"<div class=""document-content"" id=""restylingRoo...",Vejledning til bekendtgørelse om tilskud til s...
4,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om fleksløntilskud m.v.,"<div class=""document-content"" id=""restylingRoo...",Vejledning om fleksløntilskud m.v.\n1.Indledni...


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Tokenizer used by token_length_function to measure chunk length in tokens.
# NOTE(review): this is the tokenizer of "intfloat/e5-base-v2", while the contexts are later embedded with
# "intfloat/multilingual-e5-base"; the two models may count tokens differently — confirm the mismatch is intended.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")


def token_length_function(text_input):
    """Return the number of tokens `tokenizer` produces for `text_input`.

    Special tokens are excluded from the count (add_special_tokens=False).
    """
    token_ids = tokenizer.encode(text_input, add_special_tokens=False)
    return len(token_ids)


# Recursive splitter targeting chunks of up to 512 tokens (counted by token_length_function), with no overlap.
# NOTE(review): the separator list has no " " or "" fallback, so a passage containing none of these
# separators presumably cannot be split further and may exceed chunk_size — confirm this is acceptable.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "? ", "! "],
    length_function=token_length_function,
    chunk_size=512,
    chunk_overlap=0,
)

# Split every document's text into chunks, attaching the source document's title as metadata.
# NOTE: LangChain's text splitter is very slow here (compared to LlamaIndex): 2+ minutes on the author's CPU.
split_documents = text_splitter.create_documents(
    list(df_vejledninger["text_content"]),
    metadatas=[
        {"title": doc_title}
        for doc_title in df_vejledninger["title"]
    ],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1350 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Run the first 300 chunks through the TextDescriptives quality check; returns the Documents that passed
docs_passed_td = filter_text_by_td(split_documents[0:300], filter_type=True)

In [11]:
# Run the first 50 documents that passed the previous filter through the LLM quality check;
# returns the documents that passed
docs_passed_llm = filter_text_by_llm(docs_passed_td[:50])

Evaluating texts: 100%|██████████| 50/50 [00:36<00:00,  1.36it/s]


In [12]:
# Generate questions for a sub-sample (the first 10) of the documents that passed the LLM check
qc_meta = generate_questions(docs_passed_llm[:10])

100%|██████████| 10/10 [00:19<00:00,  1.97s/it]


In [13]:
# Remove questions that are shorter than min_length or longer than max_length (in characters),
# then display 3 example question-context pairs.
qc_meta.filter_questions_by_length(min_length=20, max_length=150)  # default values
qc_meta.display_question_context_pairs(3)

Removed 0 questions.
Question: Hvem skal regulere løbende erstatninger tilkendt før 1. januar 2024?

Context: De private arbejdsskadeforsikringsselskaber samt de arbejdsgivere, der er fritaget for at afgive risikoen efter loven, skal selv regulere løbende erstatninger, som er tilkendt før 1. januar 2024. Ved løbende erstatninger tilkendt i 2024 vil det fremgå af Arbejdsmarkedets Erhvervssikrings afgørelse, hvilke beløb, der skal udbetales i 2024.
----------------------------------------
Question: Hvordan beregnes grundlønnen for løbende erstatninger for tab af erhvervsevne ifølge Arbejdstilsynets bilag fra den 5. januar 2024?

Context: Arbejdstilsynet, den 5. januar 2024
Sine Frederiksen
/ Helle Klostergaard Christensen
Bilag 1
Bilaget indeholder eksempler på beregninger af kapitalerstatninger, godtgørelsesbeløb og overgangsbeløb samt løbende erstatninger og godtgørelser, som tilskadekomne eller dennes efterladte har ret til efter lov om arbejdsskadesikring, lov om sikring mod følger a

In [14]:
import chromadb

# Initialize a Chroma DB client instance and add the contexts to a collection in the DB
chroma_client = chromadb.Client()
collection_name = "qc_collection"
embedding_model = "intfloat/multilingual-e5-base"

# For testing, delete the previous collection
# chroma_client.delete_collection(collection_name)

db_collection = initialize_chroma_collection(chroma_client, collection_name, embedding_model)
# NOTE(review): the E5 models document their passage prefix as "passage: " (with a trailing space) — confirm
# that add_documents_to_chroma inserts the separating space itself; otherwise the prefix is fused to the text.
add_documents_to_chroma(db_collection, qc_meta.contexts, document_prepend="passage:")

In [15]:
# Identify context candidates that might be an additional 'match' to the generated questions.
# The result is displayed below as a dict mapping an ID to a list of candidate IDs.
context_candidates_id = filter_context_candidates(db_collection, qc_meta, dist_threshold=0.05, include_origin_context=False)
context_candidates_id

{'ce7f0145-6b81-4993-bf04-ec70915545bd': [],
 'f728d57c-9839-4069-9bba-7bbba99d8e87': ['c357410f-4a94-44c4-9a8f-3810d5b958bd'],
 'a40606c7-f82a-484a-8372-dabf074b66c3': ['1f2971c7-495c-4b0b-8d3a-d2f37f0505d0'],
 '4222f9ec-74b7-4f58-a14f-12ed330e55c6': ['ad0802f9-381c-41f8-9b24-ac17313e5d66',
  'e1f09c75-b310-46e2-9d8c-74c2d6f5247b'],
 'adedee21-2125-49cd-9891-d387c4a4b70d': ['1f2971c7-495c-4b0b-8d3a-d2f37f0505d0',
  'e1f09c75-b310-46e2-9d8c-74c2d6f5247b'],
 'f7657ebe-228a-429b-89bd-a34f3660e583': [],
 '523e5d48-bf77-435e-ad2b-cf56eb56347d': ['ad0802f9-381c-41f8-9b24-ac17313e5d66',
  '1f2971c7-495c-4b0b-8d3a-d2f37f0505d0',
  'd3378632-21c2-4ab7-bdee-fdbf42b3fc7f'],
 '02516716-f8ae-4c33-ae9d-15cd255342d3': ['ad0802f9-381c-41f8-9b24-ac17313e5d66',
  'd3378632-21c2-4ab7-bdee-fdbf42b3fc7f'],
 'cdb5511b-fdfc-40c1-88a5-e449b603734f': ['ad0802f9-381c-41f8-9b24-ac17313e5d66'],
 '0bd4f8ae-3a2d-445f-98ad-46e9dd41220e': ['d3378632-21c2-4ab7-bdee-fdbf42b3fc7f',
  '4d72b0f8-abeb-4cd1-ab9c-5944ff383c

In [19]:
# Use LLM calls to assess whether the candidate contexts do indeed contain the answer to the given question
question_context_matches = context_question_llm_assesment(context_candidates_id, qc_meta)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:11<00:00,  1.15s/it]


In [20]:
# Use the class's built-in method to update the question-context pairs, given the verified list of matches
qc_meta.update_question_context_pairs(question_context_matches)

In [22]:
# View the updated question-context ID pairs
qc_meta.question_context_id_pairs

{'ce7f0145-6b81-4993-bf04-ec70915545bd': ['c357410f-4a94-44c4-9a8f-3810d5b958bd'],
 'f728d57c-9839-4069-9bba-7bbba99d8e87': ['f10565b5-9e32-4f30-b6f2-75c879a0e6b0'],
 'a40606c7-f82a-484a-8372-dabf074b66c3': ['e1f09c75-b310-46e2-9d8c-74c2d6f5247b',
  '1f2971c7-495c-4b0b-8d3a-d2f37f0505d0'],
 '4222f9ec-74b7-4f58-a14f-12ed330e55c6': ['1f2971c7-495c-4b0b-8d3a-d2f37f0505d0'],
 'adedee21-2125-49cd-9891-d387c4a4b70d': ['a54cc7fb-c05a-4e02-950b-bd62e7218673',
  '1f2971c7-495c-4b0b-8d3a-d2f37f0505d0',
  'e1f09c75-b310-46e2-9d8c-74c2d6f5247b'],
 'f7657ebe-228a-429b-89bd-a34f3660e583': ['f75cb4e8-5f6a-48a4-88e4-1b3a5e9a6696'],
 '523e5d48-bf77-435e-ad2b-cf56eb56347d': ['768df1cd-6257-4b27-ba37-6ff4cde0fe3b',
  '1f2971c7-495c-4b0b-8d3a-d2f37f0505d0',
  'd3378632-21c2-4ab7-bdee-fdbf42b3fc7f'],
 '02516716-f8ae-4c33-ae9d-15cd255342d3': ['4d72b0f8-abeb-4cd1-ab9c-5944ff383cfe',
  'ad0802f9-381c-41f8-9b24-ac17313e5d66',
  'd3378632-21c2-4ab7-bdee-fdbf42b3fc7f'],
 'cdb5511b-fdfc-40c1-88a5-e449b603734f': [