In [1]:
import os
import json
from pathlib import Path
import pandas as pd
import pymupdf
from tqdm.notebook import tqdm
import random
import jinja2
import string
import textwrap
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
# Load the CPV division lookup table from the working directory.
# Further down it is queried with the division number (as a string) as key.
with open("cpv_divisions.json", mode="r", encoding="utf-8") as file:
    cpv_mapping = json.loads(file.read())

In [3]:
# Collect every PDF below the current working directory and extract its text
# page by page, producing one DataFrame row per page.
current_dir = Path(os.getcwd())
pdf_paths = list(current_dir.rglob("*.pdf"))

PAGE_COLUMNS = [
    "file",
    "text",
    "page",
    "division_number",
    "division_title",
    "language",
]
page_records = []

for p in tqdm(pdf_paths):
    try:
        parts = p.parts

        # Everything derived from the path is identical for all pages of a
        # document, so compute it once, before the file is even opened.
        cft_root = parts.index("docs") - 1
        doc_file = "/".join(parts[cft_root:])

        data_idx = parts.index("data")
        division_number = int(parts[data_idx + 1])
        division_title = cpv_mapping.get(str(division_number), None)
        lang = parts[data_idx + 2]

        # The context manager closes the document even if a page fails.
        with pymupdf.open(p) as doc:
            doc_records = [
                {
                    "file": doc_file,
                    "text": page.get_text(),
                    "page": idx + 1,  # 1-based page number
                    "division_number": division_number,
                    "division_title": division_title,
                    "language": lang,
                }
                for idx, page in enumerate(doc)
            ]
    except Exception as exc:
        # Report the path that actually failed. Catching Exception (not a bare
        # except) keeps Ctrl-C / kernel interrupts working.
        print("Could not open file", p, "-", repr(exc))
    else:
        # Rows are only added once the whole document was read, so a failure
        # half-way through can never leave the columns with unequal lengths.
        page_records.extend(doc_records)

# Passing the column list keeps the schema stable even when no page was read.
df = pd.DataFrame(page_records, columns=PAGE_COLUMNS)
df.head()

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,file,text,page,division_number,division_title,language
0,243602/docs/ksw_111_weisse_linie_renamed/aeb_k...,Stand 2022-01 – 04.2022 \nSeite 1 von 10 \n \n...,1,15,"Food, beverages, tobacco and related products",DE
1,243602/docs/ksw_111_weisse_linie_renamed/aeb_k...,AEB Kantonsspital Winterthur Ausgabe 2022-01 \...,2,15,"Food, beverages, tobacco and related products",DE
2,243602/docs/ksw_111_weisse_linie_renamed/aeb_k...,AEB Kantonsspital Winterthur Ausgabe 2022-01 \...,3,15,"Food, beverages, tobacco and related products",DE
3,243602/docs/ksw_111_weisse_linie_renamed/aeb_k...,AEB Kantonsspital Winterthur Ausgabe 2022-01 \...,4,15,"Food, beverages, tobacco and related products",DE
4,243602/docs/ksw_111_weisse_linie_renamed/aeb_k...,AEB Kantonsspital Winterthur Ausgabe 2022-01 \...,5,15,"Food, beverages, tobacco and related products",DE


In [4]:
# Spot-check the extraction: print the raw text of one randomly drawn page.
sampled_page = df.sample()
print(sampled_page["text"].item())

AEB Kantonsspital Winterthur Ausgabe 2022-01 
Stand 2022-01 – 04.2022 
Seite 8 von 10 
 
 
 
 
 
17 Rechnungsstellung / Zahlungskonditionen 
 
17.1 Rechnungsbeträge ohne Angaben der KSW-Bestellnummer, der bestellenden Stelle (Referenz- 
adresse) bzw. der Warenempfänger, Bestellpositionen, Stückzahlen, Lieferanten-Artikelnummer 
und Bezeichnung der Ware werden nicht fällig, solange die fehlenden Angaben nicht schriftlich 
nachgeliefert bzw. bestätigt werden. 
 
17.2 Sofern nichts anderes vereinbart wird, erfolgt die Zahlung nach Wahl des KSW innerhalb von 
45 Tagen nach Erhalt der gemäss den untenstehenden Vorgaben erstellten Rechnung mit 2% 
Skonto oder innerhalb von 60 Tagen nach Rechnungserhalt netto. Die Zahlungsfrist beginnt 
frühestens mit Eingang der den Anforderungen gemäss Ziff. 17.1 genügenden Rechnung, 
jedoch nicht vor Eingang der Lieferung bzw. Erbringung der Leistung. Vorbehalten bleibt die 
Verrechnung mit Gegenforderungen des KSW gegenüber dem Lieferanten 
 
17.3 Zusatzb

In [5]:
def generate_prompt(text, language):
    """Build the LLM prompt asking for one multiple-choice question about ``text``.

    The number of answer options is drawn at random (3 to 7, inclusive) and the
    options are labelled with consecutive uppercase letters starting at "A".

    Args:
        text: The source text the question has to be derived from.
        language: Language identifier inserted into the prompt as the language
            of the text and of the expected output.

    Returns:
        The rendered prompt as a string.
    """
    option_amt = random.randint(3, 7)
    option_labels = list(string.ascii_uppercase[:option_amt])

    # The template references `correct_label`. It used to be missing from the
    # render() call, so the requested output format contained
    # `"correct_answer": ""`. A placeholder in the same style as
    # "<Your question here>" is rendered instead.
    correct_label = "<Label of the correct option>"

    template_str = textwrap.dedent(
        """\
        Based on the following text, generate one multiple-choice question with {{ option_amt }} options. Indicate the correct option explicitly.
        Language of the text and your output: {{ language }}
        The multiple-choice question and answer needs to fulfill the following criteria:
        Self-contained: The correct answer can be directly derived from the text without prior knowledge.
        Distractors: The incorrect options are plausible but contradicted by specific details in the text.
        Clarity: Each option references a specific, verifiable fact from the provided information.
        
        Text:
        {{ text }}

        Output Format (in JSON):
        {
            "question": "<Your question here>",
            "options": {
                {% for label in option_labels %}\
                "{{ label }}": "Option {{ loop.index }}"{% if not loop.last %},{% endif %}
                {% endfor %}\
            },
            "correct_answer": "{{ correct_label }}"
        }
        """
    )
    # StrictUndefined turns a forgotten template variable into an error instead
    # of silently rendering it as an empty string.
    template = jinja2.Template(template_str, undefined=jinja2.StrictUndefined)

    prompt = template.render(
        option_amt=option_amt,
        text=text,
        option_labels=option_labels,
        language=language,
        correct_label=correct_label,
    )

    return prompt

In [6]:
def _request_completion(client, messages, model, max_tokens=1500, temperature=0.7):
    """Send one chat-completion request and return the stripped reply text."""
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # Guard against a missing (None) content so .strip() cannot crash the run.
    return (completion.choices[0].message.content or "").strip()


def ask_LLM_and_extract_MQC(prompts: list, model="meta-llama/Meta-Llama-3.1-405B-Instruct"):
    """Send every prompt to the LLM and collect the parsed multiple-choice questions.

    Args:
        prompts: List of ``(prompt, sample)`` tuples. ``prompt`` is the text sent
            to the model; ``sample`` is the record the prompt was built from and
            must provide a ``'language'`` entry.
        model: Name of the chat model to query.

    Returns:
        A list with one dict per successfully parsed reply. Each dict is the
        JSON object returned by the model, extended with ``"language"`` (taken
        from the sample) and ``"context"`` (the sample itself). Prompts whose
        reply could not be parsed, even after one retry, are skipped with a
        printed notice.
    """
    if not prompts:
        # Nothing to ask, so no credentials or client are needed.
        return []

    # Loading the environment and building the client is independent of the
    # individual prompt, so it happens once instead of once per iteration.
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("DEEPINFRA_API_KEY"),
        base_url=os.getenv("DEEPINFRA_BASE_URL"),
    )
    system_message = {
        "role": "system",
        "content": "You are a knowledgeable assistant that only responds in the given output format.",
    }

    mcq_list = []
    for prompt, sample in prompts:
        messages = [system_message, {"role": "user", "content": prompt}]
        response = _request_completion(client, messages, model)
        mcq_question = extract_MCQ(response)

        if not mcq_question:
            # Give the LLM one opportunity to correct itself: replay the
            # conversation and ask for valid JSON.
            messages = messages + [
                {"role": "assistant", "content": response},
                {"role": "user", "content": "I was unable to convert your response into a json file. There is an issue in the format of your response. Please provide the multiple-choice question again, this time in correct JSON Format."},
            ]
            response = _request_completion(client, messages, model)
            mcq_question = extract_MCQ(response)

        if mcq_question:
            mcq_question["language"] = sample['language']
            mcq_question['context'] = sample
            mcq_list.append(mcq_question)
        else:
            print("Unable to provide a MCQ.")

    return mcq_list

def extract_MCQ(response):
    """Extract the first JSON object embedded in an LLM reply.

    Parsing starts at the first ``{`` and stops at the end of that JSON object.
    Text before the object (an introduction, a code fence) and text after it
    are ignored, even if the trailing text contains braces of its own.

    Args:
        response: The raw reply text of the model.

    Returns:
        The decoded JSON object as a dict, or ``False`` when ``response`` is not
        a string, contains no ``{``, or no valid JSON object starts at the
        first ``{``.
    """
    if not isinstance(response, str):
        return False

    start = response.find("{")
    if start == -1:
        return False

    try:
        # raw_decode reads exactly one JSON value starting at `start` and does
        # not complain about whatever follows it.
        mcq_data, _ = json.JSONDecoder().raw_decode(response, start)
    except json.JSONDecodeError:
        return False
    return mcq_data

def save_to_dict(mcq_list, file_name="LLM_generated_multiple_choice_question.json"):
    """Write ``mcq_list`` as pretty-printed UTF-8 JSON to ``output/<file_name>``.

    The target directory is created when it does not exist yet, so the function
    also works in a fresh checkout.

    Args:
        mcq_list: JSON-serialisable data to store.
        file_name: Name of the file inside the ``output`` directory.
    """
    path = "output/" + file_name
    # Without this, open() raises FileNotFoundError when "output" is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(mcq_list, f, ensure_ascii=False, indent=4)
    print(f"Multiple-choice question saved to {path}")

In [7]:
def generate_MCQ_for_documents(file_name):
    """Create one MCQ per row of the global ``df`` and store the result.

    A prompt is built from the ``text`` and ``language`` of every row, all
    prompts are sent to the LLM, and the parsed questions are written to
    ``file_name`` via ``save_to_dict``.
    """
    prompts = [
        (generate_prompt(record['text'], record['language']), record)
        for record in df.to_dict(orient='records')
    ]
    generated = ask_LLM_and_extract_MQC(prompts)
    save_to_dict(generated, file_name=file_name)

In [8]:
def shuffle_mcqs(mcq):
    """Return four variants of ``mcq`` with the option texts shuffled.

    The option labels keep their order; only the texts are redistributed among
    them. ``correct_answer`` of each variant is the label that now carries the
    text of the originally correct option. The input dict is not modified.
    """
    question = mcq['question']
    options = mcq['options']
    correct_answer = mcq['correct_answer']
    language = mcq['language']

    variants = []
    for _ in range(4):
        texts = list(options.values())
        random.shuffle(texts)
        # Pair the unchanged labels with the reordered texts.
        reordered = dict(zip(options.keys(), texts))
        relabelled_answer = next(
            label
            for label, text in reordered.items()
            if text == options[correct_answer]
        )

        variant = {
            'question': question,
            'options': reordered,
            'correct_answer': relabelled_answer,
            'version': "reordered",
            'language': language,
            'context': mcq['context'],
        }
        variants.append(variant)

    return variants

def none_of_the_above_mcqs(mcq):
    """Return a copy of ``mcq`` whose last option is "none of the above".

    The text of the last option is replaced by a German phrase when the
    language is ``'DE'`` and by the English phrase otherwise. Labels and
    ``correct_answer`` are carried over unchanged; the input is not modified.
    """
    question = mcq['question']
    options = mcq['options']
    correct_answer = mcq['correct_answer']
    language = mcq['language']

    nota_str = (
        "Keine der oben genannten Optionen" if language == 'DE' else "None of the above"
    )

    texts = list(options.values())
    texts[-1] = nota_str  # overwrite the final option text
    replaced_options = dict(zip(options.keys(), texts))

    return {
        'question': question,
        'options': replaced_options,
        'correct_answer': correct_answer,
        'version': "nota",
        'language': language,
        'context': mcq['context'],
    }

def true_false(mcq):
    """Turn ``mcq`` into a true/false item about one randomly picked option.

    One option text is drawn at random and paired with the question. ``label``
    is ``True`` when the drawn text equals the text of the correct option. The
    instruction sentence is German for language ``'DE'`` and English otherwise.
    """
    question = mcq['question']
    options = mcq['options']
    correct_answer = mcq['correct_answer']
    language = mcq['language']

    candidate = random.choice(list(options.values()))
    is_correct = bool(options[correct_answer] == candidate)

    true_false_prompt = (
        "Ist das folgende Frage-Antwort Paar Richtig oder Falsch?"
        if language == 'DE'
        else "Is the following question-answer pair True or False?"
    )

    return {
        "true_false_prompt": true_false_prompt,
        "question": question,
        "answer": candidate,
        "label": is_correct,
        "version": "true_false",
        "language": language,
        'context': mcq['context'],
    }

def enhance_mcqs(mcq_list):
    """Expand every MCQ into a group of derived question variants.

    Each group holds the shuffled variants from ``shuffle_mcqs``, with the one
    at index 3 converted into a "none of the above" question, followed by a
    true/false item built from the original MCQ.
    """
    def _variants_for(mcq):
        group = shuffle_mcqs(mcq)
        group[3] = none_of_the_above_mcqs(group[3])
        group.append(true_false(mcq))
        return group

    return [_variants_for(mcq) for mcq in mcq_list]


In [9]:
# Generate one MCQ per extracted page, store them, then derive and store the
# enhanced variants (reordered, "none of the above", true/false).
file_name = "MCQs.json"

generate_MCQ_for_documents(file_name)

# save_to_dict writes UTF-8 with ensure_ascii=False, so the file has to be read
# back as UTF-8 too; the platform default encoding may differ and would garble
# or reject non-ASCII characters.
path = "output/" + file_name
with open(path, "r", encoding="utf-8") as f:
    mcq_list = json.load(f)

enhanced_mcqs = enhance_mcqs(mcq_list)
save_to_dict(enhanced_mcqs, file_name="enhanced.json")

Multiple-choice question saved to output/MCQs.json
Multiple-choice question saved to output/enhanced.json
