In [None]:
from parser.models.question import (
    Question,
    SubQuestion,
    SubSubQuestion,
    MultipleChoiceQuestion,
)
from parser.models.syllabus import Syllabus
from parser.sq_ms_parser import SQMSParser
from parser.sq_parser import QuestionPaperParser
from parser.syllabus_parser import SyllabusParser
import pdfplumber
from typing import List, Optional
import re
import tqdm
import os


In [None]:
class LLMClassifier:
    """Formats parsed exam questions into prompt text for syllabus classification.

    The class stores the syllabus (as objects and as one pre-rendered string)
    together with API credentials, and offers static helpers that serialise
    questions into the line format described by ``GUIDE``.

    Annotations that mention project types are quoted so they are not
    evaluated when the class is defined (``X | Y`` between plain classes only
    works at runtime on Python 3.10+).
    """

    # Instruction prompt (in Chinese). It tells the model to act as a syllabus
    # classifier, to classify only the smallest question units (the ones that
    # carry "Answer:"), and fixes the input line format and the
    # "{question_number}:{syllabus_number}" one-per-line output format.
    GUIDE = """
    你是一个考试大纲分类器, 你需要把考试问题分类到考试大纲中. 你会得到一个考试大纲和一组考试问题. 考试问题可能是多层嵌套的, 你需要把每个问题都分类到考试大纲中. 你只需要分类最小的问题单位(带Answer:), 不需要分类父问题.
    输入格式为Number:{question_number} Text:{question_description}( Answer:{question_answer}) 输出格式为{question_number(如果有父问题, 组合number, 用空格连接)}:{syllabus_number},每个question占一行, 不需要输出多余信息.
    """

    # Mark allocations such as "[3]" or "[Total: 12]".
    _MARKS_RE = re.compile(r"\[(\d+|(Total: \d+))\]")
    # Runs of three or more dots (answer lines in the paper).
    _DOTS_RE = re.compile(r"\.{3,}")

    def __init__(
        self,
        api_key: str,
        api_url: str,
        syllabuses: "List[Syllabus]",
    ):
        """Store credentials and pre-render the syllabus text.

        Args:
            api_key: Key for the LLM endpoint (only stored here).
            api_url: URL of the LLM endpoint (only stored here).
            syllabuses: Syllabus entries; each is rendered with ``str()``.
        """
        self.syllabus = syllabuses
        # Every entry, including the last, is followed by a blank line.
        self.syllabus_str = "\n\n".join([str(syl) for syl in syllabuses]) + "\n\n"
        self.api_key = api_key
        self.api_url = api_url

    def classify_all(
        self, questions: "List[Question | MultipleChoiceQuestion]"
    ) -> "List[Question | MultipleChoiceQuestion]":
        """Build the prompt text for ``questions`` and return the questions.

        Args:
            questions: Either structured questions or multiple-choice
                questions; the kind is decided from the first element.

        Returns:
            The same list that was passed in (an empty list for empty input).
        """
        if not questions:
            # Nothing to format; avoids indexing into an empty list.
            return []
        text = self.format_question(questions)  # noqa: F841 - consumed by the TODO below
        # TODO(review): the request to the LLM endpoint and the parsing of its
        # reply are not implemented in this view, so no classification is
        # applied yet. The input is returned unchanged so that
        # `questions = classifier.classify_all(questions)` does not replace
        # the parsed questions with None.
        return questions

    @staticmethod
    def _clean_text(raw: str) -> str:
        """Strip mark allocations and dotted answer lines from question text."""
        without_marks = LLMClassifier._MARKS_RE.sub("", raw)
        return LLMClassifier._DOTS_RE.sub("", without_marks).strip()

    @staticmethod
    def format_question(
        questions: "List[Question | MultipleChoiceQuestion]",
    ) -> str:
        """Format ``questions`` with the formatter matching their kind.

        Args:
            questions: A list of questions, assumed to be all of one kind.

        Returns:
            ``""`` for an empty list, otherwise the output of ``format_mcq``
            (first element is a ``MultipleChoiceQuestion``) or of
            ``format_structured_question``.
        """
        if not questions:
            return ""
        if isinstance(questions[0], MultipleChoiceQuestion):
            return LLMClassifier.format_mcq(questions)
        return LLMClassifier.format_structured_question(questions)

    @staticmethod
    def format_structured_question(questions: "List[Question]") -> str:
        """Serialise structured questions, nesting sub-levels by indentation.

        Only leaf items get an ``Answer:`` field: a question without
        subquestions, a subquestion without sub-subquestions, or a
        sub-subquestion. Each top-level question is followed by a line of
        80 dashes.

        Args:
            questions: Objects exposing ``number``, ``text``, ``answer`` and
                ``subquestions`` (whose items expose ``subsubquestions``).

        Returns:
            The formatted text; ``""`` when ``questions`` is empty.
        """
        clean = LLMClassifier._clean_text
        parts: List[str] = []
        for q in questions:
            parts.append(f"Number:{q.number} Text:{clean(q.text)}")
            if q.subquestions:
                parts.append("\n")
                for sub_q in q.subquestions:
                    parts.append(f"    Number:{sub_q.number} Text:{clean(sub_q.text)}")
                    if sub_q.subsubquestions:
                        parts.append("\n")
                        for subsub_q in sub_q.subsubquestions:
                            parts.append(
                                f"        Number:{subsub_q.number} Text: {clean(subsub_q.text)}"
                                f" Answer:{subsub_q.answer or ''}\n"
                            )
                    else:
                        parts.append(f" Answer:{sub_q.answer or ''}\n")
            else:
                parts.append(f" Answer:{q.answer or ''}\n")
            parts.append("\n" + "-" * 80 + "\n")
        return "".join(parts)

    @staticmethod
    def format_mcq(questions: "List[MultipleChoiceQuestion]") -> str:
        """Serialise multiple-choice questions, one blank line between them.

        ``Options:`` and ``Answer:`` are emitted only when the corresponding
        attribute is truthy.

        Args:
            questions: Objects exposing ``number``, ``text``, ``options`` and
                ``answer``.

        Returns:
            The formatted text with surrounding whitespace stripped.
        """
        entries: List[str] = []
        for q in questions:
            entry = f"Number:{q.number} Text:{q.text}"
            if q.options:
                entry += f" Options: {', '.join(q.options)}"
            if q.answer:
                entry += f" Answer: {q.answer}"
            entries.append(entry)
        return "\n\n".join(entries).strip()

In [15]:
# Parse the syllabus while its PDF handle is still open.
# pages=(12, 46) is presumably the page range holding the syllabus content - TODO confirm.
with pdfplumber.open("papers/595426-2023-2025-syllabus.pdf") as syllabus_pdf:
    syllabus_parser = SyllabusParser(syllabus_pdf, pages=(12, 46))
    syllabuses = syllabus_parser.parse_syllabus()

# Extract the questions from the question paper.
with pdfplumber.open("papers/igcse-biology-0610/0610_w22_qp_42.pdf") as qppdf:
    sq_parser = QuestionPaperParser(
        qppdf,
        image_prefix="0610_w22_qp_42",
    )
    questions = sq_parser.parse_question_paper()

# The mark-scheme parser is given a file path (not an open PDF) plus the
# parsed questions; its result replaces `questions`.
sqms_parser = SQMSParser(
    "papers/igcse-biology-0610/0610_w22_ms_42.pdf",
    questions,
)
questions = sqms_parser.parse_ms()

49.6063


[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m
[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/15) Page 1
[INFO] (2/15) Page 2
[INFO] (3/15) Page 3
[INFO] (4/15) Page 4
[INFO] (5/15) Page 5
[INFO] (6/15) Page 6
[INFO] (7/15) Page 7
[INFO] (8/15) Page 8
[INFO] (9/15) Page 9
[INFO] (10/15) Page 10
[INFO] (11/15) Page 11
[INFO] (12/15) Page 12
[INFO] (13/15) Page 13
[INFO] (14/15) Page 14
[INFO] (15/15) Page 15


In [23]:
# Credentials are read from the environment; os.getenv returns None when a
# variable is not set, and that value is passed through as-is.
apikey = os.getenv("API_KEY")
apiurl = os.getenv("API_URL")
classifier = LLMClassifier(apikey, apiurl, syllabuses)
# Classification is currently disabled:
# questions = classifier.classify_all(questions)

# Dump the formatted questions for inspection. `questions` was produced by the
# question-paper parser in an earlier cell, so the structured-question
# formatter is used (presumably these are Question objects - verify if the
# input paper changes to a multiple-choice one).
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(LLMClassifier.format_structured_question(questions))
