# Extractor

## Setup MongoDB

In [2]:
from pymongo import MongoClient

In [3]:
client = MongoClient(host="localhost", port=27017)
client.list_database_names()

['admin', 'ca', 'config', 'courses', 'local', 'my_store', 'subrata', 'test']

In [4]:
courses_db = client.get_database("courses")

print(courses_db.list_collection_names())

['CA-Inter-P5 - Chapter 2 - Audit Strategy, Audit Planning and Audit Programme', 'CA-P1-M1-C1-U1-Theory']


In [5]:
courses_db.drop_collection(
    name_or_collection="CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit"
)

{'ok': 1.0}

In [6]:
courses_db.drop_collection(
    name_or_collection="CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit"
)

{'ok': 1.0}

In [7]:
print(courses_db.list_collection_names())

['CA-Inter-P5 - Chapter 2 - Audit Strategy, Audit Planning and Audit Programme', 'CA-P1-M1-C1-U1-Theory']


In [8]:
ca_inter_p5_c1_collection = courses_db.get_collection(
    name="CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit"
)

In [9]:
ca_inter_p5_c1_cursor = ca_inter_p5_c1_collection.find({})
ca_inter_p5_c1_cursor

<pymongo.synchronous.cursor.Cursor at 0x10445aba0>

In [11]:
for document in ca_inter_p5_c1_cursor:
    print(document)

In [10]:
from docx import Document
from typing import List, Dict


class DocxExtractor:
    """Read a .docx file and expose its text as '****'-delimited blocks."""

    def __init__(self, file_path: str):
        """Open the document at ``file_path`` and keep both on the instance."""
        self.file_path = file_path
        self.document = Document(docx=file_path)

    def extract_full_text(self) -> str:
        """Return the text of every paragraph, in order, joined by newlines."""
        paragraphs = self.document.paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    def split_into_blocks(self, full_text: str) -> List[str]:
        """Cut ``full_text`` at each '****' marker; pieces are not stripped."""
        separator = "****"
        return full_text.split(separator)


class TextBlockProcessor:
    """Turn raw '****'-separated text blocks into a keyed chapter mapping."""

    @staticmethod
    def process_blocks(blocks: List[str]) -> Dict[str, str]:
        """Build a mapping of the chapter name plus numbered section texts.

        Args:
            blocks: Raw text pieces, e.g. the result of splitting on '****'.
                Pieces that are empty or whitespace-only are ignored.

        Returns:
            A dict with "chapter_name" (the first line of the first non-blank
            block) and keys "1", "2", ... for the section texts, in order.
            Text following the chapter name inside the first block, if any,
            is the first section. An empty dict is returned when every block
            is blank.
        """
        text_blocks: Dict[str, str] = {}

        # Drop every blank block (not only a leading one) and trim the rest.
        stripped_blocks = (block.strip() for block in blocks)
        cleaned = [block for block in stripped_blocks if block]
        if not cleaned:
            return text_blocks

        # The chapter name is everything up to the first newline.
        chapter_name, _, remaining = cleaned[0].partition("\n")
        text_blocks["chapter_name"] = chapter_name.strip()

        # Number sections consecutively from 1 whether or not the first block
        # carried body text after the chapter name; a fixed start of 2 for the
        # later blocks would leave section "1" missing in that case.
        section_texts = ([remaining.strip()] if remaining else []) + cleaned[1:]
        for number, text in enumerate(section_texts, start=1):
            text_blocks[str(number)] = text

        return text_blocks


def extract_text_blocks(file_path: str) -> Dict[str, str]:
    """Read the .docx at ``file_path`` and return its chapter/section mapping.

    The document text is split into blocks by ``DocxExtractor`` and then keyed
    by ``TextBlockProcessor.process_blocks``.
    """
    docx_reader = DocxExtractor(file_path=file_path)
    raw_blocks = docx_reader.split_into_blocks(
        full_text=docx_reader.extract_full_text()
    )
    return TextBlockProcessor.process_blocks(blocks=raw_blocks)


In [12]:
ca_inter_p5_c1_docx = extract_text_blocks(
    file_path="CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit.docx"
)

In [14]:
ANSWERS_PROMPT = """Given the following text and a set of questions, generate detailed answers to each question based exclusively on the content provided in the text. Your answers should:
 
- Be accurate and directly reference information from the text.
- Avoid using any external knowledge or information not present in the text.
- Be clear and concise.
 
Your output must be in JSON format, structured as an array of objects. Each object should contain the following two fields:
 
- "question": The question text.
- "answer": The answer to the question based on the input text.
 
**Requirements:**
 
- Answer each question thoroughly using only the information from the input text.
- Do not include any introductions, explanations, or conclusions in your output.
- The output should be strictly in JSON format without any additional text.
 
**Input Text:**
 
[Insert the study material text here.]
 
**Questions:**
 
[Insert 4-5 questions here.]"""

```json
{
  "_id": ObjectId(),
  "chapterName": "Chapter Name",
  "sections": [
    {
      "sectionNumber": "1",
      "content": "Section content",
      "questions": [],
      "questionAnswerPairs": []
    },
    {
      "sectionNumber": "2",
      "content": "Section content",
      "questions": [],
      "questionAnswerPairs": []
    },
    // More sections...
  ]
}
```

```python
from pymongo import MongoClient
from bson import ObjectId

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["course_database"]


def create_chapter_collection(chapter_name):
    """Return the collection in ``db`` that belongs to ``chapter_name``.

    The collection name is "chapter_" followed by the chapter name with
    spaces replaced by underscores and lower-cased.
    """
    slug = chapter_name.replace(" ", "_").lower()
    collection_name = f"chapter_{slug}"
    return db[collection_name]


def insert_initial_chapter(collection, chapter_data):
    """Insert one chapter document built from the extracted DOCX mapping.

    Every key of ``chapter_data`` other than "chapter_name" becomes a section
    entry with empty ``questions`` and ``questionAnswerPairs`` lists.
    Returns the ``inserted_id`` reported by ``collection.insert_one``.
    """
    section_docs = []
    for key, text in chapter_data.items():
        if key == "chapter_name":
            continue
        section_docs.append(
            {
                "sectionNumber": str(key),
                "content": text,
                "questions": [],
                "questionAnswerPairs": [],
            }
        )

    inserted = collection.insert_one(
        {"chapterName": chapter_data["chapter_name"], "sections": section_docs}
    )
    return inserted.inserted_id


def get_chapter(collection, chapter_id):
    """Return the result of looking up the document whose ``_id`` is ``chapter_id``."""
    id_filter = {"_id": chapter_id}
    return collection.find_one(id_filter)


def update_section_questions(collection, chapter_id, section_number, questions):
    """Set the ``questions`` field of one section inside a chapter document."""
    section_filter = {"_id": chapter_id, "sections.sectionNumber": section_number}
    # "sections.$" addresses the array element matched by the filter above.
    change = {"$set": {"sections.$.questions": questions}}
    collection.update_one(section_filter, change)


def update_section_qa_pairs(collection, chapter_id, section_number, qa_pairs):
    """Set the ``questionAnswerPairs`` field of one section inside a chapter document."""
    section_filter = {"_id": chapter_id, "sections.sectionNumber": section_number}
    # "sections.$" addresses the array element matched by the filter above.
    change = {"$set": {"sections.$.questionAnswerPairs": qa_pairs}}
    collection.update_one(section_filter, change)


def generate_questions(section_content):
    """Stub for question generation.

    Returns two fixed placeholder strings; ``section_content`` is not used yet.
    """
    # Implement your question generation logic here
    first = "Generated Question 1"
    second = "Generated Question 2"
    return [first, second]


def generate_qa_pairs(questions):
    """Stub for answer generation.

    Pairs every question with a placeholder answer of the form
    "Answer to <question>" and returns the pairs as a list of dicts.
    """
    # Implement your QA pair generation logic here
    pairs = []
    for question in questions:
        pairs.append({"question": question, "answer": f"Answer to {question}"})
    return pairs
```

In [15]:
ca_db = client["ca"]
client.list_database_names()

['admin', 'ca', 'config', 'courses', 'local', 'my_store', 'subrata', 'test']

In [16]:
ca_inter_p5_c1_collection = ca_db["ca_inter_p5_c1"]
ca_db.list_collection_names()

['ca_inter_p5_c1']

In [17]:
ca_inter_p5_c1_docx["chapter_name"]

'CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT'

In [18]:
def insert_initial_chapter(collection, chapter_data):
    """Insert one chapter document built from the extracted DOCX mapping.

    Every key of ``chapter_data`` other than "chapter_name" becomes a section
    entry with empty ``questions`` and ``question_answer_pairs`` lists.
    Returns the ``inserted_id`` reported by ``collection.insert_one``.
    """
    section_docs = []
    for key, value in chapter_data.items():
        if key == "chapter_name":
            continue
        section_docs.append(
            {
                "section_number": str(key),
                "content": value,
                "questions": [],
                "question_answer_pairs": [],
            }
        )

    inserted = collection.insert_one(
        {"chapter_name": chapter_data["chapter_name"], "sections": section_docs}
    )
    return inserted.inserted_id

In [19]:
insert_initial_chapter(
    collection=ca_inter_p5_c1_collection,
    chapter_data=ca_inter_p5_c1_docx,
)

ObjectId('67446637a238f1fdec0eff32')

In [80]:
ca_inter_p5_c1_cursor = ca_inter_p5_c1_collection.find({})

for document in ca_inter_p5_c1_cursor:
    print(document)

{'_id': ObjectId('67446637a238f1fdec0eff32'), 'chapter_name': 'CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT', 'sections': [{'section_number': '1', 'content': "INTRODUCTION\nWhat do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.\nCan you figure out reason behind such reliance? It is due to the fact that audited financial statemen

In [83]:
# Look the chapter up by its exact stored chapter_name (the title must match
# the stored string character for character, including the double space).
# NOTE: despite its name, `sections` holds the whole chapter document; the
# list of sections lives under sections["sections"].
sections = ca_inter_p5_c1_collection.find_one(
    {"chapter_name": "CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT"}
)

# len() of the document counts its top-level fields (_id, chapter_name,
# sections), not the number of sections.
print(len(sections))
# Display the first section sub-document.
sections["sections"][0]

3


{'section_number': '1',
 'content': "INTRODUCTION\nWhat do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.\nCan you figure out reason behind such reliance? It is due to the fact that audited financial statements provide confidence to users of financial statements; audited financial statements provide assurance to users who may take th

In [87]:
print(len(sections["sections"]))

27


In [None]:
counter = 0
for section in sections["sections"]:
    print(section["content"])
    counter += 1
counter

INTRODUCTION
What do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.
Can you figure out reason behind such reliance? It is due to the fact that audited financial statements provide confidence to users of financial statements; audited financial statements provide assurance to users who may take their decisions on the basis of such audi

27

In [40]:
sections["sections"][0]["content"]

"INTRODUCTION\nWhat do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.\nCan you figure out reason behind such reliance? It is due to the fact that audited financial statements provide confidence to users of financial statements; audited financial statements provide assurance to users who may take their decisions on the basis of such a

In [None]:
## Delete Objects
# from bson import ObjectId

# ca_inter_p5_c1_collection.delete_one({"_id": ObjectId("67445f12b1d4092f954d3d6f")})

DeleteResult({'n': 0, 'ok': 1.0}, acknowledged=True)

# Generate Questions

In [41]:
import os
from openai import AzureOpenAI

ai_client = AzureOpenAI(
    api_version="2024-08-01-preview",
    api_key=os.getenv(key="AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv(key="AZURE_OPENAI_ENDPOINT"),
)

In [55]:
QUESTIONS_PROMPT = f"""Analyze the following text and generate as many questions as possible to cover all aspects of the content. The questions should be of various types, including:
 
- Facts
- Logic
- Steps
- Examples
- Definitions
- Any other relevant types that help to create an overall understanding of the context involved.
 
Your output must be in JSON format. Each object should have the following two fields:
 
- "question": The question text.
- "type": The type of question (e.g., "fact", "logic", "steps", "example", "definition", etc.)
 
**Requirements:**
 
- The questions should be comprehensive and cover all important points in the text.
- The number of questions should be sufficient to thoroughly represent the input data.
- Do not include any introductions, explanations, or conclusions in your output.
- The output should be strictly in JSON format without any additional text.
 
**Input Text:**
{sections["sections"][0]["content"]}
"""

In [66]:
from pydantic import BaseModel
from openai import OpenAI

GPT_4O_MINI = "gpt-4o-mini"


# One generated question plus a free-text category label; the prompt suggests
# labels such as "fact", "logic", "steps", "example" and "definition".
class Question(BaseModel):
    question: str
    type: str


# Wrapper model holding the list of questions; passed as response_format to
# the completions.parse() call in this cell.
class Questions(BaseModel):
    questions: List[Question]


# Send QUESTIONS_PROMPT (built from the first section's content) to the model
# and request a reply shaped like the Questions model.
completion = ai_client.beta.chat.completions.parse(
    model=GPT_4O_MINI,
    messages=[
        {
            "role": "system",
            "content": "You are an assistant that generates questions based on given text.",
        },
        {
            "role": "user",
            "content": f"{QUESTIONS_PROMPT}",
        },
    ],
    response_format=Questions,
)

# Keep the parsed payload of the first choice; a later cell calls
# model_dump() on it to obtain plain dicts.
event = completion.choices[0].message.parsed


In [78]:
questions = event.model_dump()
questions["questions"]

[{'question': "What is the primary importance of auditing in today's complex business environment?",
  'type': 'definition'},
 {'question': 'Who are the diverse users that rely on audited financial statements?',
  'type': 'fact'},
 {'question': 'Why do investors rely on audited financial statements?',
  'type': 'logic'},
 {'question': 'How do audited financial statements provide assurance to users?',
  'type': 'logic'},
 {'question': 'What roles do banks and financial institutions play in relation to audited financial statements?',
  'type': 'fact'},
 {'question': 'In what ways do governments use audited financial statements?',
  'type': 'fact'},
 {'question': 'Why might trade unions be interested in audited financial statements?',
  'type': 'logic'},
 {'question': 'How do insurance companies utilize audited financial statements?',
  'type': 'fact'},
 {'question': "What is meant by the term 'auditing'?", 'type': 'definition'},
 {'question': 'What aspects of auditing will be explored in

In [82]:
from bson import ObjectId


def update_section_questions(collection, chapter_id, section_number, questions):
    """Set the ``questions`` field of one section inside a chapter document.

    ``chapter_id`` is converted with ``ObjectId`` and ``section_number`` with
    ``str`` before they are used in the filter.
    """
    section_filter = {
        "_id": ObjectId(chapter_id),
        "sections.section_number": str(section_number),
    }
    # "sections.$" addresses the array element matched by the filter above.
    change = {"$set": {"sections.$.questions": questions}}
    collection.update_one(section_filter, change)


# `questions` is the full model dump ({"questions": [...]}). Store only the
# inner list so the section's "questions" field stays a list of question dicts.
update_section_questions(
    collection=ca_inter_p5_c1_collection,
    chapter_id="67446637a238f1fdec0eff32",
    section_number=1,
    questions=questions["questions"],
)


# Pipeline

In [94]:
import os
from openai import AzureOpenAI
from pydantic import BaseModel
from typing import List
from bson import ObjectId
from pymongo.errors import PyMongoError

# Step 1: Initialize Azure OpenAI client
ai_client = AzureOpenAI(
    api_version="2024-08-01-preview",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)


# Define models
# One generated question plus a free-text category label; the prompt suggests
# labels such as "fact", "logic", "steps", "example" and "definition".
class Question(BaseModel):
    question: str
    type: str


# Wrapper model holding the list of questions; process_sections passes it as
# response_format so each reply is parsed into this shape.
class Questions(BaseModel):
    questions: List[Question]


# Constants
GPT_4O_MINI = "gpt-4o-mini"


# Helper function to update MongoDB
def update_section_questions(collection, chapter_id, section_number, questions):
    """Store ``questions`` on one section of a chapter and report the outcome.

    Args:
        collection: Collection holding the chapter documents.
        chapter_id: Chapter ``_id``; converted with ``ObjectId`` for the filter.
        section_number: Section identifier; compared as a string against
            ``sections.section_number``.
        questions: Value written to the matched section's ``questions`` field.

    Returns:
        True when a matching section was found (whether or not the stored
        value changed); False when nothing matched or MongoDB raised an error.
        A status line is printed in every case.
    """
    section_filter = {
        "_id": ObjectId(chapter_id),
        "sections.section_number": str(section_number),
    }
    # "sections.$" addresses the array element matched by the filter above.
    change = {"$set": {"sections.$.questions": questions}}

    # Only the database call can raise PyMongoError, so only it is guarded.
    try:
        result = collection.update_one(section_filter, change)
    except PyMongoError as e:
        print(f"An error occurred while updating MongoDB: {e}")
        return False

    if result.matched_count == 0:
        print(
            f"No document found with id {chapter_id} and section number {section_number}"
        )
        return False

    if result.modified_count == 0:
        print("Document found but not modified. Questions might be the same.")
    else:
        print(f"Successfully updated questions for section {section_number}")
    return True


# Main pipeline function
def _build_questions_prompt(section_content):
    """Return the question-generation prompt with ``section_content`` as the input text."""
    return f"""Analyze the following text and generate as many questions as possible to cover all aspects of the content. The questions should be of various types, including:

- Facts
- Logic
- Steps
- Examples
- Definitions
- Any other relevant types that help to create an overall understanding of the context involved.

Your output must be in JSON format. Each object should have the following two fields:

- "question": The question text.
- "type": The type of question (e.g., "fact", "logic", "steps", "example", "definition", etc.)

**Requirements:**

- The questions should be comprehensive and cover all important points in the text.
- The number of questions should be sufficient to thoroughly represent the input data.
- Do not include any introductions, explanations, or conclusions in your output.
- The output should be strictly in JSON format without any additional text.

**Input Text:**
{section_content}
"""


def process_sections(sections, chapter_id, collection):
    """Generate questions for every section of a chapter and store them.

    Args:
        sections: Chapter document; its "sections" list is iterated and each
            entry must provide "content" and "section_number".
        chapter_id: Chapter ``_id`` forwarded to ``update_section_questions``.
        collection: Collection forwarded to ``update_section_questions``.

    Progress and errors are printed. A failure while generating or storing the
    questions of one section does not stop the remaining sections.
    """
    for index, section in enumerate(sections["sections"], start=1):
        print(f"\nProcessing Section {index}")

        # Step 2: Prepare the prompt
        prompt = _build_questions_prompt(section["content"])

        try:
            # Step 3: Generate questions using Azure OpenAI
            completion = ai_client.beta.chat.completions.parse(
                model=GPT_4O_MINI,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an assistant that generates questions based on given text.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                response_format=Questions,
            )

            message = completion.choices[0].message
            if message.parsed is None:
                # No structured payload came back (e.g. the model refused);
                # report it plainly instead of failing on None.model_dump().
                print(
                    f"Skipping section {index}: no parsed questions returned "
                    f"(refusal: {getattr(message, 'refusal', None)})"
                )
                continue

            questions = message.parsed.model_dump()

            # Update MongoDB
            update_section_questions(
                collection=collection,
                chapter_id=chapter_id,
                section_number=section["section_number"],
                questions=questions["questions"],
            )

            print(
                f"Step {index} completed: Updated DB with section number {section['section_number']} questions"
            )

        except Exception as e:
            # Deliberate best-effort boundary: one failing section must not
            # abort the rest of the chapter.
            print(f"An error occurred while processing section {index}: {e}")


In [96]:
process_sections(
    sections=sections,
    chapter_id="67446637a238f1fdec0eff32",
    collection=ca_inter_p5_c1_collection,
)


Processing Section 1
Successfully updated questions for section 1
Step 1 completed: Updated DB with section number 1 questions

Processing Section 2
Successfully updated questions for section 2
Step 2 completed: Updated DB with section number 2 questions

Processing Section 3
Successfully updated questions for section 3
Step 3 completed: Updated DB with section number 3 questions

Processing Section 4
Successfully updated questions for section 4
Step 4 completed: Updated DB with section number 4 questions

Processing Section 5
Successfully updated questions for section 5
Step 5 completed: Updated DB with section number 5 questions

Processing Section 6
Successfully updated questions for section 6
Step 6 completed: Updated DB with section number 6 questions

Processing Section 7
Successfully updated questions for section 7
Step 7 completed: Updated DB with section number 7 questions

Processing Section 8
Successfully updated questions for section 8
Step 8 completed: Updated DB with sec