# Extractor

## Extract Text Blocks using ****

In [1]:
from docx import Document
import re

In [2]:
text_blocks: dict = {}

In [3]:
file_path = "CA-P1-M1-C1-U1-Theory.docx"

In [4]:
dir(Document())

['_Document__body',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_block_width',
 '_body',
 '_element',
 '_parent',
 '_part',
 'add_heading',
 'add_page_break',
 'add_paragraph',
 'add_picture',
 'add_section',
 'add_table',
 'core_properties',
 'element',
 'inline_shapes',
 'iter_inner_content',
 'paragraphs',
 'part',
 'save',
 'sections',
 'settings',
 'styles',
 'tables']

In [5]:
help(request=Document)

Help on function Document in module docx.api:

Document(docx: 'str | IO[bytes] | None' = None) -> 'DocumentObject'
    Return a |Document| object loaded from `docx`, where `docx` can be either a path
    to a ``.docx`` file (a string) or a file-like object.

    If `docx` is missing or ``None``, the built-in default document "template" is
    loaded.



In [6]:
doc: Document = Document(docx=file_path)

In [7]:
sections = doc.sections
type(sections)

docx.section.Sections

In [8]:
doc.paragraphs[0].text

'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'

In [9]:
list(doc.paragraphs)

[<docx.text.paragraph.Paragraph at 0x10e3c26f0>,
 <docx.text.paragraph.Paragraph at 0x10e3c26c0>,
 <docx.text.paragraph.Paragraph at 0x10e3c2630>,
 <docx.text.paragraph.Paragraph at 0x10e3c2150>,
 <docx.text.paragraph.Paragraph at 0x10e3c24e0>,
 <docx.text.paragraph.Paragraph at 0x10e3c1dc0>,
 <docx.text.paragraph.Paragraph at 0x10e3c1ee0>,
 <docx.text.paragraph.Paragraph at 0x10e3c0980>,
 <docx.text.paragraph.Paragraph at 0x10e3c2de0>,
 <docx.text.paragraph.Paragraph at 0x10e3c28a0>,
 <docx.text.paragraph.Paragraph at 0x10e3c2d50>,
 <docx.text.paragraph.Paragraph at 0x10e3c24b0>,
 <docx.text.paragraph.Paragraph at 0x10e3c2a20>,
 <docx.text.paragraph.Paragraph at 0x10e3c2450>,
 <docx.text.paragraph.Paragraph at 0x10e3c2960>,
 <docx.text.paragraph.Paragraph at 0x10e3c23f0>,
 <docx.text.paragraph.Paragraph at 0x10e3c2810>,
 <docx.text.paragraph.Paragraph at 0x10e3c2390>,
 <docx.text.paragraph.Paragraph at 0x10e3c2e40>,
 <docx.text.paragraph.Paragraph at 0x10e3c2c30>,
 <docx.text.paragrap

In [10]:
paragraphs: list = []
for paragraph in doc.paragraphs:
    paragraphs.append(paragraph.text)

In [11]:
len(paragraphs)

225

In [12]:
paragraphs[0]

'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'

In [13]:
def extract_text_blocks(file_path) -> dict:
    """
    Extracts text blocks from a .docx file separated by '****' delimiters.

    Args:
        file_path (str): The path to the .docx file.

    Returns:
        dict: A dictionary containing the extracted text blocks.
    """

    # Initialize an empty dictionary to store the extracted text blocks
    text_blocks: dict = {}

    # Open the .docx file
    doc: Document = Document(docx=file_path)

    # Extract the text from the .docx file
    full_text: list = []
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Join the extracted text into a single string
    full_text = "\n".join(full_text)

    # Split the full text into blocks based on the '****' delimiter
    blocks = full_text.split("****")

    # The first block is the title
    text_blocks["title"] = blocks[0].strip()

    # Process the remaining blocks
    for i in range(1, len(blocks)):
        # Split the block text by newline and get the first line
        first_line = blocks[i].strip().split("\n")[0][:50]

        # Ensure the key starts with a digit, otherwise skip non-digit characters
        key: str = re.sub(pattern=r"^\D+", repl="", string=first_line).strip()

        # The value is the block itself
        value = blocks[i].strip()
        # Add the key-value pair to the dictionary
        if key:  # Only add if key is not empty
            text_blocks[key] = value

    return text_blocks

In [14]:
def extract_text_blocks(file_path) -> dict:
    """
    Extracts text blocks from a .docx file separated by '****' delimiters.

    Args:
        file_path (str): The path to the .docx file.

    Returns:
        dict: A dictionary containing the extracted text blocks.
    """

    # Initialize an empty dictionary to store the extracted text blocks
    text_blocks: dict = {}

    # Open the .docx file
    doc: Document = Document(docx=file_path)

    # Extract the text from the .docx file
    full_text: list = []
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Join the extracted text into a single string
    full_text = "\n".join(full_text)

    # Split the full text into blocks based on the '****' delimiter
    blocks = full_text.split("****")

    # The first block is the title
    text_blocks["title"] = blocks[0].strip()

    # Process the remaining blocks
    counter = 1  # Initialize a counter for sequential numbering
    for i in range(1, len(blocks)):
        # Skip empty blocks
        if not blocks[i].strip():
            continue

        # The value is the block itself
        value = blocks[i].strip()

        # Add the key-value pair to the dictionary with sequential numbering
        if value:  # Only add if value is not empty
            text_blocks[counter] = value
            counter += 1

    return text_blocks

In [15]:
from pprint import pprint

file_path = "CA-P1-M1-C1-U1-Theory.docx"
text_blocks: dict = extract_text_blocks(file_path=file_path)
pprint(text_blocks)

{1: 'O) 1.1 INTRODUCTION\n'
    'Every individual performs some kind of economic activity. A salaried '
    'person gets salary and spends to buy provisions and clothing, for '
    "children's education, construction of house, etc. A sports club formed by "
    'a group of individuals, a business run by an individual or a group of '
    'individuals, a company running a business in telecom sector, a local '
    'authority like Calcutta Municipal Corporation, Delhi Development '
    'Authority, Governments, either Central or State, all are carrying some '
    'kind of economic activities. Not necessarily all the economic activities '
    'are run for any individual benefit; such economic activities may create '
    'social benefit i.e. benefit for the public, at large. Anyway, such '
    "economic activities are performed through 'transactions and events'. "
    "Transaction is used to mean 'a business, performance of an act, an "
    "agreement' while event is used to mean 'a happening

In [61]:
list(text_blocks.keys())

['title', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [63]:
text_blocks["title"]

'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'

In [64]:
text_blocks[1]

"O) 1.1 INTRODUCTION\nEvery individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to mean 'a happening, as a consequence of transaction(s), a result.'\nExample 1\nAn individual invests ₹ 2,00,000 f

In [62]:
text_blocks["1.2.1 Procedural aspects of Accounting"]

KeyError: '1.2.1 Procedural aspects of Accounting'

In [35]:
from pprint import pprint

pprint(object=text_blocks[list(text_blocks.keys())[1]])

('O) 1.1 INTRODUCTION\n'
 'Every individual performs some kind of economic activity. A salaried person '
 "gets salary and spends to buy provisions and clothing, for children's "
 'education, construction of house, etc. A sports club formed by a group of '
 'individuals, a business run by an individual or a group of individuals, a '
 'company running a business in telecom sector, a local authority like '
 'Calcutta Municipal Corporation, Delhi Development Authority, Governments, '
 'either Central or State, all are carrying some kind of economic activities. '
 'Not necessarily all the economic activities are run for any individual '
 'benefit; such economic activities may create social benefit i.e. benefit for '
 'the public, at large. Anyway, such economic activities are performed through '
 "'transactions and events'. Transaction is used to mean 'a business, "
 "performance of an act, an agreement' while event is used to mean 'a "
 "happening, as a consequence of transaction(s), a re

In [23]:
pprint(object=text_blocks[list(text_blocks.keys())[2]])

('1.2.1 Procedural aspects of Accounting\n'
 'On the basis of the above definitions, procedure of accounting can be '
 'basically divided into two parts:\n'
 '(i) Generating financial information and\n'
 '(ii) Using the financial information.\n'
 'Generating Financial Information\n'
 'Recording - This is the basic function of accounting. All business '
 'transactions of a financial character, as evidenced by some documents such '
 'as sales bill, pass book, salary slip etc. are recorded in the books of '
 'account. Recording is done in a book called "Journal." This book may further '
 'be divided into several subsidiary books according to the nature and size of '
 'the business. Students will learn how to prepare journal and various '
 'subsidiary books in chapter 2.\n'
 'Classifying - Classification is concerned with the systematic analysis of '
 'the recorded data, with a view to group transactions or entries of one '
 'nature at one place so as to put information in compact and usab

## Extract Text Blocks using Headings

In [144]:
# Initialize a dictionary to store content by headings and subheadings
content_by_section = {}
current_heading = None
current_subheading = None

# Debugging: Print the style of each paragraph to check style names
for para in doc.paragraphs:
    print(f"Text: {para.text}, Style: {para.style.name}")

# Iterate through all paragraphs in the document
for para in doc.paragraphs:
    # Check if the paragraph is a heading
    if para.style.name.startswith("Heading 1"):  # Main Heading
        current_heading = para.text.strip()
        content_by_section[current_heading] = {}
        current_subheading = None  # Reset subheading
    elif para.style.name.startswith("Heading 2"):  # Subheading
        if current_heading:
            current_subheading = para.text.strip()
            content_by_section[current_heading][current_subheading] = []
    else:
        # If it's normal text, add it under the current heading/subheading
        if current_heading and current_subheading:
            content_by_section[current_heading][current_subheading].append(
                para.text.strip()
            )
        elif current_heading:
            # If no subheading, add it directly under the main heading
            if current_heading not in content_by_section:
                content_by_section[current_heading] = []
            content_by_section[current_heading].append(para.text.strip())

# Print the structured content
for heading, subcontent in content_by_section.items():
    print(f"\nHeading: {heading}")
    if isinstance(subcontent, dict):
        for subheading, content in subcontent.items():
            print(f"  Subheading: {subheading}")
            for line in content:
                print(f"    {line}")
    else:
        for line in subcontent:
            print(f"  {line}")


Text: UNIT - 1 MEANING AND SCOPE OF ACCOUNTING, Style: Normal
Text: ****, Style: Normal
Text: O) 1.1 INTRODUCTION, Style: Normal
Text: Every individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to

## Connect with MongoDB

In [80]:
from pymongo import MongoClient

client = MongoClient(host="localhost", port=27017)
client.list_database_names()

['admin', 'config', 'local', 'my_store', 'subrata', 'test']

In [81]:
courses_db = client["courses"]

In [82]:
courses_db.list_collection_names()

[]

In [83]:
# courses_db.drop_collection("CA-P1-M1-C1-U1-Theory")

In [84]:
cap1_collection = courses_db["CA-P1-M1-C1-U1-Theory"]

cap1_collection


Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'courses'), 'CA-P1-M1-C1-U1-Theory')

In [85]:
courses_db.list_collection_names()

[]

In [86]:
type(text_blocks)

dict

In [32]:
# Convert text_blocks dictionary to a list of documents
documents = [{"key": key, "value": value} for key, value in text_blocks.items()]

documents


[{'key': 'title', 'value': ''},
 {'key': 1,
  'value': "CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT\nINTRODUCTION\nWhat do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.\nCan you figure out reason behind such reliance? It is due to the fact that audited financial statements provide confidence to users of financial statements; a

In [88]:
cap1_collection.insert_many(documents=documents)

InsertManyResult([ObjectId('673de4343c729bab52f572fc'), ObjectId('673de4343c729bab52f572fd'), ObjectId('673de4343c729bab52f572fe'), ObjectId('673de4343c729bab52f572ff'), ObjectId('673de4343c729bab52f57300'), ObjectId('673de4343c729bab52f57301'), ObjectId('673de4343c729bab52f57302'), ObjectId('673de4343c729bab52f57303'), ObjectId('673de4343c729bab52f57304'), ObjectId('673de4343c729bab52f57305'), ObjectId('673de4343c729bab52f57306'), ObjectId('673de4343c729bab52f57307'), ObjectId('673de4343c729bab52f57308'), ObjectId('673de4343c729bab52f57309'), ObjectId('673de4343c729bab52f5730a'), ObjectId('673de4343c729bab52f5730b'), ObjectId('673de4343c729bab52f5730c'), ObjectId('673de4343c729bab52f5730d'), ObjectId('673de4343c729bab52f5730e')], acknowledged=True)

In [1]:
cap1_collection.find({})

NameError: name 'cap1_collection' is not defined

# Extract CA-Inter-P5

In [22]:
from pprint import pprint

file_path = "CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit.docx"
text_blocks: dict = extract_text_blocks(file_path=file_path)
pprint(object=text_blocks)

{1: 'CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT\n'
    'INTRODUCTION\n'
    'What do such real-life situations highlight? Such instances underline '
    "importance of auditing in today's complex business environment. Be it "
    'investors desirous of investing their money in companies, shareholders '
    'anxious to know financial position of companies they have invested in, '
    'banks or financial institutions willing to lend funds to credit-worthy '
    'organizations, governments desirous of collecting taxes from trade and '
    'industry in accordance with applicable laws, trade unions negotiating '
    'with corporate managements for better wages or insurance companies '
    'wanting to settle property claims caused by fire or other disasters - '
    'range of diverse users in equally diverse fields rely upon audited '
    'financial statements.\n'
    'Can you figure out reason behind such reliance? It is due to the fact '
    'that audited financial statements provide

In [23]:
CA_Inter_P5_C1 = text_blocks
pprint(object=CA_Inter_P5_C1)

{1: 'CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT\n'
    'INTRODUCTION\n'
    'What do such real-life situations highlight? Such instances underline '
    "importance of auditing in today's complex business environment. Be it "
    'investors desirous of investing their money in companies, shareholders '
    'anxious to know financial position of companies they have invested in, '
    'banks or financial institutions willing to lend funds to credit-worthy '
    'organizations, governments desirous of collecting taxes from trade and '
    'industry in accordance with applicable laws, trade unions negotiating '
    'with corporate managements for better wages or insurance companies '
    'wanting to settle property claims caused by fire or other disasters - '
    'range of diverse users in equally diverse fields rely upon audited '
    'financial statements.\n'
    'Can you figure out reason behind such reliance? It is due to the fact '
    'that audited financial statements provide

In [24]:
from pprint import pprint

file_path = (
    "CA-Inter-P5 - Chapter 2 - Audit Strategy, Audit Planning and Audit Programme.docx"
)
CA_Inter_P5_C2: dict = extract_text_blocks(file_path=file_path)
pprint(object=CA_Inter_P5_C2)

{1: 'CHAPTER 2\n'
    'AUDIT STRATEGY, AUDIT PLANNING AND AUDIT PROGRAMME\n'
    "AUDITOR'S RESPONSIBILITY TO PLAN AN AUDIT OF FINANCIAL STATEMENTS\n"
    'SA 300- Planning an audit of financial statements deals with the '
    "auditor's responsibility to plan an audit of financial statements. It "
    'states that objective of the auditor is to plan the audit so that it will '
    'be performed in an effective manner.\n'
    ' Why Planning an audit is necessary? - Its Benefits\n'
    'Planning an audit is necessary to carry out it effectively in a timely '
    'manner. Besides ensuring compliance with professional standards, it helps '
    'in performing audit engagement effectively.\n'
    'Adequate planning benefits the audit of financial statements in several '
    'ways, including the following: -\n'
    'Helping the auditor to devote appropriate attention to important areas of '
    'the audit.\n'
    'Helping the auditor identify and resolve potential problems on a timely '
    

## MongoDB 

In [27]:
from pymongo import MongoClient

client = MongoClient()
client.list_database_names()

['admin', 'config', 'courses', 'local', 'my_store', 'subrata', 'test']

In [28]:
courses_db = client.get_database("courses")

courses_db.list_collection_names()

['CA-P1-M1-C1-U1-Theory']

In [29]:
ca_inter_p2_c1_db = courses_db.get_collection(
    name="CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit"
)
ca_inter_p2_c1_db

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'courses'), 'CA-Inter-P5 - Chapter 1 - Nature, Objective and Scope of Audit')

In [35]:
# Convert text_blocks dictionary to a list of documents
CA_Inter_P5_C1_documents = [
    {"key": key, "value": value} for key, value in CA_Inter_P5_C1.items()
]

CA_Inter_P5_C1_documents


[{'key': 'title', 'value': ''},
 {'key': 1,
  'value': "CHAPTER 1 -  NATURE, OBJECTIVE AND SCOPE OF ADULT\nINTRODUCTION\nWhat do such real-life situations highlight? Such instances underline importance of auditing in today's complex business environment. Be it investors desirous of investing their money in companies, shareholders anxious to know financial position of companies they have invested in, banks or financial institutions willing to lend funds to credit-worthy organizations, governments desirous of collecting taxes from trade and industry in accordance with applicable laws, trade unions negotiating with corporate managements for better wages or insurance companies wanting to settle property claims caused by fire or other disasters - range of diverse users in equally diverse fields rely upon audited financial statements.\nCan you figure out reason behind such reliance? It is due to the fact that audited financial statements provide confidence to users of financial statements; a

In [36]:
ca_inter_p2_c1_db.insert_many(CA_Inter_P5_C1_documents)

InsertManyResult([ObjectId('67406591a2ed655c19ebe25b'), ObjectId('67406591a2ed655c19ebe25c'), ObjectId('67406591a2ed655c19ebe25d'), ObjectId('67406591a2ed655c19ebe25e'), ObjectId('67406591a2ed655c19ebe25f'), ObjectId('67406591a2ed655c19ebe260'), ObjectId('67406591a2ed655c19ebe261'), ObjectId('67406591a2ed655c19ebe262'), ObjectId('67406591a2ed655c19ebe263'), ObjectId('67406591a2ed655c19ebe264'), ObjectId('67406591a2ed655c19ebe265'), ObjectId('67406591a2ed655c19ebe266'), ObjectId('67406591a2ed655c19ebe267'), ObjectId('67406591a2ed655c19ebe268'), ObjectId('67406591a2ed655c19ebe269'), ObjectId('67406591a2ed655c19ebe26a'), ObjectId('67406591a2ed655c19ebe26b'), ObjectId('67406591a2ed655c19ebe26c'), ObjectId('67406591a2ed655c19ebe26d'), ObjectId('67406591a2ed655c19ebe26e'), ObjectId('67406591a2ed655c19ebe26f'), ObjectId('67406591a2ed655c19ebe270'), ObjectId('67406591a2ed655c19ebe271'), ObjectId('67406591a2ed655c19ebe272'), ObjectId('67406591a2ed655c19ebe273'), ObjectId('67406591a2ed655c19ebe2

In [None]:
# Convert text_blocks dictionary to a list of documents
CA_Inter_P5_C2_documents = [
    {"key": key, "value": value} for key, value in CA_Inter_P5_C2.items()
]

CA_Inter_P5_C2_documents

[{'key': 'title', 'value': ''},
 {'key': 1,
  'value': 'CHAPTER 2\nAUDIT STRATEGY, AUDIT PLANNING AND AUDIT PROGRAMME\nAUDITOR\'S RESPONSIBILITY TO PLAN AN AUDIT OF FINANCIAL STATEMENTS\nSA 300- Planning an audit of financial statements deals with the auditor\'s responsibility to plan an audit of financial statements. It states that objective of the auditor is to plan the audit so that it will be performed in an effective manner.\n Why Planning an audit is necessary? - Its Benefits\nPlanning an audit is necessary to carry out it effectively in a timely manner. Besides ensuring compliance with professional standards, it helps in performing audit engagement effectively.\nAdequate planning benefits the audit of financial statements in several ways, including the following: -\nHelping the auditor to devote appropriate attention to important areas of the audit.\nHelping the auditor identify and resolve potential problems on a timely basis.\nHelping the auditor properly organize and manage t

In [38]:
ca_inter_p2_c2_db = courses_db.get_collection(
    name="CA-Inter-P5 - Chapter 2 - Audit Strategy, Audit Planning and Audit Programme"
)
ca_inter_p2_c2_db


Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'courses'), 'CA-Inter-P5 - Chapter 2 - Audit Strategy, Audit Planning and Audit Programme')

In [None]:
ca_inter_p2_c2_db.insert_many(documents=CA_Inter_P5_C2_documents)

InsertManyResult([ObjectId('67406613a2ed655c19ebe277'), ObjectId('67406613a2ed655c19ebe278'), ObjectId('67406613a2ed655c19ebe279'), ObjectId('67406613a2ed655c19ebe27a'), ObjectId('67406613a2ed655c19ebe27b'), ObjectId('67406613a2ed655c19ebe27c'), ObjectId('67406613a2ed655c19ebe27d'), ObjectId('67406613a2ed655c19ebe27e'), ObjectId('67406613a2ed655c19ebe27f'), ObjectId('67406613a2ed655c19ebe280'), ObjectId('67406613a2ed655c19ebe281'), ObjectId('67406613a2ed655c19ebe282'), ObjectId('67406613a2ed655c19ebe283'), ObjectId('67406613a2ed655c19ebe284'), ObjectId('67406613a2ed655c19ebe285')], acknowledged=True)