# Extractor

## Extract Text Blocks using ****

In [178]:
from docx import Document
import re

In [179]:
text_blocks: dict = {}

In [180]:
file_path = "CA-P1-M1-C1-U1-Theory.docx"

In [181]:
dir(Document())

['_Document__body',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_block_width',
 '_body',
 '_element',
 '_parent',
 '_part',
 'add_heading',
 'add_page_break',
 'add_paragraph',
 'add_picture',
 'add_section',
 'add_table',
 'core_properties',
 'element',
 'inline_shapes',
 'iter_inner_content',
 'paragraphs',
 'part',
 'save',
 'sections',
 'settings',
 'styles',
 'tables']

In [182]:
help(request=Document)

Help on function Document in module docx.api:

Document(docx: 'str | IO[bytes] | None' = None) -> 'DocumentObject'
    Return a |Document| object loaded from `docx`, where `docx` can be either a path
    to a ``.docx`` file (a string) or a file-like object.

    If `docx` is missing or ``None``, the built-in default document "template" is
    loaded.



In [183]:
doc: Document = Document(docx=file_path)

In [184]:
sections = doc.sections
type(sections)

docx.section.Sections

In [185]:
doc.paragraphs[0].text

'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'

In [186]:
list(doc.paragraphs)

[<docx.text.paragraph.Paragraph at 0x10acee750>,
 <docx.text.paragraph.Paragraph at 0x10acee990>,
 <docx.text.paragraph.Paragraph at 0x10acee960>,
 <docx.text.paragraph.Paragraph at 0x10acee7e0>,
 <docx.text.paragraph.Paragraph at 0x10aced6d0>,
 <docx.text.paragraph.Paragraph at 0x10aced130>,
 <docx.text.paragraph.Paragraph at 0x10aced520>,
 <docx.text.paragraph.Paragraph at 0x10aced5b0>,
 <docx.text.paragraph.Paragraph at 0x10aced760>,
 <docx.text.paragraph.Paragraph at 0x10acee4e0>,
 <docx.text.paragraph.Paragraph at 0x10aced5e0>,
 <docx.text.paragraph.Paragraph at 0x10acecf20>,
 <docx.text.paragraph.Paragraph at 0x10acee060>,
 <docx.text.paragraph.Paragraph at 0x10acecfb0>,
 <docx.text.paragraph.Paragraph at 0x10aced970>,
 <docx.text.paragraph.Paragraph at 0x10acee330>,
 <docx.text.paragraph.Paragraph at 0x10acee930>,
 <docx.text.paragraph.Paragraph at 0x10aced550>,
 <docx.text.paragraph.Paragraph at 0x10aced9a0>,
 <docx.text.paragraph.Paragraph at 0x10acedfd0>,
 <docx.text.paragrap

In [187]:
paragraphs: list = []
for paragraph in doc.paragraphs:
    paragraphs.append(paragraph.text)

In [188]:
len(paragraphs)

225

In [189]:
paragraphs[0]

'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'

In [190]:
def extract_text_blocks(file_path) -> dict:
    """
    Extracts text blocks from a .docx file separated by '****' delimiters.

    Args:
        file_path (str): The path to the .docx file.

    Returns:
        dict: A dictionary containing the extracted text blocks.
    """

    # Initialize an empty dictionary to store the extracted text blocks
    text_blocks: dict = {}

    # Open the .docx file
    doc: Document = Document(docx=file_path)

    # Extract the text from the .docx file
    full_text: list = []
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Join the extracted text into a single string
    full_text = "\n".join(full_text)

    # Split the full text into blocks based on the '****' delimiter
    blocks = full_text.split("****")

    # The first block is the title
    text_blocks["title"] = blocks[0].strip()

    # Process the remaining blocks
    for i in range(1, len(blocks), 2):
        # Split the block text by newline and get the first line
        first_line = blocks[i].strip().split("\n")[0][:50]

        # Ensure the key starts with a digit, otherwise skip non-digit characters
        key: str = re.sub(pattern=r"^\D+", repl="", string=first_line).strip()

        # The value is the block itself
        value = blocks[i].strip()
        # Add the key-value pair to the dictionary
        if key:  # Only add if key is not empty
            text_blocks[key] = value

    return text_blocks

In [191]:
file_path = "CA-P1-M1-C1-U1-Theory.docx"
text_blocks: dict = extract_text_blocks(file_path=file_path)
print(text_blocks)

{'title': 'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING', '1.1 INTRODUCTION': "O) 1.1 INTRODUCTION\nEvery individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to mean 'a happening, as a consequence of

In [192]:
list(text_blocks.keys())

['title',
 '1.1 INTRODUCTION',
 '1.2.1 Procedural aspects of Accounting',
 '1.3 EVOLUTION OF ACCOUNTING AS A SOCIAL SCIENCE',
 '1.5 FUNCTIONS OF ACCOUNTING',
 '1.6.1 Objectives of Book-keeping',
 '1.8 SUB-FIELDS OF ACCOUNTING',
 '1.10 RELATIONSHIP OF ACCOUNTING WITH OTHER DISCIPL',
 '1.12.1 Areas of Service',
 '1.12.2 Chartered Accountant in Industry']

In [160]:
from pprint import pprint

pprint(object=text_blocks[list(text_blocks.keys())[1]])

('O) 1.1 INTRODUCTION\n'
 'Every individual performs some kind of economic activity. A salaried person '
 "gets salary and spends to buy provisions and clothing, for children's "
 'education, construction of house, etc. A sports club formed by a group of '
 'individuals, a business run by an individual or a group of individuals, a '
 'company running a business in telecom sector, a local authority like '
 'Calcutta Municipal Corporation, Delhi Development Authority, Governments, '
 'either Central or State, all are carrying some kind of economic activities. '
 'Not necessarily all the economic activities are run for any individual '
 'benefit; such economic activities may create social benefit i.e. benefit for '
 'the public, at large. Anyway, such economic activities are performed through '
 "'transactions and events'. Transaction is used to mean 'a business, "
 "performance of an act, an agreement' while event is used to mean 'a "
 "happening, as a consequence of transaction(s), a re

In [161]:
pprint(object=text_blocks[list(text_blocks.keys())[2]])

('1.2.1 Procedural aspects of Accounting\n'
 'On the basis of the above definitions, procedure of accounting can be '
 'basically divided into two parts:\n'
 '(i) Generating financial information and\n'
 '(ii) Using the financial information.\n'
 'Generating Financial Information\n'
 'Recording - This is the basic function of accounting. All business '
 'transactions of a financial character, as evidenced by some documents such '
 'as sales bill, pass book, salary slip etc. are recorded in the books of '
 'account. Recording is done in a book called "Journal." This book may further '
 'be divided into several subsidiary books according to the nature and size of '
 'the business. Students will learn how to prepare journal and various '
 'subsidiary books in chapter 2.\n'
 'Classifying - Classification is concerned with the systematic analysis of '
 'the recorded data, with a view to group transactions or entries of one '
 'nature at one place so as to put information in compact and usab

## Extract Text Blocks using Headings

In [144]:
# Initialize a dictionary to store content by headings and subheadings
content_by_section = {}
current_heading = None
current_subheading = None

# Debugging: Print the style of each paragraph to check style names
for para in doc.paragraphs:
    print(f"Text: {para.text}, Style: {para.style.name}")

# Iterate through all paragraphs in the document
for para in doc.paragraphs:
    # Check if the paragraph is a heading
    if para.style.name.startswith("Heading 1"):  # Main Heading
        current_heading = para.text.strip()
        content_by_section[current_heading] = {}
        current_subheading = None  # Reset subheading
    elif para.style.name.startswith("Heading 2"):  # Subheading
        if current_heading:
            current_subheading = para.text.strip()
            content_by_section[current_heading][current_subheading] = []
    else:
        # If it's normal text, add it under the current heading/subheading
        if current_heading and current_subheading:
            content_by_section[current_heading][current_subheading].append(
                para.text.strip()
            )
        elif current_heading:
            # If no subheading, add it directly under the main heading
            if current_heading not in content_by_section:
                content_by_section[current_heading] = []
            content_by_section[current_heading].append(para.text.strip())

# Print the structured content
for heading, subcontent in content_by_section.items():
    print(f"\nHeading: {heading}")
    if isinstance(subcontent, dict):
        for subheading, content in subcontent.items():
            print(f"  Subheading: {subheading}")
            for line in content:
                print(f"    {line}")
    else:
        for line in subcontent:
            print(f"  {line}")


Text: UNIT - 1 MEANING AND SCOPE OF ACCOUNTING, Style: Normal
Text: ****, Style: Normal
Text: O) 1.1 INTRODUCTION, Style: Normal
Text: Every individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to

## Connect with MongoDB

In [173]:
from pymongo import MongoClient

client = MongoClient(host="localhost", port=27017)
courses_db = client["courses"]

In [174]:
client.list_database_names()

['admin', 'config', 'local', 'my_store', 'subrata', 'test']

In [175]:
courses_db.list_collection_names()

[]

In [176]:
cap1_collection = courses_db["CA-P1-M1-C1-U1-Theory"]

cap1_collection


Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'courses'), 'CA-P1-M1-C1-U1-Theory')

In [194]:
type(text_blocks)

dict

In [196]:
# Convert text_blocks dictionary to a list of documents
documents = [{"key": key, "value": value} for key, value in text_blocks.items()]

documents


[{'key': 'title', 'value': 'UNIT - 1 MEANING AND SCOPE OF ACCOUNTING'},
 {'key': '1.1 INTRODUCTION',
  'value': "O) 1.1 INTRODUCTION\nEvery individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to 

In [198]:
cap1_collection.insert_many(documents=documents)

InsertManyResult([ObjectId('673b495cdebef72274e978ab'), ObjectId('673b495cdebef72274e978ac'), ObjectId('673b495cdebef72274e978ad'), ObjectId('673b495cdebef72274e978ae'), ObjectId('673b495cdebef72274e978af'), ObjectId('673b495cdebef72274e978b0'), ObjectId('673b495cdebef72274e978b1'), ObjectId('673b495cdebef72274e978b2'), ObjectId('673b495cdebef72274e978b3'), ObjectId('673b495cdebef72274e978b4')], acknowledged=True)

In [199]:
cap1_collection.find({})

<pymongo.synchronous.cursor.Cursor at 0x10a7d0170>