In [18]:
from pathlib import Path

import docx


# Function to list all .docx files in a given folder
def list_docx_files(folder_path, file_extension=".docx"):
    path = Path(folder_path)
    # Using glob to find files that match the pattern *[file_extension]
    return [file.name for file in path.glob(f"*{file_extension}") if file.is_file()]


# Example: Listing all .docx files in the folder "./ca-fo-p1/Chapter 2"
files = list_docx_files(folder_path="./ca-fo-p1/Chapter 2")
print("List of .docx files:", len(files), files)

List of .docx files: 6 ['CA-FO-P1-C2-Unit 1-Ashok.docx', 'CA-FO-P1-C2-Unit 4-Ashok.docx', 'CA-FO-P1-C2-Unit 3-Ashok.docx', 'CA-FO-P1-C2-Unit 6-Ashok.docx', 'CA-FO-P1-C2-Unit 5-Ashok.docx', 'CA-FO-P1-C2-Unit 2-Ashok.docx']


In [21]:
# Function to read a .docx file, join paragraph texts, and split it by "****"
def split_docx(file_path):
    doc = docx.Document(file_path)
    # Join all paragraphs with newline character
    full_text = "\n".join([para.text for para in doc.paragraphs])
    # Split the full text by the separator "****"
    return full_text.split("****")


# Specify the DOCX file to work on
file_path = "./ca-fo-p1/Chapter 2/CA-FO-P1-C2-Unit 3-Ashok.docx"
# Split the document into sections using the defined function
sections = split_docx(file_path=file_path)
# Remove any leading/trailing whitespace from each section
sections = [section.strip() for section in sections]

# Debug prints to check the number of sections and their content
print("Number of sections found:", len(sections))
print("Sections:", sections)


Number of sections found: 11
Sections: ['', 'CA-FO-P1-C2', 'CA-FO-P1-C2-U3-Trial Balance', 'Introduction for Trail Balance \nPreparation of trial balance is the third phase in the accounting process. After posting the accounts in the ledger, a statement is prepared to show separately the debit and credit balances. Such a statement is known as the trial balance. It may also be prepared by listing each and every account and entering in separate columns the totals of the debit and credit sides. Whichever way it is prepared, the totals of the two columns should agree. An agreement indicates arithmetic accuracy of the accounting work; if the two sides do not agree, then there is simply an arithmetic error(s).\nThis follows from the fact that under the Double Entry System, the amount written on the debit sides of various accounts is always equal to the amounts entered on the credit sides of other accounts and vice versa. Hence the totals of the debit sides must be equal to the totals of the 

In [78]:
def data_preparation_pipeline(paper_name, folder_path, file_extension=".docx"):
    files: list[str] = list_docx_files(
        folder_path=folder_path,
        file_extension=file_extension,
    )
    files = [f"{folder_path}/{file}" for file in files]
    print("List of .docx files:", len(files), files)
    print("*" * 100)
    multiple_sections: list[tuple[str, str, str, list[str]]] = []
    for file in files:
        # Split the document into sections using the defined function
        sections: list[str] = split_docx(file_path=file)
        # Remove any leading/trailing whitespace from each section
        sections = [section.strip() for section in sections if len(section) > 1]
        chapter_name: str = sections[0]
        unit_name: str = sections[1]
        sections.remove(chapter_name)
        sections.remove(unit_name)

        # Debug prints to check the number of sections and their content)
        print("List of sections per file:", file, len(sections), sections)
        print("Paper:", paper_name)
        print("Chapter:", chapter_name)
        multiple_sections.append(
            (
                paper_name,
                chapter_name,
                unit_name,
                sections,
            )
        )
        print("*" * 100)
    print(len(multiple_sections), multiple_sections)
    return multiple_sections


data: list[tuple[str, str, list[str]]] = data_preparation_pipeline(
    paper_name="CA-FO-P1",
    folder_path="./ca-fo-p1/Chapter 1",
    file_extension=".docx",
)

List of .docx files: 7 ['./ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 6-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 3-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 4-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 1-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 2-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 7-Ashok.docx', './ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 5-Ashok.docx']
****************************************************************************************************
List of sections per file: ./ca-fo-p1/Chapter 1/CA-FO-P1-C1-Unit 6-Ashok.docx 13 ['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measuremen

In [79]:
for datum in data:
    print(datum)

('CA-FO-P1', 'CA-FO-P1-C1', 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates', ['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with log

In [80]:
def mongodb_init():
    import os
    from dotenv import load_dotenv
    from pymongo import MongoClient

    load_dotenv()

    MONGODB_CONNECTION_STRING: str | None = os.getenv(
        key="MONGODB_CONNECTION_STRING", default=None
    )

    client = MongoClient(
        host=MONGODB_CONNECTION_STRING,
        tls=True,
        tlsAllowInvalidCertificates=True,
    )
    return client


client = mongodb_init()
client

MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True)

In [81]:
client.list_database_names()

['Education',
 'ecommerce',
 'embeddings',
 'lrs',
 'meramaster',
 'mm_ai',
 'mmdev',
 'subrata',
 'tanuja',
 'updated',
 'your_database_name',
 'admin',
 'local']

In [82]:
data[0]

('CA-FO-P1',
 'CA-FO-P1-C1',
 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates',
 ['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with 

In [83]:
ca_db = client.get_database(name="CA-FOUNDATION")
ca_db.list_collection_names()

[]

In [84]:
ca_fo_p1_coll = ca_db.get_collection(name=data[0][0])
ca_fo_p1_coll

Collection(Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'CA-FOUNDATION'), 'CA-FO-P1')

In [112]:
def embeddingmodel_init():
    import os
    from dotenv import load_dotenv
    from langchain_openai import OpenAIEmbeddings

    load_dotenv()

    OPENAI_API_KEY: str | None = os.getenv(key="OPENAI_API_KEY", default=None)

    embedding_model = OpenAIEmbeddings(
        model="text-embedding-3-large",
        dimensions=3072,
        disallowed_special=(),
        api_key=OPENAI_API_KEY,
    )
    return embedding_model


embedding_model = embeddingmodel_init()
embedding_model

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x10b267d10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x10ae0a750>, model='text-embedding-3-large', dimensions=3072, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=(), chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [89]:
data[0][3]

['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical or mathematical rules - [A Dictionary of Accountant].\nOrdinal numbers, or ordinals, are numbers used to denote the position

In [90]:
# Generate embeddings
embeddings = embedding_model.embed_documents(texts=data[0][3])

print(len(embeddings), embeddings)

13 [[-0.012689659371972084, 0.04457031935453415, -0.015157761052250862, 0.014579864218831062, 0.0011226856149733067, -0.02146647311747074, -0.004951254464685917, 0.004716483876109123, -0.027329720556735992, -0.02578866109251976, 0.05648944899439812, 0.04510005936026573, -0.02121364325284958, -0.0111064612865448, 0.013183279894292355, 0.00831329170614481, -0.01964850351214409, -0.007849769666790962, 0.0283651202917099, 0.03934516757726669, -0.0277872234582901, 0.0005150659126229584, 0.02793169766664505, -0.008794873021543026, -0.012045544572174549, 0.016012568026781082, 0.018841855227947235, -0.024151286110281944, -0.024030890315771103, 0.03725028783082962, 0.017686061561107635, 0.02277878113090992, -0.0286540687084198, -0.03108605183660984, 0.0091560585424304, -0.054659441113471985, 0.0018194731092080474, -0.007332070730626583, 0.01691553182899952, 0.01778237707912922, -0.016096845269203186, 0.046641118824481964, -0.038357924669981, 0.011762615293264389, -0.020840417593717575, -0.01121

In [99]:
data[0][3]

['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical or mathematical rules - [A Dictionary of Accountant].\nOrdinal numbers, or ordinals, are numbers used to denote the position

In [94]:
len(data[0][3])

13

In [105]:
def mongodb_document_with_embedding_pipeline(
    paper_name: str,
    chapter_name: str,
    unit_name: str,
    sections: list[str],
    embeddings: list[list[float]],
):
    docs: list[dict] = []
    for i in range(len(sections)):
        docs.append(
            {
                "paper": paper_name,
                "chapter": chapter_name,
                "unit": unit_name,
                "text": sections[i],
                "embedding": embeddings[i],
            }
        )
    print(len(docs), docs[0]["unit"])
    return docs


docs = mongodb_document_with_embedding_pipeline(
    paper_name=data[0][0],
    chapter_name=data[0][1],
    unit_name=data[0][2],
    sections=data[0][3],
    embeddings=embeddings,
)
len(docs)

13

In [109]:
docs[0]

{'paper': 'CA-FO-P1',
 'chapter': 'CA-FO-P1-C1',
 'unit': 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates',
 'text': 'MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus 

In [111]:
import json
import os


def save_embeddings_documents_to_json(data, folder_path, filename):
    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Construct the full file path
    file_path = os.path.join(folder_path, filename)

    # Write the data to a JSON file
    with open(file_path, "w") as json_file:
        json.dump(data, json_file, indent=4)

    print(f"JSON file saved successfully at: {file_path}")


save_embeddings_documents_to_json(
    data=docs, folder_path="./generated_embeddings", filename=f"{docs[0]["unit"]}.json"
)

JSON file saved successfully at: ./generated_embeddings/CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates.json


### Pipeline: Prepare Data, Create Embeddings and Save as Json

In [128]:
def get_subfolder_paths(folder_path):
    import os

    subfolder_paths = []

    # Check if the given path exists and is a directory
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        # Get the full path of the folder
        full_path = os.path.abspath(folder_path)

        # Get the last two parts of the folder path
        path_parts = full_path.split(os.sep)
        parent_path = path_parts[-1]
        print(parent_path)

        # Iterate through all items in the directory
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            # If the item is a directory, add its path to the list
            if os.path.isdir(item_path):
                subfolder_path = f"./{parent_path}/{item}"
                subfolder_paths.append(subfolder_path)

    return subfolder_paths


folder_path = "./ca-fo-p1"
subfolders = get_subfolder_paths(folder_path)
print(subfolders)


ca-fo-p1
['./ca-fo-p1/Chapter 7', './ca-fo-p1/Chapter 9', './ca-fo-p1/Chapter 8', './ca-fo-p1/Chapter 1', './ca-fo-p1/Chapter 6', './ca-fo-p1/Chapter 10', './ca-fo-p1/Chapter 11', './ca-fo-p1/Chapter 3', './ca-fo-p1/Chapter 4', './ca-fo-p1/Chapter 5', './ca-fo-p1/Chapter 2']


In [129]:
ca_fo_p1_coll

Collection(Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'CA-FOUNDATION'), 'CA-FO-P1')

In [131]:
ca_fo_p1_coll.count_documents(filter={})

0

# Data Ingestion Pipeline

In [135]:
# Function to list all .docx files in a given folder
def list_docx_files(folder_path, file_extension=".docx"):
    from pathlib import Path
    import docx

    path = Path(folder_path)
    # Using glob to find files that match the pattern *[file_extension]
    return [file.name for file in path.glob(f"*{file_extension}") if file.is_file()]


list_docx_files(folder_path="./generated_embeddings", file_extension=".json")

['CA-FO-P1-C10-U5- Death of a Partner.json',
 'CA-FO-P1-C2-U2-Ledgers.json',
 'CA-FO-P1-C7-U2-Final Accounts of Manufacturing Entities.json',
 'CA-FO-P1-C2-U3-Trial Balance.json',
 'CA-FO-P1-C2-U1-Basic Accounting Procedures-Journal Entries.json',
 'CA-FO-P1-C10-U6- Dissolution of Partnership Firms and LLP.json',
 'CA-FO-P1-C2-U5-Cash Book.json',
 'CA-FO-P1-C1-U4-Contingents Assets and Contingent Liabilities.json',
 'CA-FO-P1-C10-U3- Admission of a New Partner.json',
 'CA-FO-P1-C11-U3- Issue of Debentures.json',
 'CA-FO-P1-C1-U5-Accounting Policies.json',
 'CA-FO-P1-C5.json',
 'CA-FO-P1-C11-U4- Accounting for Bounce issue and Right Issue.json',
 'CA-FO-P1-C11-U5- Redemption of Preference Shares.json',
 'CA-FO-P1-C1-U1- Meaning and Scope of Accounting.json',
 'CA-FO-P1-C6-Bills of Exchange and Promissory Nots.json',
 'CA-FO-P1-C10-U4- Retirement of a Partner.json',
 'CA-FO-P1-C1-U2-Accounting Concepts, Principles and Conventions.json',
 'CA-FO-P1-C11-U6-Redemption of Debentures.json',
 

In [142]:
def load_json_to_mongodb_docs(file_path):
    import json

    """
    Load JSON data from a file into a variable named docs.

    :param file_path: Path to the JSON file
    :return: The loaded JSON data
    """
    try:
        with open(file_path, "r") as json_file:
            docs = json.load(json_file)
        print(f"JSON file loaded successfully from: {file_path}")
        return docs
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in file at {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

In [143]:
file = load_json_to_mongodb_docs(
    file_path="./generated_embeddings/CA-FO-P1-C10-U5- Death of a Partner.json"
)

JSON file loaded successfully from: ./generated_embeddings/CA-FO-P1-C10-U5- Death of a Partner.json


In [144]:
import time

for file in json_files:
    docs = load_json_to_mongodb_docs(file_path=f"./generated_data/{file}")
    try:
        coll.insert_many(documents=docs)
        print(f"{len(docs)} Documents inserted successfully")
    except Exception as e:
        print(f"An error occurred during insertion: {e}")

    print("Documents embedded and inserted successfully")
    time.sleep(3)


NameError: name 'json_files' is not defined

In [140]:
client.list_database_names()

['Education',
 'ecommerce',
 'embeddings',
 'lrs',
 'meramaster',
 'mm_ai',
 'mmdev',
 'subrata',
 'tanuja',
 'updated',
 'your_database_name',
 'admin',
 'local']

In [145]:
def mongodb_embeddings_ingestion_pipeline(
    db_name: str,
    paper_name: str,
    json_folder_path: str,
):
    client = mongodb_init()
    db = client.get_database(name=db_name)
    coll = db.get_collection(name=paper_name)
    print(f"Collection: {coll}")
    print(f"Total Documents before Ingestion: {coll.count_documents(filter={})}")
    json_files = list_docx_files(folder_path=json_folder_path, file_extension=".json")
    print(f"Total Documents to be Ingested: {len(json_files)}")
    for file in json_files:
        docs = load_json_to_mongodb_docs(file_path=f"{json_folder_path}/{file}")
