# Embed Documents

In [273]:
from pathlib import Path


def list_docx_files(folder_path, file_extension=".docx"):
    """
    Lists all .docx files in the specified folder.

    Parameters:
        folder_path (str): The path to the folder.

    Returns:
        list: A list of .docx file names.
    """
    path = Path(folder_path)
    return [file.name for file in path.glob(f"*{file_extension}") if file.is_file()]

In [274]:
files = list_docx_files(folder_path="./ca-fo-p1-c1")
files

['CA-FO-P1-C1-Unit 6-Ashok.docx',
 'CA-FO-P1-C1-Unit 3-Ashok.docx',
 'CA-FO-P1-C1-Unit 4-Ashok.docx',
 'CA-FO-P1-C1-Unit 1-Ashok.docx',
 'CA-FO-P1-C1-Unit 2-Ashok.docx',
 'CA-FO-P1-C1-Unit 7-Ashok.docx',
 'CA-FO-P1-C1-Unit 5-Ashok.docx']

In [276]:
json_files = list_docx_files(folder_path="./generated_data", file_extension=".json")
json_files

['CA-FO-P1-C1-U4-Contingents Assets and Contingent Liabilities.json',
 'CA-FO-P1-C1-U5-Accounting Policies.json',
 'CA-FO-P1-C1-U1- Meaning and Scope of Accounting.json',
 'CA-FO-P1-C1-U2-Accounting Concepts, Principles and Conventions.json',
 'CA-FO-P1-C1-U3-Capital and Revenue Expenditures and Receipts.json',
 'CA-FO-P1-C1-U7-Accounting Standards.json',
 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates.json']

In [149]:
import docx


def split_docx(file_path):
    doc = docx.Document(file_path)
    full_text = "\n".join([para.text for para in doc.paragraphs])
    return full_text.split("****")

In [252]:
file_path = "./ca-fo-p1-c1/CA-FO-P1-C1-Unit 1-Ashok.docx"
file_path = "./ca-fo-p1-c1/CA-FO-P1-C1-Unit 6-Ashok.docx"
sections = split_docx(file_path=file_path)
sections = [section.strip() for section in sections]
print(len(sections))
print(sections)

17
['', 'CA-FO-P1-C1', 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates', 'MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical o

In [253]:
sections.remove(sections[0])

In [254]:
print(len(sections))
print(sections)

16
['CA-FO-P1-C1', 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates', 'MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical or ma

In [255]:
print(sections[0])

CA-FO-P1-C1


In [256]:
db_name = "CA-FOUNDATION"
collection_name = sections[0]
print(collection_name)
chapter_name = sections[1]
print(chapter_name)

sections.remove(collection_name)
sections.remove(chapter_name)

CA-FO-P1-C1
CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates


In [257]:
print(len(sections))
print(sections)

14
['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical or mathematical rules - [A Dictionary of Accountant].\nOrdinal numbers, or ordinals, are numbers used to denote the posit

In [258]:
def create_documents(file_path: str):
    sections = split_docx(file_path=file_path)
    sections = [section.strip() for section in sections]
    sections.remove(sections[0])
    course_name = sections[0]
    sections.remove(course_name)
    chapter_name = sections[0]
    sections.remove(chapter_name)
    return course_name, chapter_name, sections

In [259]:
course_name, chapter_name, sections = create_documents(file_path=file_path)
print(course_name)
print(chapter_name)
print(len(sections), sections)

CA-FO-P1-C1
CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates
14 ['MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations in accordance with logical or mathemat

In [260]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv

load_dotenv()

True

In [261]:
MONGODB_CONNECTION_STRING: str | None = os.getenv(
    key="MONGODB_CONNECTION_STRING", default=None
)

client = MongoClient(
    host=MONGODB_CONNECTION_STRING,
    tls=True,
    tlsAllowInvalidCertificates=True,
)

In [262]:
coll = client[db_name][collection_name]
coll.find_one(filter={})

In [263]:
client.list_database_names()

['CA-FOUNDATION',
 'Education',
 'ecommerce',
 'embeddings',
 'lrs',
 'meramaster',
 'mmdev',
 'tanuja',
 'test',
 'updated',
 'your_database_name',
 'admin',
 'local']

In [264]:
from langchain_openai import OpenAIEmbeddings

# Embedding
OPENAI_API_KEY: str | None = os.getenv(key="OPENAI_API_KEY", default=None)

embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=3072,
    disallowed_special=(),
)

In [265]:
# Generate embeddings
embeddings = embedding_model.embed_documents(texts=sections)

print(len(embeddings))
print(embeddings)

14
[[-0.012976067140698433, 0.04440698400139809, -0.014970536343753338, 0.0144779272377491, 0.0014155027456581593, -0.02171088196337223, -0.004607705399394035, 0.004817965906113386, -0.026817206293344498, -0.025759896263480186, 0.05680633708834648, 0.045127879828214645, -0.021446555852890015, -0.01102364994585514, 0.013192335143685341, 0.008272242732346058, -0.019848577678203583, -0.007899781689047813, 0.02823496051132679, 0.03945685550570488, -0.02804272249341011, 0.0006371639319695532, 0.027946604415774345, -0.009071231819689274, -0.011996855027973652, 0.016003815457224846, 0.0188152976334095, -0.024137888103723526, -0.02388557605445385, 0.03731820732355118, 0.01795022562146187, 0.022852296009659767, -0.02825899049639702, -0.031214650720357895, 0.009077239781618118, -0.05478784069418907, 0.0018052352825179696, -0.007250978145748377, 0.016868887469172478, 0.017697913572192192, -0.01647239550948143, 0.04637742415070534, -0.03803909942507744, 0.011732527054846287, -0.0206175297498703, -

In [266]:
# Create MongoDB Documents
docs = []
for i in range(len(sections)):
    docs.append(
        {
            "course": course_name,
            "chapter": chapter_name,
            "text": sections[i],
            "embedding": embeddings[i],
        }
    )

In [267]:
docs

[{'course': 'CA-FO-P1-C1',
  'chapter': 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates',
  'text': 'MEANING OF MEASUREMENT\nMeasurement is vital aspect of accounting. Primarily transactions and events are measured in terms of money. Any measurement discipline deals with three basic elements of measurement viz., identification of objects and events to be measured, selection of standard or scale to be used, and evaluation of dimension of measurement standards or scale.\nProf. R. J. Chambers defined \'measurement\' as "assignment of numbers to objects and events according to rules specifying the property to be measured, the scale to be used and the dimension of the unit". (R.J. Chambers, Accounting Evaluation and Economic Behaviour, Prentice Hall, Englewood Cliffs, N.J. 1966, P.10).\nKohler defined measurement as the assignment of a system of ordinal or cardinal numbers to the results of a scheme of inquiry or apparatus of observations i

In [268]:
import json
import os


def save_list_of_dicts_to_json(data, folder_path, filename):
    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Construct the full file path
    file_path = os.path.join(folder_path, filename)

    # Write the data to a JSON file
    with open(file_path, "w") as json_file:
        json.dump(data, json_file, indent=4)

    print(f"JSON file saved successfully at: {file_path}")


save_list_of_dicts_to_json(
    data=docs, folder_path="./generated_data", filename=f"{chapter_name}.json"
)

JSON file saved successfully at: ./generated_data/CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates.json


In [282]:
import json


def load_json_to_docs(file_path):
    """
    Load JSON data from a file into a variable named docs.

    :param file_path: Path to the JSON file
    :return: The loaded JSON data
    """
    try:
        with open(file_path, "r") as json_file:
            docs = json.load(json_file)
        print(f"JSON file loaded successfully from: {file_path}")
        return docs
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in file at {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

In [290]:
json_files

['CA-FO-P1-C1-U4-Contingents Assets and Contingent Liabilities.json',
 'CA-FO-P1-C1-U5-Accounting Policies.json',
 'CA-FO-P1-C1-U1- Meaning and Scope of Accounting.json',
 'CA-FO-P1-C1-U2-Accounting Concepts, Principles and Conventions.json',
 'CA-FO-P1-C1-U3-Capital and Revenue Expenditures and Receipts.json',
 'CA-FO-P1-C1-U7-Accounting Standards.json',
 'CA-FO-P1-C1-U6-Accounting as a Measurements Discipline-Valuation Principles, Accounting Estimates.json']

In [295]:
db = client.get_database(name="embeddings")
coll = db.get_collection(name=course_name)
coll

Collection(Database(MongoClient(host=['mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'embeddings'), 'CA-FO-P1-C1')

In [296]:
import time

for file in json_files:
    docs = load_json_to_docs(file_path=f"./generated_data/{file}")
    try:
        coll.insert_many(documents=docs)
        print(f"{len(docs)} Documents inserted successfully")
    except Exception as e:
        print(f"An error occurred during insertion: {e}")

    print("Documents embedded and inserted successfully")
    time.sleep(3)

JSON file loaded successfully from: ./generated_data/CA-FO-P1-C1-U4-Contingents Assets and Contingent Liabilities.json
5 Documents inserted successfully
Documents embedded and inserted successfully
JSON file loaded successfully from: ./generated_data/CA-FO-P1-C1-U5-Accounting Policies.json
4 Documents inserted successfully
Documents embedded and inserted successfully
JSON file loaded successfully from: ./generated_data/CA-FO-P1-C1-U1- Meaning and Scope of Accounting.json
28 Documents inserted successfully
Documents embedded and inserted successfully
JSON file loaded successfully from: ./generated_data/CA-FO-P1-C1-U2-Accounting Concepts, Principles and Conventions.json
29 Documents inserted successfully
Documents embedded and inserted successfully
JSON file loaded successfully from: ./generated_data/CA-FO-P1-C1-U3-Capital and Revenue Expenditures and Receipts.json
6 Documents inserted successfully
Documents embedded and inserted successfully
JSON file loaded successfully from: ./generat

In [None]:
try:
    coll.insert_many(documents=docs)
    print(f"{len(docs)} Documents inserted successfully")
except Exception as e:
    print(f"An error occurred during insertion: {e}")

print("Documents embedded and inserted successfully")

In [104]:
semantic_queries: list[str] = [
    "What is BOOK-KEEPING?",
    "What is BOOK-KEEPING? CA-FO-P1-C1-U1",
    "What is BOOK-KEEPING? CA-FO-P1-C1-U7",
]

In [96]:
# Check if vector search index already exists
existing_indexes = list(coll.list_search_indexes())
existing_indexes

[{'id': '67824e138335db51b294c25c',
  'name': 'ca_foundation_index',
  'type': 'vectorSearch',
  'status': 'PENDING',
  'queryable': False,
  'latestDefinitionVersion': {'version': 0,
   'createdAt': datetime.datetime(2025, 1, 11, 10, 55, 15, 380000)},
  'latestDefinition': {'fields': [{'type': 'vector',
     'path': 'embedding',
     'similarity': 'dotProduct',
     'numDimensions': 3072}]},
  'statusDetail': []}]

In [132]:
vector_search_index = "ca_foundation_index"
try:
    coll.drop_search_index(name=vector_search_index)
    print(f"Search index '{vector_search_index}' deleted successfully.")
except Exception as e:
    print(f"Error deleting search index: {e}")

Search index 'ca_foundation_index' deleted successfully.


In [138]:
try:
    indexes = list(coll.list_search_indexes())
    print("Existing search indexes:")
    for index in indexes:
        print(index)
except Exception as e:
    print(f"Error listing search indexes: {e}")


Existing search indexes:
{'id': '6782540e02217146c28c9c03', 'name': 'ca_foundation_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 11, 11, 20, 46, 627000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'path': 'embedding', 'similarity': 'dotProduct', 'numDimensions': 3072}]}, 'statusDetail': [{'hostname': 'atlas-b4um4g-shard-00-02', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 11, 11, 20, 46)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 3072, 'similarity': 'dotProduct'}]}}}, {'hostname': 'atlas-b4um4g-shard-00-00', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 11, 11, 20, 46)}, 'definition': {'fields':

In [137]:
# from pymongo.operations import SearchIndexModel

# vector_search_index = "ca_foundation_index"
# # Define search index model
# search_index_model = SearchIndexModel(
#     definition={
#         "fields": [
#             {
#                 "type": "vector",
#                 "path": "embedding",
#                 "similarity": "dotProduct",
#                 "numDimensions": 3072,
#             }
#         ]
#     },
#     name=vector_search_index,
#     type="vectorSearch",
# )

# coll.create_search_index(model=search_index_model)
# print("Vector search index created successfully.")

OperationFailure: The maximum number of FTS indexes has been reached for this instance size., full error: {'ok': 0.0, 'errmsg': 'The maximum number of FTS indexes has been reached for this instance size.', 'code': 1, 'codeName': 'InternalError', '$clusterTime': {'clusterTime': Timestamp(1736594597, 27), 'signature': {'hash': b'\x85\xf5\x9f\xefh)\xc1\xb0a2\xdd\\\xd0\xfc\xbb\x9bo\xbfGa', 'keyId': 7416969346901082113}}, 'operationTime': Timestamp(1736594597, 27)}

In [None]:
# # Delete all documents in the collection
# coll.delete_many(filter={})

In [139]:
from langchain_mongodb import MongoDBAtlasVectorSearch

# Initialize vector search
vector_search = MongoDBAtlasVectorSearch(
    collection=coll,
    embedding=embedding_model,
    index_name=vector_search_index,
)


In [140]:
vector_search_index


'ca_foundation_index'

In [141]:
import pprint

# Perform semantic search
for query in semantic_queries:
    result = vector_search.similarity_search_with_score(
        query=query,
        k=5,
    )
    print("SEMANTIC QUERY:", query)
    print("RANKED RESULTS:")
    pprint.pprint(result)
    print("\n")


SEMANTIC QUERY: What is BOOK-KEEPING?
RANKED RESULTS:
[(Document(metadata={'_id': '67824e0687e6f40d79a0b490', 'chapter': '\nCA-FO-P1-C1-U1- Meaning and Scope of Accounting\n'}, page_content="\nBOOK-KEEPING\nBook-keeping is an activity concerned with the recording of financial data relating to business operations in a significant and orderly manner. It covers procedural aspects of accounting work and embraces record keeping function. Obviously, book-keeping procedures are governed by the end product, the financial statements. The term 'financial statements' means Profit and Loss Account, Balance Sheet and cash flow statements including Schedules and Notes forming part of Accounts.\nBook-keeping also requires suitable classification of transactions and events. This is also determined with reference to the requirement of financial statements. A book-keeper may be responsible for keeping all the records of a business or only of a minor segment, such as position of the customers' accounts i