# Libraries

In [1]:
%%time
!pip install langchain langchain_openai langchain_community langchain-chroma langsmith rank_bm25 -qU

CPU times: user 23.7 ms, sys: 2.17 ms, total: 25.8 ms
Wall time: 4.62 s


In [2]:
%%time
!pip install pypdf docx2txt wikipedia pymupdf -qU

CPU times: user 52.8 ms, sys: 8.03 ms, total: 60.9 ms
Wall time: 9 s


# API Keys

In [3]:
# Load all the required api keys
from google.colab import userdata

LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
OPENAI_API_KEY = userdata.get('openai_key')

# LangSmith Tracing

In [4]:
import os

# Single source of truth for the LangSmith project name.
project_name = "langchain-subrata"

# Tracing settings, written to the process environment in one pass
# (same keys, values and order as individual assignments).
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": project_name,
    "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    "LANGCHAIN_API_KEY": LANGCHAIN_API_KEY,
})

# Prettifier

In [5]:
def print_pretty(x):
  """Print `x` framed above and below by a blank line and a 100-character '=' rule."""
  rule = "=" * 100
  # An empty string prints as a bare newline, i.e. the blank spacer lines.
  for line in (rule, "", x, "", rule):
    print(line)

# Data Loaders

## Wikipedia Loader

In [6]:
def load_from_wikipedia(query, lang="en", load_max_docs=2):
  """
  Load Wikipedia articles matching `query` as LangChain documents.

  Args:
    query: Search text passed to the Wikipedia loader.
    lang: Wikipedia language edition code.
    load_max_docs: Maximum number of articles to load.

  Returns:
    The list returned by `WikipediaLoader.load()`. For each item,
    `doc.metadata` holds the meta-information and `doc.page_content`
    the article text.
  """
  from langchain_community.document_loaders import WikipediaLoader

  print("Loading from Wikipedia ...")
  # Forward the caller's arguments instead of fixed values so the
  # parameters actually control what is fetched.
  docs = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs).load()
  print("Done!")
  return docs

In [7]:
%%time
docs = load_from_wikipedia("Accounting")
print_pretty(docs)

Loading from Wikipedia ...
Done!

[Document(metadata={'title': 'Accounting', 'summary': 'Accounting, also known as accountancy, is the process of recording and processing information about economic entities, such as businesses and corporations. Accounting measures the results of an organization\'s economic activities and conveys this information to a variety of stakeholders, including investors, creditors, management, and regulators. Practitioners of accounting are known as accountants. The terms "accounting" and "financial reporting" are often used interchangeably.\nAccounting can be divided into several fields including financial accounting, management accounting, tax accounting and cost accounting. Financial accounting focuses on the reporting of an organization\'s financial information, including the preparation of financial statements, to the external users of the information, such as investors, regulators and suppliers. Management accounting focuses on the measurement, analysis a



  lis = BeautifulSoup(html).find_all('li')


In [8]:
print_pretty(docs[0].metadata)
print_pretty(docs[0].page_content[:400])


{'title': 'Accounting', 'summary': 'Accounting, also known as accountancy, is the process of recording and processing information about economic entities, such as businesses and corporations. Accounting measures the results of an organization\'s economic activities and conveys this information to a variety of stakeholders, including investors, creditors, management, and regulators. Practitioners of accounting are known as accountants. The terms "accounting" and "financial reporting" are often used interchangeably.\nAccounting can be divided into several fields including financial accounting, management accounting, tax accounting and cost accounting. Financial accounting focuses on the reporting of an organization\'s financial information, including the preparation of financial statements, to the external users of the information, such as investors, regulators and suppliers. Management accounting focuses on the measurement, analysis and reporting of information for internal use by mana

## PDF and DOCX Loader

In [9]:
def load_document(file):
  """
  Load a .pdf or .docx file and split it into LangChain documents.

  Args:
    file: Path to the file. The extension selects the loader and is
      matched case-insensitively (".PDF" is treated like ".pdf").

  Returns:
    The list produced by the loader's `load_and_split()`, or None when the
    extension is not supported (a message is printed in that case).
  """
  import os

  print("Detecting file type ...")
  _, extension = os.path.splitext(file)
  print(f"{extension[1:]} file type detected!")

  print(f"Loading {file} ...")

  # Compare in lower case so upper/mixed-case extensions are not rejected.
  kind = extension.lower()

  # Import lazily so only the loader that is actually needed gets imported.
  if kind == ".pdf":
    from langchain_community.document_loaders import PyPDFLoader as loader_cls
  elif kind == ".docx":
    from langchain_community.document_loaders import Docx2txtLoader as loader_cls
  else:
    print(f"Unfortunately! {extension} is not supported")
    return None

  pages = loader_cls(file).load_and_split()

  print(f"{len(pages)} pages parsed successfully!")
  print("Done!")
  return pages

# Load Data

Load the document and break it down into individual pieces, each represented as a LangChain `Document`.

In [10]:
%%time
# Breaking down the entire document's pages into each individual page aka Document.
pages = load_document("/content/accounting.docx")
print_pretty(pages)

Detecting file type ...
docx file type detected!
Loading /content/accounting.docx ...
203 pages parsed successfully!
Done!

[Document(metadata={'source': '/content/accounting.docx'}, page_content='CHAPTER 1\n\nUNIT - 1 \n\nMEANING AND SCOPE OF ACCOUNTING\n\nINTRODUCTION\n\n\n\nEvery individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children\'s education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are p

In [11]:
# Display the first page
print_pretty(pages[0].page_content)


CHAPTER 1

UNIT - 1 

MEANING AND SCOPE OF ACCOUNTING

INTRODUCTION



Every individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities are run for any individual benefit; such economic activities may create social benefit i.e. benefit for the public, at large. Anyway, such economic activities are performed through 'transactions and events'. Transaction is used to mean 'a business, performance of an act, an agreement' while event is used to mean 'a happening, as a consequence of transaction(s), a resul

In [12]:
# Display the second page
print_pretty(pages[1].page_content)


Therefore, this requirement of communicating and motivating informed judgement has also become the part of accounting as defined in the widely accepted definition of accounting, given by the American Accounting Association in 1966 which treated accounting as:

"The process of identifying, measuring and communicating economic information to permit informed judgments and decisions by the users of accounts."

In 1970, the Accounting Principles Board (APB) of American Institute of Certified Public Accountants (AICPA) enumerated the functions of accounting as follows:

"The function of accounting is to provide quantitative information, primarily of financial nature, about economic entities, that is needed to be useful in making economic decisions."

Thus, accounting may be defined as the process of recording, classifying, summarising, analysing and interpreting the financial transactions and communicating the results thereof to the persons interested in such information.

The above definit

# Vector DB → Chroma DB

## Chunk Data Fn
* Chunk each `Document` page with LangChain's `RecursiveCharacterTextSplitter`.

In [14]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
  """
  Split documents into chunks with `RecursiveCharacterTextSplitter`.

  Args:
    data: Documents to split (passed to `split_documents`).
    chunk_size: Maximum chunk size given to the splitter.
    chunk_overlap: Overlap between consecutive chunks given to the splitter.

  Returns:
    The list of chunks produced by the splitter. The chunk count and the
    average chunk length (in characters) are printed as a side effect.
  """
  from langchain_text_splitters import RecursiveCharacterTextSplitter

  print("Chunking ...")

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap
  )
  chunks = text_splitter.split_documents(data)

  print(f"Total Chunks: {len(chunks)}.")
  # Guard the division: empty input yields no chunks, which would otherwise
  # raise ZeroDivisionError.
  if chunks:
    average_chunk_length = sum(len(chunk.page_content) for chunk in chunks) / len(chunks)
  else:
    average_chunk_length = 0.0
  print(f"Average chunk length for each chunk: {round(average_chunk_length, 2)} characters.")
  print("Done!")

  return chunks

## Chunk Data

In [15]:
%%time
chunks = chunk_data(data=pages, chunk_size=500, chunk_overlap=50)
print_pretty(chunks)

Chunking ...
Total Chunks: 2209.
Average chunk length for each chunk: 357.42 characters.
Done!

[Document(metadata={'source': '/content/accounting.docx'}, page_content='CHAPTER 1\n\nUNIT - 1 \n\nMEANING AND SCOPE OF ACCOUNTING\n\nINTRODUCTION'), Document(metadata={'source': '/content/accounting.docx'}, page_content="Every individual performs some kind of economic activity. A salaried person gets salary and spends to buy provisions and clothing, for children's education, construction of house, etc. A sports club formed by a group of individuals, a business run by an individual or a group of individuals, a company running a business in telecom sector, a local authority like Calcutta Municipal Corporation, Delhi Development Authority, Governments, either Central or State, all are carrying some kind of"), Document(metadata={'source': '/content/accounting.docx'}, page_content="Central or State, all are carrying some kind of economic activities. Not necessarily all the economic activities ar

## Chroma DB

In [16]:
%%time
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise every chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API_KEY)

# Embed `chunks` and store them in a Chroma collection persisted in the "db" directory.
db = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory="db")

print_pretty(db)


<langchain_chroma.vectorstores.Chroma object at 0x7c4396a940a0>

CPU times: user 14.2 s, sys: 661 ms, total: 14.9 s
Wall time: 21.2 s


## Similarity Search Fn

In [20]:
%%time
# Document-content filter applied when the caller does not pass one.
_DEFAULT_WHERE_DOCUMENT = {"$contains": "procedural aspects"}


def similarity_search_with_score(query, k=2, where_document=_DEFAULT_WHERE_DOCUMENT):
  """
  Run a scored similarity search on the global `db` and print each hit.

  Args:
    query: Text to search for.
    k: Number of results to request.
    where_document: Document-content filter forwarded to the store. The
      default requires the phrase "procedural aspects"; pass a different
      filter (or None for no filter) when searching for other topics.

  Prints each score followed by the matching page content. Returns None.
  """
  # `db` is resolved at call time, so this uses whichever store the notebook
  # most recently bound to that name.
  hits = db.similarity_search_with_score(query=query, k=k, where_document=where_document)

  # Results are printed from the highest score to the lowest.
  # NOTE(review): Chroma appears to report these scores as distances (lower =
  # more similar); if so, this ordering lists the closest match last — confirm.
  for document, score in sorted(hits, key=lambda hit: hit[1], reverse=True):
    print(score)
    print_pretty(document.page_content)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs


In [22]:
%%time
query="What are the procedural aspects of accounting?"
similarity_search_with_score(query, k=2)

0.922157347202301

BOOK-KEEPING

Book-keeping is an activity concerned with the recording of financial data relating to business operations in a significant and orderly manner. It covers procedural aspects of accounting work and embraces record keeping function. Obviously, book-keeping procedures are governed by the end product, the financial statements. The term 'financial statements' means Profit and Loss Account, Balance Sheet and cash flow statements including Schedules and Notes forming part of Accounts.

0.8754918575286865

Book-keeping is an activity concerned with the recording of financial data relating to business operations in a significant and orderly manner. It covers procedural aspects of accounting work and embraces record keeping function. Obviously, book-keeping procedures are governed by the end product, the financial statements. The term 'financial statements' means Profit and Loss Account, Balance Sheet and cash flow statements including Schedules and Notes forming 

# Retrievers

# Advanced Retrievers

## Context Enrichment Window for Document Retrieval

### Helper Fn

In [23]:
from langchain.document_loaders import  PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain import PromptTemplate
import fitz
from typing import List
from rank_bm25 import BM25Okapi
import asyncio
import random
import textwrap
import numpy as np





def replace_t_with_space(list_of_documents):
    """
    Swap every tab character in each document's page content for a single space.

    The documents are modified in place.

    Args:
        list_of_documents: Iterable of objects exposing a writable
            'page_content' string attribute.

    Returns:
        The same `list_of_documents` object that was passed in.
    """
    for document in list_of_documents:
        # Splitting on tabs and re-joining with spaces is a one-for-one substitution.
        tab_separated_parts = document.page_content.split('\t')
        document.page_content = ' '.join(tab_separated_parts)
    return list_of_documents


def text_wrap(text, width=120):
    """
    Re-flow `text` so that no line is longer than `width` characters.

    Args:
        text (str): Text to wrap.
        width (int): Maximum line width.

    Returns:
        str: The wrapped lines joined with newline characters.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)




def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Build a FAISS vector store from the text of a PDF file.

    Args:
        path: Location of the PDF file.
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Overlap between neighbouring chunks handed to the splitter.

    Returns:
        The FAISS vector store created from the tab-cleaned chunks, embedded
        with a default-constructed `OpenAIEmbeddings`.
    """
    # Read the PDF into LangChain documents.
    pdf_documents = PyPDFLoader(path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Chunk first, then normalise tabs to spaces before embedding.
    tab_free_chunks = replace_t_with_space(splitter.split_documents(pdf_documents))

    return FAISS.from_documents(tab_free_chunks, OpenAIEmbeddings())

def encode_from_string(content):
    """
    Build a FAISS vector store from a raw string.

    The string is split into chunks of up to 1000 characters with a
    200-character overlap, every chunk is tagged with a 'relevance_score' of
    1.0 in its metadata, and the chunks are embedded with a
    default-constructed `OpenAIEmbeddings`.

    Args:
        content: The text to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    splitter_options = dict(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    pieces = RecursiveCharacterTextSplitter(**splitter_options).create_documents([content])

    # Give every chunk the same starting relevance score.
    for piece in pieces:
        piece.metadata["relevance_score"] = 1.0

    return FAISS.from_documents(pieces, OpenAIEmbeddings())


def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Retrieve the page contents of the documents relevant to a question.

    Args:
        question: The question to look up.
        chunks_query_retriever: Retriever exposing `invoke(question)` (or the
            older `get_relevant_documents(question)`) and returning documents
            that have a `page_content` attribute.

    Returns:
        A list of strings: the `page_content` of each retrieved document, in
        the order the retriever returned them.
    """
    # LangChain deprecates `get_relevant_documents` in favour of `invoke`;
    # fall back to the old name only for retrievers that lack `invoke`.
    if hasattr(chunks_query_retriever, "invoke"):
        docs = chunks_query_retriever.invoke(question)
    else:
        docs = chunks_query_retriever.get_relevant_documents(question)

    # Kept as a list (one entry per document) rather than one joined string.
    return [doc.page_content for doc in docs]

# Output schema handed to `with_structured_output` in
# `create_question_answer_from_context_chain`.
# NOTE(review): pydantic typically exposes the class docstring and the Field
# description in the generated schema, so rewording either may change what the
# model is sent — confirm before editing that text.
class QuestionAnswerFromContext(BaseModel):
    """
    Model to generate an answer to a query based on a given context.

    Attributes:
        answer_based_on_content (str): The generated answer based on the context.
    """
    # The only field; `answer_question_from_context` reads it from the chain output.
    answer_based_on_content: str = Field(description="Generates an answer to a query based on a given context.")

def create_question_answer_from_context_chain(llm):
    """
    Build a prompt-to-LLM chain that answers a question from supplied context.

    Args:
        llm: Language model object; must provide `with_structured_output`.

    Returns:
        The prompt template piped into `llm` configured to emit a
        `QuestionAnswerFromContext` structured output.
    """
    # Prompt text; {context} and {question} are the two template variables.
    prompt_text = """
    For the question below, provide a concise but suffice answer based ONLY on the provided context:
    {context}
    Question
    {question}
    """

    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_text,
    )

    # Constrain the model's reply to the QuestionAnswerFromContext schema.
    structured_llm = llm.with_structured_output(QuestionAnswerFromContext)

    return prompt | structured_llm



def answer_question_from_context(question, context, question_answer_from_context_chain):
    """
    Invoke the question-answering chain and package its answer.

    Args:
        question: The question to be answered.
        context: The context the answer should be based on.
        question_answer_from_context_chain: Object with an `invoke` method that
            accepts {"question", "context"} and returns an object exposing
            `answer_based_on_content`.

    Returns:
        A dictionary with the keys "answer", "context" and "question".
    """
    print("Answering the question from the retrieved context...")

    chain_result = question_answer_from_context_chain.invoke(
        {"question": question, "context": context}
    )

    return {
        "answer": chain_result.answer_based_on_content,
        "context": context,
        "question": question,
    }


def show_context(context):
    """
    Print every item of `context` under a numbered heading.

    Args:
        context (list): Context items to display.

    Each item is preceded by "Context <n>:" (numbering starts at 1) and
    followed by two blank lines.
    """
    for position, item in enumerate(context, start=1):
        # Heading, item and a trailing "\n", each terminated by a newline.
        print(f"Context {position}:", item, "\n", sep="\n")


def read_pdf_to_string(path):
    """
    Read a PDF document from the specified path and return its content as a string.

    Args:
        path (str): The file path to the PDF document.

    Returns:
        str: The concatenated text content of all pages in the PDF document,
        in page order.

    The function uses the 'fitz' library (PyMuPDF) to open the document and
    extract the text of every page.
    """
    # The context manager closes the document even if text extraction raises,
    # so the file handle is not leaked.
    with fitz.open(path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)



def bm25_retrieval(bm25: BM25Okapi, cleaned_texts: List[str], query: str, k: int = 5) -> List[str]:
    """
    Return the k text chunks that score highest under BM25 for a query.

    Args:
    bm25 (BM25Okapi): Pre-computed BM25 index.
    cleaned_texts (List[str]): Text chunks; indexed by the positions of the BM25 scores.
    query (str): The query string, tokenised by splitting on whitespace.
    k (int): The number of text chunks to retrieve.

    Returns:
    List[str]: Up to k chunks ordered from highest to lowest BM25 score.
    """
    scores = bm25.get_scores(query.split())

    # argsort is ascending; reversing it puts the highest score first.
    best_first = np.argsort(scores)[::-1]

    selected = []
    for position in best_first[:k]:
        selected.append(cleaned_texts[position])
    return selected



async def exponential_backoff(attempt):
    """
    Implements exponential backoff with a jitter.

    Args:
        attempt: The current retry attempt number (0 for the first retry).

    Waits for a period of time before retrying the operation.
    The wait time is calculated as (2^attempt) + a random fraction of a second.
    """
    # Calculate the wait time with exponential backoff and jitter
    wait_time = (2 ** attempt) + random.uniform(0, 1)
    print(f"Rate limit hit. Retrying in {wait_time:.2f} seconds...")

    # Asynchronously sleep for the calculated wait time
    await asyncio.sleep(wait_time)


def _is_rate_limit_error(exc):
    """Return True when `exc` is an instance of a class named 'RateLimitError'."""
    # Matched by class name (including base classes) because no RateLimitError
    # class is imported in this file; referencing the bare name would raise
    # NameError as soon as any exception reached the handler.
    return any(cls.__name__ == "RateLimitError" for cls in type(exc).__mro__)


async def retry_with_exponential_backoff(coroutine, max_retries=5):
    """
    Runs an async operation, retrying with exponential backoff on rate-limit errors.

    Args:
        coroutine: Either a zero-argument callable that returns a fresh
            awaitable on every call (required for retries to work), or an
            already-created coroutine object. A coroutine object can only be
            awaited once, so it gets a single attempt.
        max_retries: The maximum number of attempts.

    Returns:
        The result of the operation if successful.

    Raises:
        The rate-limit error from the last attempt when all attempts fail (or
        immediately, when a single-use coroutine object was supplied).
        Any other exception propagates unchanged on first occurrence.
        Exception("Max retries reached") when `max_retries` is 0 or negative.
    """
    # Coroutine objects are not callable; factories are.
    can_recreate = callable(coroutine)

    for attempt in range(max_retries):
        try:
            awaitable = coroutine() if can_recreate else coroutine
            return await awaitable
        except Exception as e:
            # Only rate-limit errors are retried; everything else propagates.
            if not _is_rate_limit_error(e):
                raise

            # Give up on the last attempt, or when the awaitable cannot be
            # re-created (re-awaiting it would raise RuntimeError).
            if attempt == max_retries - 1 or not can_recreate:
                raise

            # Wait for an exponential backoff period before retrying
            await exponential_backoff(attempt)

    # Reached only when the loop body never ran
    raise Exception("Max retries reached")

### PDF Path

In [24]:
path = "/content/accounting.pdf"

### Convert PDF to String

In [25]:
# NOTE(review): this re-defines the `read_pdf_to_string` helper already declared
# in the helper-functions cell above; keep a single definition to avoid drift.
def read_pdf_to_string(path):
    """
    Read a PDF document from the specified path and return its content as a string.

    Args:
        path (str): The file path to the PDF document.

    Returns:
        str: The concatenated text content of all pages in the PDF document,
        in page order.

    The function uses the 'fitz' library (PyMuPDF) to open the document and
    extract the text of every page.
    """
    # The context manager closes the document even if text extraction raises,
    # so the file handle is not leaked.
    with fitz.open(path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

In [26]:
%%time
content = read_pdf_to_string(path)
print_pretty(content)


CHAPTER 1 
UNIT - 1  
1.MEANING AND SCOPE OF ACCOUNTING 
1.INTRODUCTION 
Every individual performs some kind of economic activity. A salaried person gets 
salary and spends to buy provisions and clothing, for children's education, 
construction of house, etc. A sports club formed by a group of individuals, a 
business run by an individual or a group of individuals, a company running a 
business in telecom sector, a local authority like Calcutta Municipal Corporation, 
Delhi Development Authority, Governments, either Central or State, all are 
carrying some kind of economic activities. Not necessarily all the economic 
activities are run for any individual benefit; such economic activities may create 
social benefit i.e. benefit for the public, at large. Anyway, such economic activities 
are performed through 'transactions and events'. Transaction is used to mean 'a 
business, performance of an act, an agreement' while event is used to mean 'a 
happening, as a consequence of transactio

### Function to split text into chunks, storing each chunk's sequential index in its metadata

In [27]:
from langchain.docstore.document import Document

def split_text_to_chunks_with_indices(text: str, chunk_size: int, chunk_overlap: int, include_full_text: bool = False) -> List[Document]:
    """
    Split text into fixed-size, overlapping chunks tagged with their position.

    Args:
        text: The text to split.
        chunk_size: Number of characters per chunk (the last chunk may be shorter).
        chunk_overlap: Number of characters shared by consecutive chunks; must
            be smaller than `chunk_size`.
        include_full_text: When True, also store the complete `text` under the
            "text" metadata key of every chunk. Off by default because it
            attaches a copy of the whole source text to each chunk, which
            multiplies the size of anything that serialises the metadata.

    Returns:
        A list of `Document` objects whose metadata holds "index", the chunk's
        0-based position in the sequence.

    Raises:
        ValueError: If `chunk_overlap >= chunk_size`; the window would never
            advance and the loop would not terminate.
    """
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []
    for start in range(0, len(text), step):
        metadata = {"index": len(chunks)}
        if include_full_text:
            metadata["text"] = text
        chunks.append(Document(page_content=text[start:start + chunk_size], metadata=metadata))
    return chunks

### Split our document accordingly

In [28]:
%%time
chunks_size = 200
chunk_overlap = 100
docs = split_text_to_chunks_with_indices(content, chunks_size, chunk_overlap)
type(docs)

CPU times: user 58.8 ms, sys: 1.95 ms, total: 60.7 ms
Wall time: 61.1 ms


list

In [29]:
len(docs)

7710

### Create vector store and retriever

In [30]:
# embeddings = OpenAIEmbeddings()
# vectorstore = FAISS.from_documents(docs, embeddings)
# chunks_query_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [None]:
%%time
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise the index-tagged chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API_KEY)

# Embed `docs` into a Chroma collection persisted in the "advanced-rag-db" directory.
# Note: this rebinds the global `db` created earlier, so `similarity_search_with_score`
# queries this store from here on.
db = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory="advanced-rag-db")

# Retriever that returns the 2 best-matching chunks per query.
chunks_query_retriever = db.as_retriever(search_kwargs={"k": 2})
type(chunks_query_retriever)