In [3]:
#!pip install -U langchain langchain-community faiss-cpu langchain-ollama python-dotenv docling langchain-docling


In [4]:
# Environment setup
from dotenv import load_dotenv
import os
import warnings
# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors
# raised when the native libraries used below are loaded — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# NOTE(review): this installs a blanket "ignore" filter, so every warning is
# hidden for the rest of the kernel session, not only the noisy ones.
warnings.filterwarnings("ignore")
# Kept as the last expression of the cell, so its return value is what the
# cell displays as output.
load_dotenv()

True

In [6]:
from docling.document_converter import DocumentConverter

def load_and_convert_document(file_path):
    """Convert a document to Markdown text with Docling.

    Args:
        file_path: Location of the source document; handed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The Markdown export of the converted document.
    """
    conversion = DocumentConverter().convert(file_path)
    parsed_document = conversion.document
    return parsed_document.export_to_markdown()

# NOTE(review): absolute, machine-specific path; the same literal is assigned
# to `source` again in a later cell of this notebook.
source = "C:/Users/sanju/Documents/DeepSeek R1-1.5/Google PDF.pdf"
# Convert the PDF once and keep the Markdown text for the splitting step.
markdown_content = load_and_convert_document(source)

In [15]:
MarkdownHeaderTextSplitter?

Init signature:
MarkdownHeaderTextSplitter(
    headers_to_split_on: 'List[Tuple[str, str]]',
    return_each_line: 'bool' = False,
    strip_headers: 'bool' = True,
)
Docstring:      Splitting markdown files based on specified headers.
Init docstring:
Create a new MarkdownHeaderTextSplitter.

Args:
    headers_to_split_on: Headers we want to track
    return_each_line: Return each line w/ associated headers
    strip_headers: Strip split headers from the content of the chunk
File:           c:\users\sanju\anaconda3\lib\site-packages\langchain_text_splitters\markdown.py
Type:           type
Subclasses:     

In [8]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

def get_markdown_splits(markdown_content):
    """Split Markdown text into chunks at level 1-3 headers.

    Headers are tracked as "Header 1", "Header 2" and "Header 3". The
    splitter is created with ``strip_headers=False``, i.e. the split headers
    are not stripped from the content of each chunk.

    Args:
        markdown_content: Markdown text to split.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` returns for the
        given text.
    """
    header_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(header_levels, strip_headers=False)
    sections = splitter.split_text(markdown_content)
    return sections


# Split the converted Markdown into header-delimited chunks; `chunks` is the
# input to the vector-store cells that follow.
chunks = get_markdown_splits(markdown_content)

# Embedding and vector store setup

In [10]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore


# Embedding and vector store setup
def setup_vector_store(chunks, embedding_model='nomic-embed-text', base_url="http://localhost:11434"):
    """Embed ``chunks`` with an Ollama model and index them in a FAISS store.

    Args:
        chunks: Documents to index, e.g. the output of
            ``get_markdown_splits``. May be empty or ``None``, in which case
            an empty store is returned.
        embedding_model: Name of the Ollama embedding model to use.
        base_url: Base URL of the Ollama server.

    Returns:
        A ``FAISS`` vector store built on a ``faiss.IndexFlatL2`` index and an
        ``InMemoryDocstore``, containing the given documents.
    """
    embeddings = OllamaEmbeddings(model=embedding_model, base_url=base_url)

    # The index has to be created with the embedding dimensionality, so embed
    # a throwaway string once and measure the vector that comes back.
    probe_vector = embeddings.embed_query("this is some text data")
    index = faiss.IndexFlatL2(len(probe_vector))

    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Materialise the input once so any iterable works and emptiness can be
    # checked; with nothing to index there is no reason to call the embedding
    # server or hand the store an empty batch.
    documents = list(chunks) if chunks is not None else []
    if documents:
        vector_store.add_documents(documents=documents)
    return vector_store

In [11]:
# Embed and index the chunks produced by the splitting cell.
vector_store = setup_vector_store(chunks)
# Setup retriever: search_type "mmr" with k=3
# (presumably k is the number of documents returned per query — confirm).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})

In [12]:
# Sanity check: show the index's `ntotal` next to the number of chunks passed
# in (the recorded output shows 159 for both; presumably they should match
# when every chunk was indexed).
vector_store.index.ntotal, len(chunks)


(159, 159)

In [13]:
# Try the retriever on a sample question.
# NOTE(review): `docs` is neither displayed nor used by the later cells
# visible in this notebook.
docs = retriever.invoke('what is revenue for september 2024?')


# RAG Pipeline


In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in input order; empty when ``docs`` is
        empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

def create_rag_chain(retriever, model_name="deepseek-r1:1.5b", base_url="http://localhost:11434"):
    """Build a retrieval-augmented question-answering chain.

    The chain is invoked with a question. The question is piped through
    ``retriever`` and ``format_docs`` to fill ``{context}`` and passed through
    unchanged to fill ``{question}``; the filled prompt goes to an Ollama chat
    model and the reply is run through ``StrOutputParser``.

    Args:
        retriever: Runnable used to fetch context documents for a question.
        model_name: Name of the Ollama chat model that answers the prompt.
        base_url: Base URL of the Ollama server.

    Returns:
        The composed runnable chain.
    """
    # The template text is used exactly as written here, including the
    # indentation inside the triple-quoted string.
    prompt = """
        You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
        ### Question: {question} 
        
        ### Context: {context} 
        
        ### Answer:
    """
    model = ChatOllama(model=model_name, base_url=base_url)
    prompt_template = ChatPromptTemplate.from_template(prompt)

    # Both prompt variables are derived from the single input of the chain.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
    return chain_inputs | prompt_template | model | StrOutputParser()

In [28]:
# End-to-end run: repeats the load -> split -> index -> retriever steps that
# earlier cells of this notebook already performed, then builds the RAG chain.
# NOTE(review): `source` is the same hard-coded absolute path used earlier;
# every name assigned here overwrites the value produced by those cells.

# Load the document and split it into header-delimited chunks
source = "C:/Users/sanju/Documents/DeepSeek R1-1.5/Google PDF.pdf"
markdown_content = load_and_convert_document(source)
chunks = get_markdown_splits(markdown_content)

# Create vector store from the chunks
vector_store = setup_vector_store(chunks)
# Setup retriever (search_type "mmr", k=3 — same settings as the earlier retriever cell)
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})

# RAG chain built on top of the retriever
rag_chain = create_rag_chain(retriever)

In [29]:
Query = "How much revenue is generated by Google?"

print(Query)
# Stream the answer piece by piece as the model produces it.
# The loop variable is deliberately NOT called `chunks`: a `for` target stays
# bound after the loop, and `chunks` is the notebook-global list of document
# splits that the vector-store cells read. Rebinding it to a streamed string
# would break those cells when they are re-run.
for piece in rag_chain.stream(Query):
    print(piece, end='', flush=True)
print('\n' + '-' * 50 + '\n')

How much revenue is generated by Google?
<think>
Okay, so I need to figure out how much revenue Google makes based on the context provided. Let me read through it carefully.

The context starts by talking about the "Revenue Backlog" which is $86.8 billion as of September 30, 2024. This backlog is from customer contracts that haven't been recognized yet for their services. The revenue is deferred because it's tied to when customers use Google's services.

Next, there are details about Google's operations: they were incorporated in California and later moved to Delaware. In 2015, they reorganized as Alphabet Inc., which means Alphabet becomes the primary issuer now. They generate revenue through delivering online advertising, cloud-based solutions that provide infrastructure and platform services to enterprises, communication tools, and sales of other products like subscriptions, apps, in-app purchases, and devices.

Hmm, but wait a minute. The context mentions "revenue backlog" specific