In [None]:
# # Langchain package
# %pip install -qU langchain
# 
# # Local vector store via Chroma
# %pip install -qU langchain_chroma
# 
# # Local inference and embeddings via Ollama
# %pip install -qU langchain_ollama
# 
# # Web loader (WebBaseLoader is imported from langchain_community and needs BeautifulSoup)
# %pip install -qU langchain_community beautifulsoup4

In [None]:
# Name of the Ollama model used below for both the embeddings and the chat model.
MODEL="llama3.2"

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain import hub
from langchain.document_loaders import PyPDFLoader
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader

In [None]:
# URLs of the articles on anomaly detection that serve as the knowledge source.
anamoly_links=[
    "https://medium.com/simform-engineering/anomaly-detection-with-unsupervised-machine-learning-3bcf4c431aff",
    "https://www.stratascratch.com/blog/machine-learning-algorithms-explained-anomaly-detection/"
]

# Loader over the URLs above; the pages themselves are loaded in a later cell.
anamology_loader =WebBaseLoader(anamoly_links)

In [None]:
# Load every page, then chunk it. `BaseLoader.load_and_split()` is marked as
# deprecated in LangChain, so the equivalent explicit two-step form is used:
# load the documents and split them with a default RecursiveCharacterTextSplitter
# (the splitter `load_and_split()` falls back to when none is passed).
anamoly_langchain_docs = RecursiveCharacterTextSplitter().split_documents(
    anamology_loader.load()
)
anamoly_langchain_docs

In [None]:
from langchain_core.documents import Document
from collections import defaultdict

# Bucket the loaded chunks by the 'source' value in their metadata;
# chunks without a 'source' entry all land under the empty-string key.
grouped_docs = defaultdict(list)
for doc in anamoly_langchain_docs:
    grouped_docs[doc.metadata.get('source', '')].append(doc)


In [None]:
# Print each source key next to the list of documents grouped under it.
for k,v in grouped_docs.items():
    print(f"[{k=}] {v=}")

In [None]:
# Merge the chunks of each source back into a single Document per source.
# The source key is kept in the metadata so that a retrieved document can be
# traced back to the page it came from; num_chunks records how many chunks
# were joined.
combined_docs = [
    Document(
        page_content="\n".join(doc.page_content for doc in docs),
        metadata={'source': source, 'num_chunks': len(docs)},
    )
    for source, docs in grouped_docs.items()
]

# Documents that are embedded and indexed further down.
langchain_docs = combined_docs

In [None]:
# Number of combined documents (one per distinct source key).
len(langchain_docs)

In [None]:
# Take the first combined document for a closer look.
anamoly_doc_obj=langchain_docs[0]
anamoly_doc_obj

In [None]:
# Raw text of the selected document.
anamoly_doc_obj.page_content

In [None]:
from IPython.display import  Markdown
# Render the selected document's text as Markdown for easier reading.
Markdown(anamoly_doc_obj.page_content)

In [None]:
from langchain_ollama import OllamaEmbeddings

# Embedding function backed by Ollama.
# NOTE(review): MODEL is also the model passed to ChatOllama further down;
# confirm that a general chat model (rather than a dedicated embedding model)
# is intended for embeddings.
embeddings = OllamaEmbeddings(model=MODEL)

In [None]:
from langchain_chroma import Chroma  # rebinds the Chroma name imported from langchain_community.vectorstores earlier

# Build a Chroma vector store from the combined documents using the Ollama embeddings.
# NOTE(review): no persistence arguments are passed — presumably the store is
# not saved to disk; confirm.
vectordb = Chroma.from_documents(langchain_docs, embedding=embeddings)
vectordb

In [None]:
# Expose the vector store through its retriever interface, with no extra arguments.
retriever = vectordb.as_retriever()
retriever   

In [None]:
from langchain_ollama import ChatOllama

# Chat model served through Ollama, created with temperature set to 0.
local_llm=ChatOllama(model=MODEL,temperature=0)
type(local_llm)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model; {context} is the placeholder for the
# retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Two-message chat prompt: the system instructions followed by the user's
# question, which goes into the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

prompt

In [None]:
# Combine-documents chain built from the chat model and the prompt.
question_ans_chain = create_stuff_documents_chain(local_llm,prompt)

question_ans_chain

In [None]:
# query ="Do you know anything about anomaly?"
# The question uses the term "anomaly detection", matching the topic of the
# indexed pages.
query ="How can you use anomaly detection for monitoring an application latency?"

# Retrieval chain built from the retriever and the question-answering chain.
rag_chain = create_retrieval_chain(retriever,question_ans_chain)


In [None]:
# Run the retrieval chain on the question and show the full result.
results = rag_chain.invoke({"input" : query})
results

In [None]:
from IPython.display import Markdown

# Pull the "answer" entry out of the result and render it as Markdown.
final_answer = results["answer"]
Markdown(final_answer)

In [None]:
# System instructions for the model; {context} is the placeholder for the
# retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Chat prompt made of the system instructions and the user's question ({input}).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

def format_docs(docs):
    """Return the text of all documents as one string.

    Each document's ``page_content`` is taken in order and the pieces are
    separated by a blank line.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Pipeline that answers from documents already present in the incoming dict:
# pick out the question and the joined document text, fill the prompt,
# call the chat model and parse its reply with StrOutputParser.
rag_chain_from_docs = (
    {
        'input': lambda payload: payload['input'],
        'context': lambda payload: format_docs(payload['context']),
    }
    | prompt
    | local_llm
    | StrOutputParser()
)

In [None]:
# Pipe only the 'input' field of the incoming dict into the retriever.
retrieve_docs = (lambda x: x['input']) | retriever

In [None]:
# Start from the incoming dict, assign the retriever's output to 'context',
# then assign the output of rag_chain_from_docs to 'answer'.
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)
chain

In [None]:
# Ask about anomaly detection, the topic of the indexed pages.
query = "Tell me if anomaly detection falls under machine learning?"
chain.invoke({'input': query})