In [None]:
# Install the necessary packages (uncomment the lines below to run them)
#!pip install --upgrade langchain
#!pip install -U langchain-community
#!pip install -U langchain-huggingface
#!pip install sentence-transformers
#!pip install --upgrade --quiet  sqlite-vss
#!pip install --upgrade --quiet pypdf # for parsing PDF documents

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import SQLiteVSS
#from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Read the sample PDF through LangChain's PyPDFLoader and report how many
# entries load_and_split() produced.
file_path_pdf = "./ignitebook-sample.pdf"
loader = PyPDFLoader(file_path_pdf)
pages = loader.load_and_split()

page_total = len(pages)
print("Page counts: ", page_total)


In [None]:
# Cut the loaded pages into chunks (chunk_size=1000, no overlap between
# chunks), then keep only the raw text of every chunk for embedding.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
docs = text_splitter.split_documents(pages)
texts = [chunk.page_content for chunk in docs]

In [None]:
# Build the embedding function from the all-MiniLM-L6-v2 sentence-transformers
# model. HuggingFaceEmbeddings (langchain-huggingface, imported at the top of
# the notebook) is used instead of SentenceTransformerEmbeddings, which is a
# deprecated alias for the same embedding class.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Embed every text chunk and store the result with SQLiteVSS in the
# "ignite_book" table of the SQLite database file /tmp/vss.db.
# NOTE(review): re-running this cell looks like it appends the same chunks to
# the existing table again — confirm, and delete the db file for a clean rebuild.
db = SQLiteVSS.from_texts(
    texts=texts,
    embedding=embedding_function,
    table="ignite_book",
    db_file="/tmp/vss.db",
)

In [None]:
# Progress marker printed once the vector store cell has completed.
print("Loading finished!")

In [None]:
# Semantic (similarity) search against the vector store.
# The same question is reused later as the query for the QA chain.
question = "What is Data partitioning in Ignite?"
data = db.similarity_search(query=question)

In [None]:
# Print the text of the first chunk returned by the similarity search.
# The result list can be empty (for example when nothing was loaded into the
# store), so check before indexing instead of failing with IndexError.
if data:
    print(data[0].page_content)
else:
    print("No matching chunks found for the question.")

In [None]:
# LLM
# Connect to a llama3.1 model served by Ollama and stream generated tokens to
# stdout as they are produced.
import os

# Ollama is imported from langchain_community (the package this notebook
# already uses for its document loader) rather than the deprecated
# langchain.llms re-export.
from langchain_community.llms import Ollama
# CallbackManager is no longer used in this cell; the import is kept in case
# other cells rely on it.
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# The server address can be overridden with the OLLAMA_BASE_URL environment
# variable; the previously hard-coded LAN address remains the default.
ollama_base_url = os.environ.get("OLLAMA_BASE_URL", "http://192.168.1.124:11434")

llm = Ollama(
    model="llama3.1",
    base_url=ollama_base_url,
    verbose=True,
    # `callbacks` replaces the deprecated `callback_manager` argument.
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [None]:
# QA chain
import os

from langchain.chains import RetrievalQA
from langchain import hub

# LangSmith API key: either paste it below or export LANGCHAIN_API_KEY before
# starting the kernel (preferred, as it keeps the secret out of the notebook).
_API_KEY_PLACEHOLDER = "ADD_YOUR_KEY_HERE"
langsmith_api_key = "ADD_YOUR_KEY_HERE"

# Write the key only when it was actually filled in above, or when the
# environment has no key at all, so that a real key exported in the
# environment is never replaced by the placeholder.
if langsmith_api_key != _API_KEY_PLACEHOLDER or "LANGCHAIN_API_KEY" not in os.environ:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key

# Remaining LangSmith settings: tracing switch, API endpoint and project name.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "default"


In [None]:

# Pull the "rlm/rag-prompt-llama" prompt from the LangChain hub.
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-llama")

# Assemble the retrieval QA chain: the vector store is exposed as a retriever
# that supplies context, and the hub prompt is passed on through
# chain_type_kwargs.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    retriever=db.as_retriever(),
)

In [None]:
# Run the QA chain on the question. invoke() is the supported entry point;
# calling the chain object directly (qa_chain({...})) is deprecated in LangChain.
result = qa_chain.invoke({"query": question})

In [None]:
# Final marker: the QA chain cell has completed.
print("Finished!")