In [1]:
import os
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, DirectoryLoader

import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import glob

from tqdm import tqdm

from langchain_huggingface import HuggingFaceEmbeddings

import lancedb
from langchain_community.vectorstores import LanceDB

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:

HF_TOKEN = "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"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

# url_loader = WebBaseLoader("https://gameofthrones.fandom.com/wiki/Jon_Snow")
#documents_loader = DirectoryLoader('data', glob="./*.pdf", loader_cls=PyPDFLoader)
pdf_files = glob.glob('data/*.pdf')
print("Loading documents: ", pdf_files)
all_docs = []

for file_path in tqdm(pdf_files, desc="Reading books"):
    # Load each document using PyPDFLoader
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    all_docs.extend(docs)

#data_docs = documents_loader.load()

docs = all_docs


Loading documents:  ['data\\The Lord of the Rings - The Fellowship of the Ring.pdf', 'data\\The Lord of the Rings - The Return of the King.pdf', 'data\\The Lord of the Rings - The Two Towers.pdf', 'data\\The Silmarillion.pdf']


Reading books: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:27<00:00,  6.87s/it]


In [3]:

def clean_text(text):
    # Replace multiple newlines or special characters with a space
    cleaned = re.sub(r'\n+', ' ', text)
    # Remove any other special characters (optional)
    cleaned = re.sub(r'[_\n]', ' ', cleaned)
    # Normalize spaces (replace multiple spaces with a single space)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned


for doc in docs:
    doc.page_content = clean_text(doc.page_content)

all_chunks = []
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

for doc in tqdm(docs, desc="Splitting documents"):
    chunks = text_splitter.split_documents([doc])
    all_chunks.extend(chunks)

for i, chunk in enumerate(all_chunks[100:102]):
    print(f"Chunk {i + 1}:\n{chunk.page_content}\n")



Splitting documents: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1805/1805 [00:00<00:00, 2487.03it/s]

Chunk 1:
'Well, er, yes, I suppose so,' stammered Bilbo. 'Where is it?' 'In an envelope, if you must know,' said Bilbo impatiently. 'There on the mantelpiece. Well, no! Here it is in my pocket!' He hesitated. 'Isn't that odd now?' he said softly to himself. 'Yet after all, why not? Why shouldn't it stay there?' Gandalf looked again very hard at Bilbo, and there was a gleam in his eyes. 'I think, Bilbo,' he said quietly, 'I should leave it behind. Don't you want to?' 'Well yes – and no. Now it comes to it, I don't like parting with it at all, I may say. And I don't really see why I should. Why do you want me to?' he asked, and a curious change came over his voice. It was sharp with suspicion and annoyance. 'You are always badgering me about my ring; but you have never bothered me about the other things that I got on my journey.' 'No, but I had to badger you,' said Gandalf. 'I wanted the truth. It was important. Magic rings are – well, magical; and they are rare and curious. I was profes




In [4]:

embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [5]:

query = "Hello I want to see the length of the embeddings for this document."
print(len(embeddings.embed_documents([query])[0]))


db = lancedb.connect("lance_database")
table = db.create_table(
    "rag_tmt",
    data=[
        {
            "vector": embeddings.embed_query("Hello Computer"),
            "text": "Hello computer!",
            "id": "1",
        }
    ],
    mode="overwrite",
)


384


In [6]:
db

LanceDBConnection(D:\Pet Projects\teach-me-this\lance_database)

In [7]:
docsearch = LanceDB.from_documents(all_chunks, embeddings, connection=db)

In [8]:
docsearch

<langchain_community.vectorstores.lancedb.LanceDB at 0x1fbfd2d7ad0>

In [11]:
from langchain.prompts.prompt import PromptTemplate
from langchain.schema.messages import get_buffer_string
from langchain_core.prompts import ChatPromptTemplate


def convert_chat_to_prompt(chat_template: ChatPromptTemplate) -> PromptTemplate:
    # Format the messages in the chat template without resolving any variables
    messages = chat_template.format_messages()
    
    # Convert the list of messages into a string
    message_string = get_buffer_string(messages)
    
    # Create a new PromptTemplate instance with the message string
    prompt_template = PromptTemplate.from_template(message_string)
    
    return prompt_template

In [13]:
chat_template = ChatPromptTemplate.from_messages(
	[
    	("system", "You are a helpful AI bot. Your name is Jarvis."),
    	("human", "Hello, how are you doing?"),
    	("ai", "I'm doing well, thanks!"),
        ("human", "I need your help in finding out the answer to the following question using the relevant information added:-"),
    	("human", "{user_input}"),
	]
)


#my_query = "What is the name of the pony that Sam Gamgee acquires in Bree after Bill Ferny sells them Bill the Pony?"
my_query = "What different names does Aragorn have throughout the book?"

messages = chat_template.format_messages(name="Sid", user_input=my_query)

prompt = ChatPromptTemplate.from_messages(messages)


In [14]:
#prompt = ChatPromptTemplate.from_template(chat_template)

retriever = docsearch.as_retriever(search_kwargs={"k": 15})
docs = retriever.invoke(my_query)
for doc in docs:
    print(doc.metadata['source'])

data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Two Towers.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Silmarillion.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Two Towers.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Silmarillion.pdf
data\The Lord of the Rings - The Two Towers.pdf
data\The Silmarillion.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Return of the King.pdf
data\The Lord of the Rings - The Return of the King.pdf


In [19]:
from langchain_community.llms import HuggingFaceHub

# Model architecture
llm_repo_id = "huggingfaceh4/zephyr-7b-alpha"
model_kwargs = {"temperature": 0.5, "max_length": 4096, "max_new_tokens": 2048}
model = HuggingFaceHub(repo_id=llm_repo_id, model_kwargs=model_kwargs)

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
        {"context": retriever,  "query": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
)

print(retriever)
print(rag_chain)
response = rag_chain.invoke(my_query)

print(response)


tags=['LanceDB', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.lancedb.LanceDB object at 0x000001FBFD2D7AD0> search_kwargs={'k': 15}
first={
  context: VectorStoreRetriever(tags=['LanceDB', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.lancedb.LanceDB object at 0x000001FBFD2D7AD0>, search_kwargs={'k': 15}),
  query: RunnablePassthrough()
} middle=[ChatPromptTemplate(input_variables=[], messages=[SystemMessage(content='You are a helpful AI bot. Your name is Jarvis.'), HumanMessage(content='Hello, how are you doing?'), AIMessage(content="I'm doing well, thanks!"), HumanMessage(content='I need your help in finding out the answer to the following question using the relevant information added:-'), HumanMessage(content='What different names does Aragorn have throughout the book?')]), HuggingFaceHub(client=<InferenceClient(model='huggingfaceh4/zephyr-7b-alpha', timeout=None)>, repo_id='huggingfaceh4/zephyr-7b-alpha', task='text-generation', m