In [2]:
import os
from langchain_community.document_loaders import (
    WebBaseLoader,
    PyPDFLoader,
    DirectoryLoader,
)

import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import glob

from tqdm import tqdm
from dotenv import load_dotenv

from langchain_huggingface import HuggingFaceEmbeddings

import lancedb
from langchain_community.vectorstores import LanceDB

ModuleNotFoundError: No module named 'langchain_community'

In [None]:
# Load environment variables (e.g. API tokens) via python-dotenv.
load_dotenv()

# Collect the PDFs under data/. glob.glob() returns matches in arbitrary,
# filesystem-dependent order, so the paths are sorted: the order of pages in
# `all_docs` -- and therefore every chunk index used later on -- is then the
# same on every run and on every machine.
pdf_files = sorted(glob.glob("data/*.pdf"))
print("Loading documents: ", pdf_files)

# Load each PDF with PyPDFLoader and gather everything into one flat list.
all_docs = []
for file_path in tqdm(pdf_files, desc="Reading books"):
    all_docs.extend(PyPDFLoader(file_path).load())

# `docs` is the name the cleaning / splitting steps read from.
docs = all_docs


In [None]:
def clean_text(text):
    """Normalise whitespace in ``text`` extracted from a PDF page.

    Every run of whitespace characters (spaces, tabs, newlines, ...) and/or
    underscores is collapsed into a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # A single pass is enough: underscores and whitespace are both treated as
    # separators, so any mixed run of them becomes exactly one space.
    return re.sub(r"[\s_]+", " ", text).strip()


# Normalise the text of every loaded document in place before chunking.
for doc in docs:
    doc.page_content = clean_text(doc.page_content)

# Chunks of up to 1000 characters, with 50 characters of overlap between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Split one document at a time so that tqdm can report progress.
all_chunks = []
for doc in tqdm(docs, desc="Splitting documents"):
    all_chunks += text_splitter.split_documents([doc])

# Preview two chunks (positions 100 and 101) as a quick sanity check.
preview = all_chunks[100:102]
for i in range(len(preview)):
    chunk = preview[i]
    print(f"Chunk {i + 1}:\n{chunk.page_content}\n")


In [None]:
# Embedding model settings: a sentence-transformers model, run on the CPU,
# with embedding normalisation switched off.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_name = embedding_model_name
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
# Sanity check: embed one sentence and print the length of its vector.
query = "Hello I want to see the length of the embeddings for this document."
query_vector = embeddings.embed_documents([query])[0]
print(len(query_vector))

# Connect to the LanceDB database stored under "lance_database" and create
# the "rag_tmt" table (mode="overwrite") holding a single seed row.
db = lancedb.connect("lance_database")
seed_row = {
    "vector": embeddings.embed_query("Hello Computer"),
    "text": "Hello computer!",
    "id": "1",
}
table = db.create_table("rag_tmt", data=[seed_row], mode="overwrite")


In [None]:
# Bare expression: the notebook displays the LanceDB connection object so it
# can be inspected.
db

In [None]:
# Build the vector store: the chunks are embedded with `embeddings` and
# stored through the existing LanceDB connection `db`.
# NOTE(review): no table name is passed here, so this presumably does not
# write into the "rag_tmt" table created earlier -- confirm which table is used.
docsearch = LanceDB.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    connection=db,
)

In [None]:
# Bare expression: the notebook displays the vector store object so it can be
# inspected.
docsearch

In [None]:
from langchain.prompts.prompt import PromptTemplate
from langchain.schema.messages import get_buffer_string
from langchain_core.prompts import ChatPromptTemplate


def convert_chat_to_prompt(chat_template: ChatPromptTemplate) -> PromptTemplate:
    """Flatten a chat prompt template into a plain string prompt template.

    The chat template is rendered to a list of messages, the messages are
    serialised into one string with ``get_buffer_string``, and that string is
    used as the template text of a new ``PromptTemplate``.

    Args:
        chat_template: The chat prompt template to convert.

    Returns:
        A ``PromptTemplate`` built from the serialised messages.
    """
    # NOTE(review): format_messages() is called without any values, so a chat
    # template that declares input variables presumably cannot be rendered
    # here -- confirm the intended behaviour for such templates.
    rendered = get_buffer_string(chat_template.format_messages())
    return PromptTemplate.from_template(rendered)

In [None]:
# Prompt used by the RAG chain. It keeps two unresolved template variables,
# matching the keys the chain passes in when it invokes `prompt`:
#   {context} - the retrieved passages
#   {query}   - the user's question
# The template must NOT be rendered into fixed messages before it is handed to
# the chain: a prompt built from already-formatted messages has no input
# variables left, so the retrieved context would be silently dropped and the
# model would only ever see a hard-coded question.
chat_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful AI bot. Your name is Jarvis."),
        ("human", "Hello, how are you doing?"),
        ("ai", "I'm doing well, thanks!"),
        (
            "human",
            "I need your help in finding out the answer to the following question using the relevant information added:-",
        ),
        ("human", "Relevant information:\n{context}"),
        ("human", "Question: {query}"),
    ]
)


# Alternative question to try:
# my_query = "What is the name of the pony that Sam Gamgee acquires in Bree after Bill Ferny sells them Bill the Pony?"
my_query = "What different names does Aragorn have throughout the book?"

# Rendered preview of the conversation, for inspection only; the chain does
# its own formatting with the real retrieved context.
messages = chat_template.format_messages(
    context="<retrieved passages are inserted here>", query=my_query
)

# The chain consumes the template itself, with {context} and {query} still open.
prompt = chat_template


In [None]:
# Wrap the vector store in a retriever that returns k=15 hits per query, run
# it once for the question and print the source file of every hit.
#
# The hits are stored under their own name instead of `docs`: `docs` is the
# list of loaded PDF documents that the cleaning / splitting cell reads, and
# overwriting it here would make a re-run of that cell silently operate on
# these 15 hits instead of the whole corpus.
retriever = docsearch.as_retriever(search_kwargs={"k": 15})
retrieved_docs = retriever.invoke(my_query)
for hit in retrieved_docs:
    print(hit.metadata["source"])

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def _join_page_contents(retrieved):
    """Return the text of the retrieved documents as one string.

    Args:
        retrieved: Iterable of documents exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them.
    """
    return "\n\n".join(doc.page_content for doc in retrieved)


# Model architecture
# NOTE(review): HuggingFaceHub presumably picks up the Hugging Face API token
# from the environment -- confirm it is provided (e.g. through the .env file).
llm_repo_id = "huggingfaceh4/zephyr-7b-alpha"
model_kwargs = {"temperature": 0.5, "max_length": 4096, "max_new_tokens": 2048}
model = HuggingFaceHub(repo_id=llm_repo_id, model_kwargs=model_kwargs)

# The chain input (the question string) is fanned out into two values:
#   context - the retriever's hits, reduced to their plain text so that a
#             prompt using `context` receives readable passages rather than
#             the repr of a list of Document objects
#   query   - the question itself, passed through unchanged
# The resulting dict is formatted by `prompt`, sent to the model, and the
# model output is parsed into a plain string.
rag_chain = (
    {"context": retriever | _join_page_contents, "query": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

print(retriever)
print(rag_chain)
response = rag_chain.invoke(my_query)

print(response)
