In [1]:
!pip install PyPDF2 numpy sentence-transformers groq qdrant-client python-dotenv arxiv



# Download the Arxiv PDF document to be used for injecting in Qdrant

In [2]:
import arxiv
# Specify the arXiv ID with version number (e.g., '1707.08567v2')
arxiv_id = "2408.16765v1"

# Search for the specific version of the paper using the arXiv ID
search = arxiv.Search(
    id_list=[arxiv_id]
)

# Download the PDF of the specific version
for result in search.results():
    print(f"Downloading: {result.title} (version: {arxiv_id})")
    result.download_pdf( filename=arxiv_id+".pdf")
    print("Download completed.")

  for result in search.results():


Downloading: A Score-Based Density Formula, with Applications in Diffusion Generative Models (version: 2408.16765v1)
Download completed.


# HyDe with Qdrant DB

# Import the lIbraries

In [3]:
import os
from dotenv import load_dotenv
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
#import openai
from groq import Groq
from qdrant_client import QdrantClient
from qdrant_client.http import models
from google.colab import userdata


  from tqdm.autonotebook import tqdm, trange


# set up API Grok and Qdrant

In [4]:
# Set up API clients
groq_client = Groq(api_key=userdata.get('GROQ_API_KEY'))
qdrant_client = QdrantClient(userdata.get('QDRANT_URL'), api_key=userdata.get('QDRANT_KEY'))

#qdrant_client = QdrantClient(os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))

# Define constants

In [5]:
# Constants
COLLECTION_NAME = "document_chunks"
VECTOR_SIZE = 384  # Dimension of the sentence-transformers model output

# utililty functions

In [6]:
# 2: Define utility functions
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text


In [7]:
def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

model = SentenceTransformer('all-MiniLM-L6-v2')



In [8]:
def generate_embeddings(chunks):
    return model.encode(chunks)

# Qdrant vector database functions

In [9]:
# Define Qdrant functions
def setup_qdrant_collection():
    qdrant_client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE),
    )

def store_embeddings_in_qdrant(chunks, embeddings):
    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=idx,
                vector=embedding.tolist(),
                payload={"text": chunk}
            )
            for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]
    )

def retrieve_relevant_chunks_qdrant(query_embedding, top_k=3):
    search_result = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding.tolist(),
        limit=top_k
    )
    return [hit.payload['text'] for hit in search_result]


# Large Language Model (LLM) functions

In [10]:
#  Define LLM functions
def llm_call(prompt, provider='openai', model_name='gpt-3.5-turbo'):
    if provider == 'groq':
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model_name,
            temperature=0.5,
            max_tokens=1000,
        )
        return chat_completion.choices[0].message.content
    else:
        raise ValueError("Unsupported provider. Choose 'openai' or 'groq'.")

In [11]:
def generate_hyde_document(query, provider='groq', model_name='llama3-8b-8192'):
    prompt = f"Given the question '{query}', generate a short, relevant passage that might answer this question:"
    return llm_call(prompt, provider, model_name)

In [12]:
def rag_with_hyde(query, provider='groq', model_name='llama3-8b-8192'):
    hyde_doc = generate_hyde_document(query, provider, model_name)
    print(hyde_doc)
    hyde_embedding = model.encode([hyde_doc])[0]
    #print(hyde_embedding)
    relevant_chunks = retrieve_relevant_chunks_qdrant(hyde_embedding)
    print(relevant_chunks)
    context = " ".join(relevant_chunks)

    prompt = f"""Context: {context}

Question: {query}

Please provide a concise and accurate answer based on the given context. If the context doesn't contain enough information to answer the question fully, please state that and provide the best possible answer with the available information.

Answer:"""

    return llm_call(prompt, provider, model_name)

# Main execution

In [13]:
# 5: Main execution
# Replace 'your_document.pdf' with the path to your actual PDF file
pdf_path = '2408.16765v1.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
text_chunks = chunk_text(pdf_text)
chunk_embeddings = generate_embeddings(text_chunks)

In [14]:
# Set up and populate Qdrant collection
setup_qdrant_collection()
store_embeddings_in_qdrant(text_chunks, chunk_embeddings)

  qdrant_client.recreate_collection(


# Querying the system

In [15]:
# 6: Query the system
query = "What are the main topics discussed in the document?"

# Using Groq with Llama 3
answer_groq_llama3 = rag_with_hyde(query, provider='groq', model_name='llama3-8b-8192')
print("Groq with Llama 3 Answer:", answer_groq_llama3)



Please provide the document, and I'll generate a short passage that summarizes the main topics discussed.
['oss in autoregressive model s. . . . . . . . . . . . . . . . . . . . . 10\n5 Proof of Theorem 1 10\n6 Discussion 13\n∗The authors contributed equally.\n†Department of Statistics, The Chinese University of Hong Ko ng, Hong Kong; Email: genli@cuhk.edu.hk .\n‡Department of Statistics, University of Wisconsin-Madiso n, Madison, WI 53706, USA; Email: yuling.yan@wisc.edu .\n1A Proof of Proposition 1 14\nB Proof of Proposition 2 16\nC More discussions on the density formulas 19\nD Technical details in Section 4 19\nD.1 Technical details in Section 4.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 19\nD.2 Technical details in Section 4.2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22\n1 Introduction\nScore-based generative models (SGMs) represent a groundbr eaking advancement in the realm of generative\nmodels, signiﬁcantly impacting mach

# Optional interactive querying

In [16]:
#  (Optional) Interactive querying
def interactive_query():
    while True:
        query = input("Enter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break
        answer = rag_with_hyde(query)
        print("Answer:", answer)
        print("\n---\n")

In [17]:
#interactive_query()

# ReRanking with Hyde

## Constants


In [18]:
# Constants
COLLECTION_NAME = "document_chunks_reranked"
VECTOR_SIZE = 384  # Dimension of the sentence-transformers model output

In [19]:
from sentence_transformers import SentenceTransformer, CrossEncoder

## Initiliase the models

In [20]:
# Initialize models
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

## Utility Functions

In [21]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

def generate_embeddings(chunks):
    return bi_encoder.encode(chunks)



## Qdrant Functions

In [22]:
# Define Qdrant functions
def setup_qdrant_collection():
    qdrant_client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE),
    )

def store_embeddings_in_qdrant(chunks, embeddings):
    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=idx,
                vector=embedding.tolist(),
                payload={"text": chunk}
            )
            for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]
    )

def retrieve_relevant_chunks_qdrant(query_embedding, top_k=10):  # Increased top_k for reranking
    search_result = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding.tolist(),
        limit=top_k
    )
    return [(hit.payload['text'], hit.score) for hit in search_result]

## Main Execution:


In [23]:
#  Main execution
# Replace 'your_document.pdf' with the path to your actual PDF file
pdf_path = '2408.16765v1.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
text_chunks = chunk_text(pdf_text)
chunk_embeddings = generate_embeddings(text_chunks)

In [24]:
# Set up and populate Qdrant collection
setup_qdrant_collection()
store_embeddings_in_qdrant(text_chunks, chunk_embeddings)

  qdrant_client.recreate_collection(


# Reranking & RAG Function using HyDe


In [25]:
# Define reranking function
def rerank_with_hyde(query, retrieved_chunks, top_k=3):
    hyde_doc = generate_hyde_document(query)

    # Combine query and hyde_doc
    enhanced_query = f"{query} {hyde_doc}"

    # Prepare input pairs for cross-encoder
    pair_inputs = [(enhanced_query, chunk) for chunk, _ in retrieved_chunks]

    # Get relevance scores
    relevance_scores = cross_encoder.predict(pair_inputs)

    # Sort chunks by relevance score
    reranked_chunks = sorted(zip(retrieved_chunks, relevance_scores), key=lambda x: x[1], reverse=True)

    # Return top_k reranked chunks
    return [chunk for (chunk, _), _ in reranked_chunks[:top_k]]

# Updated RAG function with reranking
def rag_with_hyde_and_reranking(query, provider='groq', model_name='llama3-8b-8192'):
    # Generate query embedding
    query_embedding = bi_encoder.encode([query])[0]

    # Retrieve initial set of relevant chunks
    retrieved_chunks = retrieve_relevant_chunks_qdrant(query_embedding)

    # Rerank chunks using HyDE
    reranked_chunks = rerank_with_hyde(query, retrieved_chunks)

    # Combine reranked chunks into context
    context = " ".join(reranked_chunks)

    prompt = f"""Context: {context}

Question: {query}

Please provide a concise and accurate answer based on the given context. If the context doesn't contain enough information to answer the question fully, please state that and provide the best possible answer with the available information.

Answer:"""

    return llm_call(prompt, provider, model_name)

## Querying the System:

In [26]:
# Using Groq with Llama 3 and reranking
answer_groq_llama3 = rag_with_hyde_and_reranking(query, provider='groq', model_name='llama3-8b-8192')
print("Groq with Llama 3 Answer (with reranking):", answer_groq_llama3)



Groq with Llama 3 Answer (with reranking): The main topics discussed in the document are:

1. Theoretical understanding of diffusion generative models, specifically the DDPM framework.
2. The optimization target of DDPM, which is derived from a variational lower bound on the log-likelihood (ELBO), and the lack of theoretical understanding why optimizing a lower bound is a valid approach.
3. The use of ELBO as a proxy for the negative log-likelihood of the data distribution and its application in other generative or learning frameworks.

The document also appears to discuss mathematical derivations and equations related to diffusion generative models, specifically the terms H1(x), H2(x), and H3(x), but the main topics are centered around the theoretical understanding and applications of DDPM.


## Interactive Querying (Optional)

In [27]:
# Optional) Interactive querying
def interactive_query_rerank_hyde():
    while True:
        query = input("Enter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break
        answer = rag_with_hyde_and_reranking(query)
        print("Answer:", answer)
        print("\n---\n")

In [28]:
#interactive_query_rerank_hyde()