In [1]:
import openai
from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
class ReverseHyde:
    def __init__(self, api_key: str):
        openai.api_key = api_key
        self.model = "text-embedding-ada-002"

    def get_embedding(self, text: str) -> List[float]:
        client = openai.OpenAI()
        response = client.embeddings.create(input=text, model=self.model)
        return response.data[0].embedding

    def generate_reverse_hyde(self, chunk: str, n: int = 3) -> List[str]:
        prompt = f"""
        
Given the following text chunk, generate {n} different questions that this chunk would be a good answer to:

Chunk: {chunk}

Questions:
1."""

        client = openai.OpenAI()
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0.7,
        )

        print(response)
        questions = response.choices[0].message.content.strip().split('\n')
        return [q.split('. ', 1)[1] for q in questions if '. ' in q]

    def process_chunks(self, chunks: List[str]) -> Dict[str, List[str]]:
        processed_chunks = {}
        for chunk in chunks:
            processed_chunks[chunk] = self.generate_reverse_hyde(chunk)
        return processed_chunks

    def find_best_chunk(self, query: str, processed_chunks: Dict[str, List[str]]) -> str:
        query_embedding = self.get_embedding(query)
        
        best_similarity = -1
        best_chunk = None

        for chunk, questions in processed_chunks.items():
            chunk_embedding = self.get_embedding(chunk)
            question_embeddings = [self.get_embedding(q) for q in questions]
            
            similarities = cosine_similarity(
                [query_embedding], 
                [chunk_embedding] + question_embeddings
            )[0]
            
            max_similarity = np.max(similarities)
            
            if max_similarity > best_similarity:
                best_similarity = max_similarity
                best_chunk = chunk

        return best_chunk

## Loading API keys from environment variable

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
import os
# Usage example
api_key = os.getenv("OPENAI_API_KEY")
reverse_hyde = ReverseHyde(api_key)

chunks = [
    "The mitochondria is the powerhouse of the cell.",
    "Python is a high-level, interpreted programming language.",
    "The American Civil War lasted from 1861 to 1865."
]

processed_chunks = reverse_hyde.process_chunks(chunks)
query = "What generates energy in a cell?"
best_chunk = reverse_hyde.find_best_chunk(query, processed_chunks)

print(f"Query: {query}")
print(f"Best matching chunk: {best_chunk}")

ChatCompletion(id='chatcmpl-ABtFodUa2gtwBuLAPkEfsRF7nBP32', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='1. What organelle is known as the powerhouse of the cell?\n2. Which cellular component is responsible for generating energy?\n3. What is the main function of the mitochondria within a cell?', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1727397368, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=41, prompt_tokens=55, total_tokens=96, completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0)))
ChatCompletion(id='chatcmpl-ABtFpKJljOb52YTyp3wgIL6aZG0Eh', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='What type of programming language is Python?\n2. How would you describe Python as a programming language?\n3. Is Python a compiled or interpreted