In [None]:
%%bash

# Install the notebook's third-party dependencies into the current environment.
# haystack-ai is unpinned; the other two only carry lower bounds.
pip install haystack-ai
pip install "datasets>=2.6.1"
pip install "sentence-transformers>=2.2.0"


In [None]:
from datasets import load_dataset
from haystack import Document
import os
from getpass import getpass
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

In [None]:


def load_Data():
  """Load both verdict datasets and wrap every row in a Haystack ``Document``.

  * ``mvbhat/verdicts`` (full ``train`` split): ``facts`` becomes the document
    content and ``verdict`` is stored in ``meta["verdict"]``.
  * ``macadeliccc/US-SupremeCourtVerdicts`` (first 500 ``train`` rows):
    ``document`` becomes the content and ``summary`` is stored in
    ``meta["verdict"]``.

  Returns:
      list[Document]: documents of the first dataset followed by the second.
  """
  dataset1 = load_dataset("mvbhat/verdicts", split='train')
  docs1 = [Document(content=doc["facts"], meta={"verdict": doc["verdict"]}) for doc in dataset1]
  dataset2 = load_dataset("macadeliccc/US-SupremeCourtVerdicts", split='train[:500]')
  docs2 = [Document(content=doc["document"], meta={"verdict": doc["summary"]}) for doc in dataset2]
  return docs1 + docs2


In [None]:

def load_api():
  """Create the OpenAI generator, prompting for the API key when it is missing.

  The key is read from the ``OPENAI_API_KEY`` environment variable. If the
  variable is not set, the user is asked for it via ``getpass`` and it is
  stored in ``os.environ`` for the rest of the session.

  Returns:
      OpenAIGenerator: generator configured for the ``gpt-3.5-turbo`` model.
  """
  if "OPENAI_API_KEY" not in os.environ:
      os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
  generator = OpenAIGenerator(model="gpt-3.5-turbo")
  return generator

In [None]:

def ans_prompt():
  """Build the prompt builder used to generate the final answer.

  The template lists every retrieved document as a facts/verdict pair and
  then asks the question. Documents in this notebook keep the case facts in
  ``document.content`` and the verdict in ``document.meta['verdict']``, so
  the template reads those two fields (there is no ``meta['facts']`` key).

  Returns:
      PromptBuilder: builder expecting the ``documents`` and ``question``
      template variables (same return kind as ``bool_prompt``).
  """
  template = """
    Based on the following facts and verdict only

    {% for document in documents %}
        Facts: {{ document.content }}
        Verdict: {{ document.meta['verdict'] }}

    {% endfor %}
    Please answer the question deliberately and do not add additional details:
    Question: {{question}}

    """
  prompt_builder = PromptBuilder(template=template)
  return prompt_builder

In [None]:
def bool_prompt():
  """Build the prompt builder that asks whether the context is sufficient.

  The template shows each retrieved document's facts (``document.content``)
  and verdict (``document.meta['verdict']``), then asks for a Yes/No
  judgement on whether the question can be fully answered from them.

  Returns:
      PromptBuilder: builder expecting the ``documents`` and ``question``
      template variables.
  """
  template = """
      Based on the following documents, do you have enough information to provide a reliable answer to the question?

      Context:
      {% for document in documents %}
          Facts: {{ document.content }}
          Verdict: {{ document.meta['verdict'] }}
      {% endfor %}

      Question: {{ question }}
      Answer: Do I have enough information to fully answer the question? Yes or No.
      """
  prompt_builder = PromptBuilder(template=template)
  return prompt_builder

In [None]:

def create_docstore():
  """Create the in-memory document store used for embedding retrieval.

  BM25 is a keyword-ranking algorithm, not an embedding similarity function;
  the Haystack documentation lists ``"dot_product"`` and ``"cosine"`` as the
  accepted values for ``embedding_similarity_function``. ``"dot_product"``
  (the library default) is used here.
  NOTE(review): switch to "cosine" if normalised scores are wanted.

  Returns:
      InMemoryDocumentStore: an empty document store.
  """
  document_store = InMemoryDocumentStore(embedding_similarity_function="dot_product")
  return document_store

In [None]:

def create_pipeline():
  """Assemble the basic RAG pipeline and return it.

  Wiring: text embedder -> embedding retriever -> prompt builder -> LLM.

  The components are read from the notebook-level names ``text_embedder``,
  ``retriever``, ``prompt_builder`` and ``generator``, which must already
  exist in the kernel when this function is called.

  Returns:
      Pipeline: the connected pipeline.
  """
  basic_rag_pipeline = Pipeline()
  # Add components to your pipeline
  basic_rag_pipeline.add_component("text_embedder", text_embedder)
  basic_rag_pipeline.add_component("retriever", retriever)
  basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
  basic_rag_pipeline.add_component("llm", generator)

  # Connect the components to each other
  basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
  basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
  basic_rag_pipeline.connect("prompt_builder", "llm")

  # Hand the pipeline back; without this the caller has nothing to run.
  return basic_rag_pipeline

In [None]:

def Initalize_doc_em():
  """Create and warm up the document embedder.

  Returns:
      SentenceTransformersDocumentEmbedder: embedder for the
      ``sentence-transformers/all-MiniLM-L6-v2`` model, already warmed up.
  """
  doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
  doc_embedder.warm_up()
  # Return the embedder; otherwise it is discarded when the function exits.
  return doc_embedder

In [None]:
def write_doc_Store(doc):
  """Embed the given documents and write them to the document store.

  Uses the notebook-level ``doc_embedder`` and ``document_store``, which must
  already exist in the kernel.

  Args:
      doc: the documents to embed and store (passed straight to
          ``doc_embedder.run``).
  """
  # Embed the argument itself (the original read an unrelated name, `docs`).
  docs_with_embeddings = doc_embedder.run(doc)
  document_store.write_documents(docs_with_embeddings["documents"])

In [None]:

def in_text_em():
  """Create and warm up the text (query) embedder.

  Returns:
      SentenceTransformersTextEmbedder: embedder for the
      ``sentence-transformers/all-MiniLM-L6-v2`` model, already warmed up.
  """
  text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
  text_embedder.warm_up()  # Warm up the text embedder
  # Return the embedder; otherwise it is discarded when the function exits.
  return text_embedder

In [None]:
def in_retriever(document_store):
  """Create an embedding retriever bound to ``document_store``.

  Args:
      document_store: the document store the retriever searches.

  Returns:
      InMemoryEmbeddingRetriever: retriever over ``document_store``.
  """
  retriever = InMemoryEmbeddingRetriever(document_store)
  return retriever

Check the pipeline

In [None]:
# Smoke-test the assembled pipeline with one question and show the first reply.
# Requires `basic_rag_pipeline` to already exist in the kernel.
question = "write the facts u recive about Leroy Irvis"

# The same question feeds both the embedder and the prompt template.
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
}
response = basic_rag_pipeline.run(pipeline_inputs)

llm_replies = response["llm"]["replies"]
print(llm_replies[0])

main function

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def generate_answer(query, initial_top_k=5, increment_step=5, max_attempts=3):
    """
    Answer ``query`` from retrieved documents, widening the retrieval until the
    LLM judges the context sufficient.

    Each attempt retrieves ``top_k`` documents, asks the LLM whether they are
    enough to answer the question, and, if not, raises ``top_k`` by
    ``increment_step`` and tries again. Once the LLM answers "Yes", the final
    answer is generated from the last retrieved set and printed. If no attempt
    succeeds, a failure message is printed instead.

    Uses the notebook-level ``text_embedder``, ``retriever``, ``generator`` and
    ``prompt_builder``, which must already exist in the kernel.

    Parameters:
    - query (str): The input query for retrieval.
    - initial_top_k (int): The initial number of documents to retrieve.
    - increment_step (int): The number of additional documents to retrieve if needed.
    - max_attempts (int): The maximum number of attempts to retrieve more documents.

    Returns:
    - None. The assessment replies and the generated answer are printed.
    """
    # The query never changes between attempts, so embed it only once.
    query_embedding = text_embedder.run(query)["embedding"]

    # The assessment template is constant too; build its prompt builder once.
    assessment_prompt_template = """
        Based on the following documents, do you have enough information to provide a reliable answer to the question?

        Context:
        {% for document in documents %}
            Facts: {{ document.content }}
            Verdict: {{ document.meta['verdict'] }}
        {% endfor %}

        Question: {{ question }}
        Answer: Do I have enough information to fully answer the question? Yes or No.
        """
    assessment_prompt_builder = PromptBuilder(template=assessment_prompt_template)

    top_k = initial_top_k
    attempt = 0
    is_enough_info = False
    retrieved_docs = []

    while not is_enough_info and attempt < max_attempts:
        attempt += 1

        # Retrieve top_k relevant documents
        retrieval_results = retriever.run(query_embedding=query_embedding, top_k=top_k)
        retrieved_docs = retrieval_results["documents"]

        # Ask the LLM if there is enough information
        assessment_prompt = assessment_prompt_builder.run(documents=retrieved_docs, question=query)["prompt"]
        assessment_result = generator.run(prompt=assessment_prompt)
        assessment_replies = assessment_result.get("replies") or []
        assessment_reply = assessment_replies[0] if assessment_replies else ""

        print("Assessment Result:")
        print(assessment_reply)

        # Case-insensitive so that "yes" / "YES" count as well as "Yes".
        if "yes" in assessment_reply.lower():
            is_enough_info = True
        else:
            print(f"Not enough information, attempting to retrieve more documents (Attempt {attempt})...")
            top_k += increment_step

    if not is_enough_info:
        print("Unable to retrieve enough information after multiple attempts.")
        return

    temperature = 0.2

    # Build the final prompt from the last retrieved documents and the query.
    final_prompt = prompt_builder.run(
        documents=retrieved_docs,
        question=query,
    )["prompt"]

    # The sampling temperature is a generation setting, so it is passed to the
    # generator; handing it to the prompt builder only created an unused
    # template variable.
    result = generator.run(
        prompt=final_prompt,
        generation_kwargs={"temperature": temperature},
    )

    # Extract and print only the replies
    replies = result.get("replies", [])
    if replies:
        for reply in replies:
            print("Generated Answer:")
            print(reply)
    else:
        print("No replies found")

# Example: start with 5 documents and add 2 more per retry, for up to 5 attempts.
query = "If I stole but expressed remorse, is there a difference in punishment because I expressed remorse? What was the punishment if I didn't express remorse?"
generate_answer(
    query,
    initial_top_k=5,
    increment_step=2,
    max_attempts=5,
)


Check the similarity between the query and the retrieved documents

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def generate_answer(query, top_k=1):
    """
    Retrieve documents and print similarity scores between the query and each document's facts.

    NOTE(review): this reuses the name ``generate_answer`` from the earlier
    cell, so running this cell replaces that function in the kernel.

    Uses the notebook-level ``text_embedder`` and ``retriever``, which must
    already exist in the kernel.

    Parameters:
    - query (str): The input query for retrieval.
    - top_k (int): The maximum number of documents to retrieve.

    Returns:
    - None. One cosine-similarity score per retrieved document is printed.
    """
    def _as_vector(values):
        # Coerce any embedding (list or array, any rank) to a flat 1-D array.
        return np.asarray(values).reshape(-1)

    # Embed the query
    raw_query_embedding = text_embedder.run(query)["embedding"]
    query_embedding = _as_vector(raw_query_embedding)

    # Retrieve top_k relevant documents. The retriever receives the embedder's
    # own output unchanged rather than a numpy copy of it.
    retrieval_results = retriever.run(query_embedding=raw_query_embedding, top_k=top_k)
    retrieved_docs = retrieval_results["documents"]

    query_embedding_reshaped = query_embedding.reshape(1, -1)

    # Calculate and print the similarity measure between the query and each document's facts
    print("Similarity Scores for Retrieved Documents:")
    for i, doc in enumerate(retrieved_docs, 1):
        # Haystack keeps a document's vector on `Document.embedding`; the
        # `meta` lookup is kept only as a fallback.
        stored_embedding = getattr(doc, "embedding", None)
        if stored_embedding is None:
            stored_embedding = doc.meta.get("embedding")
        if stored_embedding is None:
            # No stored vector came back with the document: embed its text now.
            stored_embedding = text_embedder.run(doc.content)["embedding"]

        document_embedding_reshaped = _as_vector(stored_embedding).reshape(1, -1)
        similarity_score = cosine_similarity(query_embedding_reshaped, document_embedding_reshaped)[0][0]

        print(f"Document {i}:")
        print(f"Similarity Score: {similarity_score:.2f}")
       # print(f"Facts: {doc.content}")
       # print(f"Verdict: {doc.meta.get('verdict', 'No Verdict Available')}\n")

# Example: score the 30 closest documents against a deliberately off-topic query.
query = "What is the minimum punishment for eating ice cream?"
generate_answer(
    query,
    top_k=30,
)




The most generic way to initialize the program

In [None]:

# End-to-end setup: document store -> data -> embeddings -> retriever ->
# prompt -> generator -> pipeline. The names defined here (`text_embedder`,
# `retriever`, `prompt_builder`, `generator`, `basic_rag_pipeline`, ...) are
# the ones the other cells of this notebook use.

#Initializing the DocumentStore
from haystack.document_stores.in_memory import InMemoryDocumentStore
# "bm25" is a keyword-ranking algorithm, not an embedding similarity function;
# the Haystack documentation lists "dot_product" and "cosine" as the accepted
# values. "dot_product" is the library default.
# NOTE(review): switch to "cosine" if normalised scores are wanted.
document_store = InMemoryDocumentStore(embedding_similarity_function="dot_product")
#----------------------------------------
#fetch the data
from datasets import load_dataset
from haystack import Document

# In both datasets the case text becomes the document content and the
# verdict/summary is stored under meta["verdict"].
dataset1 = load_dataset("mvbhat/verdicts", split='train')
docs1 = [Document(content=doc["facts"], meta={"verdict": doc["verdict"]}) for doc in dataset1]
dataset2 = load_dataset("macadeliccc/US-SupremeCourtVerdicts", split='train[:500]')
docs2 = [Document(content=doc["document"], meta={"verdict": doc["summary"]}) for doc in dataset2]
docs = docs1 + docs2


#----------------------------------------
#Initalize a Document Embedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()
#---------------------------------------
#Write Documents to the DocumentStore
docs_with_embeddings = doc_embedder.run(docs)
document_store.write_documents(docs_with_embeddings["documents"])

#---------------------------------------
#Initialize a Text Embedder
from haystack.components.embedders import SentenceTransformersTextEmbedder

# Same model as the document embedder.
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
text_embedder.warm_up()  # Warm up the text embedder
#---------------------------------------
#Initialize the Retriever
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

retriever = InMemoryEmbeddingRetriever(document_store)
#--------------------------------------
#Define a Template Prompt
from haystack.components.builders import PromptBuilder
# Facts live in `document.content` and the verdict in
# `document.meta['verdict']` (see the Document construction above); there is
# no `meta['facts']` key.
template = """
  Based on the following facts and verdict only

  {% for document in documents %}
      Facts: {{ document.content }}
      Verdict: {{ document.meta['verdict'] }}

  {% endfor %}
  Please answer the question deliberately and do not add additional details:
  Question: {{question}}

  """
prompt_builder = PromptBuilder(template=template)
#-------------------------------------
#Initialize a Generator
import os
from getpass import getpass
from haystack.components.generators import OpenAIGenerator

# Ask for the key only when the environment does not already provide it.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
generator = OpenAIGenerator(model="gpt-3.5-turbo")

#--------------------------------
# build the pipeline
from haystack import Pipeline

basic_rag_pipeline = Pipeline()
# Add components to your pipeline
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
basic_rag_pipeline.add_component("llm", generator)

# Connect the components to each other
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")