In [1]:
!pip install transformers datasets faiss-cpu langchain langchain-community

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.4-py3-none-any.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.6-py3-n

In [4]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from datasets import load_dataset
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain import LLMChain

# Step 1: Load the Full Train Split in Streaming Mode
dataset = load_dataset("trivia_qa", "rc", split="train", streaming=True)

# Step 2: Initialize Smaller Model Components
# Using distilbert-base-uncased-distilled-squad as a smaller question-answering model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

# Set up the pipeline for question-answering
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define a function to generate a response using the question-answering pipeline
def generate_response(query):
    # Process only the first 100 entries from the streaming dataset
    for i, entry in enumerate(dataset):
        if i >= 100:  # Limit to 100 entries
            break
        context = entry["question"]  # TriviaQA questions are used here as context for simplicity
        result = qa_pipeline({
            "question": query,
            "context": context
        })
        print("Context:", context)
        print("Answer:", result['answer'])
        return result['answer']  # Return only the first answer for simplicity

# Test the pipeline
if __name__ == "__main__":
    query = "Explain retrieval-augmented generation."
    print("Response:", generate_response(query))


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]



Context: Which American-born Sinclair won the Nobel Prize for Literature in 1930?
Answer: Which American-born Sinclair
Response: Which American-born Sinclair


In [5]:
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain

# Step 1: Load the dataset
dataset = load_dataset("trivia_qa", "rc", split="train", streaming=True)

# Step 2: Initialize Embedding and Question-Answering Models
# Smaller model for embeddings
embed_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
embed_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Question-answering model
qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

# Step 3: Embed Contexts for Retrieval
def embed_text(text):
    inputs = embed_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = embed_model(**inputs)
    # Use mean pooling to obtain a fixed-size embedding
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Embed the first 100 contexts from the dataset
contexts = []
embeddings = []
for i, entry in enumerate(dataset):
    if i >= 100:
        break
    context = entry["question"]
    contexts.append(context)
    embeddings.append(embed_text(context))

# Convert embeddings to a single numpy array for similarity comparisons
embeddings = np.vstack(embeddings)

# Step 4: Define a Function for Retrieving the Most Similar Context
def retrieve_context(query, contexts, embeddings, top_k=3):
    query_embedding = embed_text(query)
    # Calculate cosine similarity with all contexts
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    # Get indices of top k most similar contexts
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    # Retrieve the top k contexts
    return [contexts[idx] for idx in top_indices]

# Step 5: Set Up LangChain Prompt Template
prompt_template = PromptTemplate(
    input_variables=["query", "context"],
    template="Question: {query}\nContext: {context}\nAnswer:"
)
llm_chain = LLMChain(prompt=prompt_template, llm=HuggingFacePipeline(pipeline=qa_pipeline))

# Step 6: Generate Response with Iterative Retrieval-Augmented Generation
def generate_response(query):
    # Retrieve top 3 contexts for the query
    top_contexts = retrieve_context(query, contexts, embeddings)

    # Initialize an empty answer
    final_answer = ""

    # Iteratively use retrieved contexts to improve answer
    for context in top_contexts:
        result = qa_pipeline({
            "question": query,
            "context": context
        })
        answer = result["answer"]
        print("Context:", context)
        print("Answer:", answer)

        # Concatenate answers for demonstration
        final_answer += answer + " "

    return final_answer.strip()

# Test the pipeline
if __name__ == "__main__":
    query = "Explain retrieval-augmented generation."
    print("Final Response:", generate_response(query))


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Context: Banting and Best pioneered the use of what?
Answer: Banting and Best pioneered the use of what
Context: In what decade were video recorders first developed?
Answer: In what decade
Context: In which country did he widespread use of ISDN begin in 1988?
Answer: 1988
Final Response: Banting and Best pioneered the use of what In what decade 1988
