In [27]:
# Install necessary libraries
!pip install pinecone-client langchain pandas numpy jsonlines pdfplumber sentence-transformers transformers

# Import required libraries
from pinecone import Pinecone, ServerlessSpec
import json
import pdfplumber
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, pipeline
from tqdm import tqdm
import pandas as pd

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
# Initialize Pinecone
PINECONE_API_KEY = "**************************************"
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "rag-textbook-index"

# Create Pinecone index if it doesn't exist
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
    name=index_name,
    dimension=384,  # Match the dimension of SentenceTransformer embeddings
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

index_info = pc.describe_index(index_name)
host = index_info.host
index = pc.Index(index_name, host=host)
print(f'Connected to Pinecone index: {index_name}')

Connected to Pinecone index: rag-textbook-index


In [47]:
index_info = pc.describe_index(index_name)
print(f"Index dimension: {index_info.dimension}")

Index dimension: 384


In [48]:
# Load queries from JSON file
queries_file = "/content/drive/MyDrive/RAG_Project/Dataset/queries.json"
with open(queries_file, "r") as f:
    queries = json.load(f)

# Display sample queries
print("Sample queries:")
print(json.dumps(queries[:2], indent=4))  # Display first two queries

Sample queries:
[
    {
        "query_id": "1",
        "question": "What is the scientific method in psychology?"
    },
    {
        "query_id": "2",
        "question": "What are the basic parts of a neuron?"
    }
]


In [30]:
# Load PDF content using pdfplumber
pdf_file = "/content/drive/MyDrive/RAG_Project/Dataset/book.pdf"
book_content = []
with pdfplumber.open(pdf_file) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        book_content.append({"page": i+1, "text": text})

print("Sample book content:")
print(book_content[:3])  # Display first three pages

Sample book content:
[{'page': 1, 'text': ''}, {'page': 2, 'text': ''}, {'page': 3, 'text': 'Psychology 2e\nSENIOR CONTRIBUTING AUTHORS\nROSE M. SPIELMAN, FORMERLY OF QUINNIPIAC UNIVERSITY\nWILLIAM J. JENKINS, MERCER UNIVERSITY\nMARILYN D. LOVETT, SPELMAN COLLEGE'}]


In [49]:
# Define chunking parameters
chunk_size = 512  # Max number of tokens per chunk
overlap = 50  # Number of overlapping tokens between chunks

# Function to chunk text
def chunk_text(text, chunk_size, overlap):
    tokens = text.split()
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = " ".join(tokens[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

In [50]:

# Generate chunks from book content
chunks = []
for page in book_content:
    page_text = page.get("text", "")
    page_number = page.get("page", "N/A")
    if page_text.strip():  # Skip empty pages
        page_chunks = chunk_text(page_text, chunk_size, overlap)
        for chunk in page_chunks:
            chunks.append({"text": chunk, "metadata": {"page": page_number}})

print("Sample chunks:")
for chunk in chunks[:2]:
    print(chunk)

Sample chunks:
{'text': 'Psychology 2e SENIOR CONTRIBUTING AUTHORS ROSE M. SPIELMAN, FORMERLY OF QUINNIPIAC UNIVERSITY WILLIAM J. JENKINS, MERCER UNIVERSITY MARILYN D. LOVETT, SPELMAN COLLEGE', 'metadata': {'page': 3}}
{'text': 'OpenStax Rice University 6100 Main Street MS-375 Houston, Texas 77005 To learn more about OpenStax, visit https://openstax.org. Individual print copies and bulk orders can be purchased through our website. ©2020 Rice University. Textbook content produced by OpenStax is licensed under a Creative Commons Attribution 4.0 International License (CC BY 4.0). Under this license, any user of this textbook or the textbook contents herein must provide proper attribution as follows: - If you redistribute this textbook in a digital format (including but not limited to PDF and HTML), then you must retain on every page the following attribution: “Access for free at openstax.org.” - If you redistribute this textbook in a print format, then you must include on every physical p

In [51]:
# Initialize Sentence Transformers model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and effective model

# Function to generate embeddings
def get_embedding(text):
    return embedding_model.encode(text)

In [52]:
# Upsert chunks into Pinecone with embeddings
for i, chunk in enumerate(chunks):
    chunk_text = chunk["text"]
    chunk_metadata = chunk["metadata"]

    embedding = get_embedding(chunk_text)
    chunk_metadata["text"] = chunk_text

    index.upsert([
        {
            "id": f"chunk-{i+1}",
            "values": embedding,
            "metadata": chunk_metadata
        }
    ])

    if (i + 1) % 50 == 0:
        print(f"Processed {i + 1}/{len(chunks)} chunks...")

print("All chunks have been embedded and stored in Pinecone.")

Processed 50/1100 chunks...
Processed 100/1100 chunks...
Processed 150/1100 chunks...
Processed 200/1100 chunks...
Processed 250/1100 chunks...
Processed 300/1100 chunks...
Processed 350/1100 chunks...
Processed 400/1100 chunks...
Processed 450/1100 chunks...
Processed 500/1100 chunks...
Processed 550/1100 chunks...
Processed 600/1100 chunks...
Processed 650/1100 chunks...
Processed 700/1100 chunks...
Processed 750/1100 chunks...
Processed 800/1100 chunks...
Processed 850/1100 chunks...
Processed 900/1100 chunks...
Processed 950/1100 chunks...
Processed 1000/1100 chunks...
Processed 1050/1100 chunks...
Processed 1100/1100 chunks...
All chunks have been embedded and stored in Pinecone.


In [56]:
def retrieve_context(query, top_k=30, min_score=0.5):
    # Generate embedding for the query
    query_embedding = get_embedding(query).tolist()  # Convert NumPy array to list

    # Query Pinecone index
    results = index.query(
        vector=query_embedding,  # Pass the query embedding as a list
        top_k=top_k,
        include_metadata=True,
        include_values=False
    )

    # Extract retrieved text and scores
    retrieved_chunks = []
    for match in results.get('matches', []):
        score = match.get('score', 0)
        metadata = match.get('metadata', {})

        # Only keep results with a score higher than min_score
        if score >= min_score and 'text' in metadata:
            retrieved_chunks.append({
                "text": metadata["text"],
                "page": metadata.get("page", "N/A"),
                "score": score
            })

    return retrieved_chunks

In [57]:
# Adaptive retrieval for shorter queries
def adaptive_retrieve_context(query):
    short_query = len(query.split()) <= 5
    top_k = 10 if short_query else 30
    return retrieve_context(query, top_k=top_k)

# Function to rerank retrieved context
def rerank_retrieved_context(retrieved_chunks, query):
    query_terms = set(query.lower().split())

    for chunk in retrieved_chunks:
        text_terms = set(chunk["text"].lower().split())
        chunk["keyword_overlap"] = len(query_terms & text_terms)  # Count shared words

    # Sort by highest similarity score + keyword overlap
    retrieved_chunks.sort(key=lambda x: (x["score"], x["keyword_overlap"]), reverse=True)
    return retrieved_chunks[:10]  # Keep only top-ranked results


In [58]:
# Test retrieval with a sample query
sample_query = "What is the scientific method in psychology?"
retrieved_context = retrieve_context(sample_query)

print("Retrieved Context:")
for i, context in enumerate(retrieved_context):
    print(f"{i+1}. Page: {context['page']} | Score: {context['score']:.4f} | Text: {context['text'][:200]}...")

Retrieved Context:
1. Page: 20.0 | Score: 0.6087 | Text: 8 1 • Introduction to Psychology 2002). Nash was the subject of the 2001 movieA Beautiful Mind. Why did these people have these experiences? How does the human brain work? And what is the connection b...
2. Page: 44.0 | Score: 0.5714 | Text: 32 1 • Review Questions an undergraduate education in psychology are useful in a variety of work contexts. Review Questions 1. Which of the following was mentioned as a skill to which psychology stude...
3. Page: 42.0 | Score: 0.5504 | Text: 30 1 • Key Terms Key Terms American Psychological Association (APA) professional organization representing psychologists in the United States behaviorism focus on observing and controlling behavior bi...
4. Page: 50.0 | Score: 0.5421 | Text: 38 2 • Psychological Research applied behavior analysis (ABA), social communication groups, social skills groups, occupational therapy, and even medication options. If Maria asked you for advice or gu...
5. Page: 32.0

In [59]:
# Initialize QA model (FLAN-T5)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
qa_model = pipeline("text2text-generation", model="google/flan-t5-large", device=0)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


In [60]:
# Function to generate answers
def generate_answer(query, retrieved_chunks, max_input_tokens=400):
    context_text = " ".join([chunk["text"] for chunk in retrieved_chunks])
    tokenized_context = tokenizer.encode(context_text, truncation=True, max_length=max_input_tokens, return_tensors="pt")
    trimmed_context = tokenizer.decode(tokenized_context[0], skip_special_tokens=True)
    prompt = (
        "Using the textbook excerpts below, provide a complete and accurate answer to the question. "
        "Ensure your response is well-structured and relevant to the text. "
        "If the textbook does not contain enough information, say so.\n\n"
        f"Textbook Excerpts:\n{trimmed_context}\n\n"
        f"Question: {query}\n\n"
        "Answer in 3-4 sentences:"
    )

    response = qa_model(prompt, max_length=100, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]
    return response

In [61]:
# Test answer generation
sample_query = "What is the scientific method in psychology?"
generated_answer = generate_answer(sample_query, retrieved_context)
print("Generated Answer:")
print(generated_answer)

# Batch processing for all queries
batch_inputs = []
query_ids = []
retrieved_contexts = []

for query in tqdm(queries, desc="🔍 Retrieving Contexts"):
    query_id = query["query_id"]
    question = query["question"]

    retrieved_context = adaptive_retrieve_context(question)
    retrieved_context = rerank_retrieved_context(retrieved_context, question)

    referenced_pages = list(set(str(chunk["page"]) for chunk in retrieved_context if "page" in chunk))
    references_json = json.dumps({"sections": [], "pages": referenced_pages})

    context_text = " ".join([chunk["text"] for chunk in retrieved_context])
    tokenized_context = tokenizer.encode(context_text, truncation=True, max_length=400, return_tensors="pt")
    trimmed_context = tokenizer.decode(tokenized_context[0], skip_special_tokens=True)

    prompt = (
        "Using the textbook excerpts below, provide a complete and accurate answer to the question. "
        "Ensure your response is well-structured and relevant to the text. "
        "If the textbook does not contain enough information, say so.\n\n"
        f"Textbook Excerpts:\n{trimmed_context}\n\n"
        f"Question: {question}\n\n"
        "Answer in 3-4 sentences:"
    )

    # Store inputs for batch processing
    batch_inputs.append(prompt)
    query_ids.append(query_id)
    retrieved_contexts.append((context_text, references_json))


Generated Answer:
Scientists test that which is perceivable and measurable.


🔍 Retrieving Contexts: 100%|██████████| 50/50 [00:04<00:00, 10.17it/s]


In [62]:
# Generate answers in batch
print("Generating Answers in batch...")
batch_answers = qa_model(batch_inputs, max_length=100, do_sample=True, temperature=0.7, top_p=0.9)

# Convert results to submission format
submission_data = []
for i in range(len(query_ids)):
    submission_data.append({
        "ID": query_ids[i],
        "context": retrieved_contexts[i][0],
        "answer": batch_answers[i]["generated_text"],
        "references": retrieved_contexts[i][1]
    })

Generating Answers in batch...


In [63]:
# Save submission file
submission_df = pd.DataFrame(submission_data)
submission_df.to_csv("/content/drive/MyDrive/RAG_Project/Dataset/submission.csv", index=False)
print("Submission file saved as submission.csv.")

Submission file saved as submission.csv.


In [67]:
submission_df.iloc[1]

Unnamed: 0,1
ID,2
context,3.2 • Cells of the Nervous System 79 the other...
answer,A neuron’s outer surface is made up of asemipe...
references,"{""sections"": [], ""pages"": [""97.0"", ""94.0"", ""91..."
