In [1]:
# !pip install transformers torch faiss-cpu pdfplumber sentence-transformers


Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.0 which is incompatible.


In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from transformers import pipeline

# Initialize ChromaDB client
def initialize_chromadb():
    """Open (or create) the on-disk ChromaDB store and its paragraph collection.

    Returns:
        tuple: ``(client, collection)`` where ``client`` persists its data
        under ``./chromadb_store`` and ``collection`` is the
        ``"pdf_embeddings"`` collection (created on first use).
    """
    persist_directory = "./chromadb_store"  # Directory to persist the data

    if hasattr(chromadb, "PersistentClient"):
        # Chroma >= 0.4 rejects the legacy
        # Settings(chroma_db_impl="duckdb+parquet") configuration;
        # PersistentClient is the supported way to get an on-disk store.
        client = chromadb.PersistentClient(path=persist_directory)
    else:
        # Older Chroma releases only understand the legacy Settings form.
        client = chromadb.Client(Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=persist_directory
        ))

    collection = client.get_or_create_collection("pdf_embeddings")
    return client, collection

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: The extracted text of each page, each followed by a newline.
        Pages that yield no text contribute only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. image-only pages on older pdfplumber releases); treat that as
        # an empty page instead of failing on `None + "\n"`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(page_text + "\n" for page_text in page_texts)

# Step 2: Split text into paragraphs
def split_into_paragraphs(text):
    """Break ``text`` on newline characters and return the non-blank pieces.

    Every piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. Order is preserved.
    """
    pieces = []
    for raw_line in text.split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

# Step 3: Embed and Store Paragraphs in ChromaDB
def store_paragraphs_in_chromadb(paragraphs, collection, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed ``paragraphs`` and add them, with their vectors, to ``collection``.

    Args:
        paragraphs: List of paragraph strings to index.
        collection: ChromaDB collection that receives the records.
        model_name: Name of the SentenceTransformer model used for embedding.

    Returns:
        The SentenceTransformer instance, so callers can embed queries with
        the same model.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(paragraphs, convert_to_tensor=False)

    # Add in modest batches rather than one call per paragraph; the batch is
    # kept small so a large PDF stays under Chroma's per-call batch limit.
    batch_size = 500
    total = len(paragraphs)
    for start in range(0, total, batch_size):
        positions = range(start, min(start + batch_size, total))
        collection.add(
            # `ids` is mandatory for Collection.add; the paragraph's position
            # is used as a stable string identifier.
            ids=[str(idx) for idx in positions],
            documents=[paragraphs[idx] for idx in positions],
            metadatas=[{"id": idx} for idx in positions],
            embeddings=[embeddings[idx].tolist() for idx in positions],
        )
    print(f"Stored {len(paragraphs)} paragraphs in ChromaDB.")
    return model

# Step 4: Retrieve Relevant Paragraphs
def retrieve_relevant_paragraphs(query, collection, model, top_k=3):
    """Look up the stored paragraphs closest to ``query`` in ``collection``.

    The query is embedded with ``model`` and passed to ``collection.query``
    asking for ``top_k`` results; the documents returned for that single
    query are handed back.
    """
    encoded_query = model.encode([query], convert_to_tensor=False)
    hits = collection.query(
        query_embeddings=encoded_query.tolist(),
        n_results=top_k,
    )
    # Only one query was submitted, so only the first result list is relevant.
    return hits["documents"][0]

# Step 5: Answer Questions Using Retrieved Paragraphs
_QA_PIPELINE_CACHE = {}


def _get_qa_pipeline(model_name):
    """Return the question-answering pipeline for ``model_name``, building it at most once."""
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINE_CACHE[model_name]


def answer_question_with_rag(question, paragraphs):
    """Run extractive QA over each paragraph and keep the highest-scoring answer.

    Args:
        question: The question to answer.
        paragraphs: Candidate context paragraphs (strings).

    Returns:
        dict: ``{"score", "paragraph", "answer"}`` for the best result. The
        score stays ``0`` with empty strings when no paragraph produced an
        answer scoring above zero.
    """
    best_answer = {"score": 0, "paragraph": "", "answer": ""}
    if not paragraphs:
        # Nothing to read: skip loading the model altogether.
        return best_answer

    # Building the pipeline loads the full model, so reuse it across calls
    # instead of constructing a new one for every question.
    qa_pipeline = _get_qa_pipeline("bert-large-uncased-whole-word-masking-finetuned-squad")

    for para in paragraphs:
        try:
            result = qa_pipeline(question=question, context=para)
        except Exception as e:
            # Best effort: a paragraph the pipeline cannot process is reported
            # and skipped rather than aborting the whole question.
            print(f"Error processing paragraph: {e}")
            continue
        if result['score'] > best_answer['score']:
            best_answer = {"score": result['score'], "paragraph": para, "answer": result['answer']}

    return best_answer

# Main Function
# Script entry point: index one PDF in ChromaDB, then answer a single question
# typed by the user. All names assigned below are module-level, so they remain
# available to later cells of the notebook.
if __name__ == "__main__":
    pdf_path = "example.pdf"  # Replace with your PDF file path

    # Initialize ChromaDB
    print("\nInitializing ChromaDB...")
    chromadb_client, collection = initialize_chromadb()

    # Extract and process text
    # (split_into_paragraphs yields the non-blank, stripped lines of the text)
    print("\nExtracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    paragraphs = split_into_paragraphs(text)

    # Store paragraphs in ChromaDB
    # The embedding model is returned so the question is embedded with the
    # same model as the paragraphs.
    print("\nStoring paragraphs in ChromaDB...")
    embedding_model = store_paragraphs_in_chromadb(paragraphs, collection)

    # Ask the user for a question, then retrieve relevant paragraphs

    question = input("Enter your question: ")

    print("\nRetrieving relevant paragraphs...")
    relevant_paragraphs = retrieve_relevant_paragraphs(question, collection, embedding_model)

    # Answer the question
    print("\nAnswering the question...")
    result = answer_question_with_rag(question, relevant_paragraphs)

    # Display the results
    # answer_question_with_rag starts from a score of 0, so a score of 0 means
    # no paragraph produced a usable answer.
    if result['score'] > 0:
        print("\nAnswer:", result['answer'])
        print("\nRelevant Paragraph:", result['paragraph'])
    else:
        print("\nNo relevant answer found.")


In [3]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: The extracted text of each page, each followed by a newline.
        Pages that yield no text contribute only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. image-only pages on older pdfplumber releases); treat that as
        # an empty page instead of failing on `None + "\n"`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(page_text + "\n" for page_text in page_texts)

# Step 2: Split text into paragraphs
def split_into_paragraphs(text):
    """Return the stripped, non-empty newline-separated segments of ``text``."""
    stripped_segments = map(str.strip, text.split('\n'))
    return [segment for segment in stripped_segments if segment]

# Step 3: Create embeddings for paragraphs
def create_paragraph_embeddings(paragraphs, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Encode ``paragraphs`` with a freshly loaded SentenceTransformer.

    Returns:
        tuple: ``(embeddings, model)``, the encoder's output for
        ``paragraphs`` (requested with ``convert_to_tensor=False``) and the
        encoder itself, so the caller can embed queries with the same model.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(paragraphs, convert_to_tensor=False), encoder

# Step 4: Build a FAISS index
def build_faiss_index(embeddings):
    """Build an exact L2 FAISS index over a matrix of embeddings.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or is not two-dimensional.
    """
    import numpy as np

    # FAISS expects a C-contiguous float32 matrix; coerce so that lists or
    # float64 arrays are accepted as well.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_vectors, dim); "
            f"got shape {matrix.shape}"
        )

    dim = matrix.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(matrix)
    return index

# Step 5: Retrieve relevant paragraphs
def retrieve_relevant_paragraphs(query, paragraphs, index, model, top_k=3):
    """Return the paragraphs whose embeddings are nearest to ``query``.

    Args:
        query: Question text to embed and search for.
        paragraphs: The paragraph list the index was built from; positions in
            this list are the ids returned by the index.
        index: FAISS index exposing ``search(vectors, k)``.
        model: Embedding model exposing ``encode``.
        top_k: Maximum number of paragraphs to return.

    Returns:
        list: Up to ``top_k`` paragraphs, in the order the index returned them.
        Empty when there are no paragraphs or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are paragraphs.
    k = min(top_k, len(paragraphs))
    if k <= 0:
        return []

    query_embedding = model.encode([query], convert_to_tensor=False)
    _, indices = index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1; without this filter Python's
    # negative indexing would silently return the last paragraph as a hit.
    return [
        paragraphs[int(i)]
        for i in indices[0]
        if 0 <= int(i) < len(paragraphs)
    ]

# Step 6: Answer questions using retrieved paragraphs
_QA_PIPELINE_CACHE = {}


def _get_qa_pipeline(model_name):
    """Return the question-answering pipeline for ``model_name``, building it at most once."""
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINE_CACHE[model_name]


def answer_question_with_rag(question, paragraphs):
    """Run extractive QA over each paragraph and keep the highest-scoring answer.

    Args:
        question: The question to answer.
        paragraphs: Candidate context paragraphs (strings).

    Returns:
        dict: ``{"score", "paragraph", "answer"}`` for the best result. The
        score stays ``0`` with empty strings when no paragraph produced an
        answer scoring above zero.
    """
    best_answer = {"score": 0, "paragraph": "", "answer": ""}
    if not paragraphs:
        # Nothing to read: skip loading the model altogether.
        return best_answer

    # Building the pipeline loads the full model, so reuse it across calls
    # (e.g. when the query cell is re-run) instead of rebuilding it each time.
    qa_pipeline = _get_qa_pipeline("bert-large-uncased-whole-word-masking-finetuned-squad")

    for para in paragraphs:
        try:
            result = qa_pipeline(question=question, context=para)
        except Exception as e:
            # Best effort: a paragraph the pipeline cannot process is reported
            # and skipped rather than aborting the whole question.
            print(f"Error processing paragraph: {e}")
            continue
        if result['score'] > best_answer['score']:
            best_answer = {"score": result['score'], "paragraph": para, "answer": result['answer']}

    return best_answer

# Main Function
# Script entry point: read a question, index one PDF with FAISS, and answer the
# question from the closest paragraphs. The names assigned here are
# module-level; the next cell reads `question`, `paragraphs`, `faiss_index` and
# `embedding_model` from this block.
if __name__ == "__main__":
    pdf_path = "example1.pdf"  # Replace with your PDF file path
    # The question is requested up front, before the PDF is processed.
    question = input("Enter your question: ")

    # Extract and process text
    # (split_into_paragraphs yields the non-blank, stripped lines of the text)
    print("\nExtracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    paragraphs = split_into_paragraphs(text)

    # Create embeddings and build FAISS index
    # The embedding model is kept so the question is embedded with the same
    # model as the paragraphs.
    print("\nCreating embeddings and building FAISS index...")
    embeddings, embedding_model = create_paragraph_embeddings(paragraphs)
    faiss_index = build_faiss_index(embeddings)

    # Retrieve relevant paragraphs
    print("\nRetrieving relevant paragraphs...")
    relevant_paragraphs = retrieve_relevant_paragraphs(question, paragraphs, faiss_index, embedding_model)

    # Answer the question
    print("\nAnswering the question...")
    result = answer_question_with_rag(question, relevant_paragraphs)

    # Display the results
    # answer_question_with_rag starts from a score of 0, so a score of 0 means
    # no paragraph produced a usable answer.
    if result['score'] > 0:
        print("\nAnswer:", result['answer'])
        print("\nRelevant Paragraph:", result['paragraph'])
    else:
        print("\nNo relevant answer found.")



Extracting text from PDF...

Creating embeddings and building FAISS index...

Retrieving relevant paragraphs...

Answering the question...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



Answer: Long Short-Term Memory

Relevant Paragraph: LSTM Long Short-Term Memory


In [4]:
# NOTE: this cell only re-runs the retrieval and answering stages. It assigns
# none of `question`, `paragraphs`, `faiss_index` or `embedding_model` itself
# (the setup below is commented out), so those names must already exist in the
# kernel from the previous cell; on a fresh kernel this cell raises NameError.
# pdf_path = "example.pdf"  # Replace with your PDF file path
# question = input("Enter your question: ")

# # Extract and process text
# print("\nExtracting text from PDF...")
# text = extract_text_from_pdf(pdf_path)
# paragraphs = split_into_paragraphs(text)

# # Create embeddings and build FAISS index
# print("\nCreating embeddings and building FAISS index...")
# embeddings, embedding_model = create_paragraph_embeddings(paragraphs)
# faiss_index = build_faiss_index(embeddings)

# Retrieve relevant paragraphs
print("\nRetrieving relevant paragraphs...")
relevant_paragraphs = retrieve_relevant_paragraphs(question, paragraphs, faiss_index, embedding_model)

# Answer the question
print("\nAnswering the question...")
result = answer_question_with_rag(question, relevant_paragraphs)

# Display the results
# A score of 0 is the starting value in answer_question_with_rag, i.e. no
# paragraph produced a usable answer.
if result['score'] > 0:
    print("\nAnswer:", result['answer'])
    print("\nRelevant Paragraph:", result['paragraph'])
else:
    print("\nNo relevant answer found.")


Extracting text from PDF...

Creating embeddings and building FAISS index...

Retrieving relevant paragraphs...

Answering the question...


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



Answer: to improve performance

Relevant Paragraph: to improve performance, such as making the RNN bidirectional and/or stateful.
