<a href="https://colab.research.google.com/github/romenlaw/llm_playground/blob/main/pdf_playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install pymupdf sentence-transformers faiss-cpu

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [4]:
# `userdata` comes from the google.colab package, so this cell is expected to
# run inside a Colab runtime.
from google.colab import userdata
# Look up the value stored under the name 'openai_key'; a later cell passes it
# to the OpenAI client as the API key.
openai_key = userdata.get('openai_key')

In [20]:
import fitz  # PyMuPDF for reading PDFs
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai

# Initialize the embedding model once at module level. embed_chunks() and
# search_chunks() both encode text with this same instance, so document chunks
# and queries are embedded by the same model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to extract and chunk text from a PDF
def extract_and_chunk_pdf(pdf_path, chunk_size=500):
    """Read a PDF and split each page's text into fixed-size character chunks.

    Every page is sliced independently, so a chunk never spans a page boundary
    and the last chunk of a page may be shorter than ``chunk_size``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of text chunks in page order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Validate up front: a step of 0 makes range() fail with an unrelated
    # message, and a negative step would silently yield no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Open the document as a context manager so the underlying file is closed
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return [
            text[start:start + chunk_size]
            for text in (page.get_text("text") for page in doc)
            for start in range(0, len(text), chunk_size)
        ]

# Function to generate embeddings for text chunks
def embed_chunks(chunks):
    """Encode text chunks into embedding vectors with the shared model.

    Args:
        chunks: List of strings to embed.

    Returns:
        A float32 NumPy array holding one embedding per chunk, in input order.
    """
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    # encode() was asked for a NumPy array already, so asarray() avoids the
    # extra full copy np.array() would make; float32 is the dtype FAISS stores.
    return np.asarray(embeddings, dtype=np.float32)

# Function to create FAISS index for similarity search
def create_faiss_index(embeddings):
    """Build a flat L2-distance FAISS index over the given vectors.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, the result of
            embedding an empty list of chunks).
    """
    # FAISS works on C-contiguous float32 matrices; normalise once here.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        # Without this check, shape[1] fails with an opaque IndexError.
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dim), got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])  # L2 distance for similarity
    index.add(vectors)  # Add embeddings to index
    return index

# Function to find relevant chunk based on query
def search_chunks(query, chunks, index, embeddings, k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question or search text to embed.
        chunks: Chunk texts, in the same order as the vectors added to ``index``.
        index: FAISS index to search.
        embeddings: Unused; kept so existing four-argument calls keep working.
        k: Maximum number of chunks to return (default 3).

    Returns:
        Up to ``k`` chunks ordered from nearest to farthest. An empty list is
        returned when there are no chunks or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks to return.
    top_k = min(k, len(chunks))
    if top_k <= 0:
        return []

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing results with -1, which as a list index would silently
    # select the last chunk; keep only positions that really exist.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Function to query AI model based on retrieved text
def ask_ai_about_pdf(relevant_chunks, question, api_key, model="gpt-4o"):
    """Ask an OpenAI chat model a question using retrieved document excerpts.

    Args:
        relevant_chunks: Iterable of text chunks to present as the document
            excerpt; they are joined with newlines.
        question: The user's question about the document.
        api_key: OpenAI API key used to create the client.
        model: Name of the chat model to query (default "gpt-4o").

    Returns:
        The message content of the first choice in the completion response.
    """
    content = "\n".join(relevant_chunks)
    client = openai.OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an AI that answers questions based on a provided document."},
            # The excerpt and the question are sent as two separate user messages.
            {"role": "user", "content": f"Document Excerpt: {content}"},
            {"role": "user", "content": f"Question: {question}"}
        ]
    )
    return response.choices[0].message.content


In [None]:
# Build the retrieval pipeline: PDF -> text chunks -> embeddings -> FAISS index.
pdf_path = "./book5.pdf"  # Replace with your PDF file
chunks = extract_and_chunk_pdf(pdf_path)
embeddings = embed_chunks(chunks)
index = create_faiss_index(embeddings)

# Use `openai_key` (read from the Colab secrets earlier in the notebook).
# `api_key` is only assigned in a later cell, so referencing it here raises
# NameError when the notebook is run top to bottom on a fresh kernel.
client = openai.OpenAI(api_key=openai_key)

In [22]:

# Retrieve the chunks most similar to the question, then ask the model about them.
query = "Did Jon Snow survive in the end?"
relevant_chunks = search_chunks(query, chunks, index, embeddings)

# Join the chunks into plain text. Interpolating the list itself would send its
# Python repr (brackets, quotes, escaped newlines) to the model, unlike the
# newline-joined excerpt that ask_ai_about_pdf() builds.
excerpt = "\n".join(relevant_chunks)

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are an AI that answers questions based on a provided document."},
        {"role": "user", "content": f"Document Excerpt: {excerpt}"},
        {"role": "user", "content": f"Question: {query}"}
    ]
)

In [23]:
# Show the message content of the first choice in the chat completion response.
response.choices[0].message.content

'The provided document excerpt does not explicitly state whether Jon Snow survives in the end. To answer this question, more context from the specific book or series would be needed. If this question pertains to his storyline in "A Song of Ice and Fire" or the TV series "Game of Thrones," additional information would be necessary to provide a comprehensive answer.'

In [34]:
# End-to-end example: retrieve the best-matching chunks, then let the helper
# query the model and print its answer.

api_key = openai_key  # key read from the Colab secrets earlier in the notebook
query = "Did she get married?"

relevant_chunks = search_chunks(query, chunks, index, embeddings)
answer = ask_ai_about_pdf(relevant_chunks, question=query, api_key=api_key)
print("Answer:", answer)

Answer: Yes, Daenerys (referred to as "Dany" in the excerpt) did get married. The text mentions her being a "woman wed" and having a "lord husband," which indicates that she is married, presumably to Hizdahr, as mentioned in the document.
