In [1]:
!pip install pymupdf openai==0.28

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, pymupdf, openai
Successfully installed PyMuPDFb-1.24.3 openai-0.28.0 pymupdf-1.24.5


In [2]:
##My approach

##First, we extract the text from a PDF, split it into overlapping chunks, and embed each chunk with OpenAI's embedding model. When a user asks a question, we embed the query and use cosine similarity to find the chunks that match it most closely. These chunks are then passed as context to OpenAI's GPT model, which generates the answer from that context. The session accepts questions continuously and saves all Q&A pairs to a JSON file on exit.

In [3]:
import os
import json
import numpy as np
import fitz
import openai
from typing import List, Dict, Tuple

def extract_text_from_pdf(file_path: str):
    """Return the text of every page of a PDF as one newline-free string.

    Pages are read in document order and their text is concatenated without a
    separator; every newline in the result is then replaced by a single space.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The extracted text as a single ``str`` containing no ``"\\n"``.
    """
    # The context manager closes the document (and its file handle) even when a
    # page fails to extract; previously the document was never closed.
    with fitz.open(file_path) as pdf_document:
        page_texts = [page.get_text() for page in pdf_document]
    # Join once instead of growing a string page by page.
    return "".join(page_texts).replace("\n", " ")

def chunk_text(text: str, chunk_size: int = 128, chunk_overlap: int = 20):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - chunk_overlap`` words apart, so each
    chunk repeats the last ``chunk_overlap`` words of the previous one.

    Args:
        text: Text to split. Any run of whitespace separates two words.
        chunk_size: Maximum number of words per chunk; must be positive.
        chunk_overlap: Number of words shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        # A non-positive step would either make range() fail or silently
        # produce no chunks at all.
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
            f"for chunk_size {chunk_size}"
        )

    words = text.split()
    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already reaches the last word; a further chunk would
            # only repeat words that are fully contained in this one.
            break
    return chunks

def get_embedding(text: str, api_key: str, model: str = "text-embedding-ada-002"):
    """Embed one piece of text with the OpenAI embeddings endpoint.

    Sets ``openai.api_key`` to ``api_key``, replaces newlines in ``text`` with
    spaces, and requests an embedding for that single input.

    Args:
        text: Text to embed.
        api_key: OpenAI API key assigned to ``openai.api_key`` before the call.
        model: Name of the embedding model to request.

    Returns:
        A NumPy array built from the first embedding in the response.
    """
    openai.api_key = api_key
    # Splitting on "\n" and re-joining with " " swaps every newline for a space.
    single_line = " ".join(text.split("\n"))
    result = openai.Embedding.create(input=[single_line], model=model)
    first_item = result["data"][0]
    return np.array(first_item["embedding"])

def index_document(chunks: List[str], api_key: str, model: str = "text-embedding-ada-002"):
    """Embed every chunk and stack the embeddings into one matrix.

    Args:
        chunks: Text chunks to embed, one ``get_embedding`` call per chunk.
        api_key: OpenAI API key forwarded to ``get_embedding``.
        model: Embedding model name forwarded to ``get_embedding``.

    Returns:
        A tuple ``(chunks, matrix)`` where ``chunks`` is the list that was
        passed in and row ``i`` of ``matrix`` is the embedding of ``chunks[i]``.
    """
    vectors = [get_embedding(piece, api_key, model) for piece in chunks]
    matrix = np.vstack(vectors)
    return chunks, matrix

def cosine_similarity(query_embedding: np.ndarray, embeddings_matrix: np.ndarray):
    """Compute the cosine similarity between a query and every row of a matrix.

    Args:
        query_embedding: 1-D query vector.
        embeddings_matrix: 2-D array holding one embedding per row, with as
            many columns as ``query_embedding`` has entries.

    Returns:
        A 1-D array with one similarity per row of ``embeddings_matrix``.
        A row with zero norm, or any row when the query has zero norm, gets a
        similarity of 0.0.
    """
    norms = np.linalg.norm(embeddings_matrix, axis=1) * np.linalg.norm(query_embedding)
    dots = np.dot(embeddings_matrix, query_embedding)
    # Dividing by a zero norm would yield NaN, and np.argsort places NaN last,
    # so a degenerate vector would be ranked as the best match. Those
    # entries are left at 0.0 instead.
    similarities = np.zeros(dots.shape, dtype=np.result_type(dots.dtype, norms.dtype))
    np.divide(dots, norms, out=similarities, where=norms > 0)
    return similarities

def get_top_k_chunks(query: str, indexed_chunks: List[str], embeddings_matrix: np.ndarray, api_key: str, model: str, k: int = 5):
    """Return the ``k`` chunks whose embeddings are most similar to the query.

    Args:
        query: Question text; embedded with ``get_embedding``.
        indexed_chunks: Chunks in the same order as the rows of
            ``embeddings_matrix``.
        embeddings_matrix: One embedding per row, aligned with
            ``indexed_chunks``.
        api_key: OpenAI API key forwarded to ``get_embedding``.
        model: Embedding model name forwarded to ``get_embedding``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks ordered from most to least similar. A non-positive
        ``k`` yields an empty list, and no embedding is requested.
    """
    if k <= 0:
        # The slice [-0:] selects the whole array, so k == 0 would
        # return every chunk instead of none.
        return []
    query_embedding = get_embedding(query, api_key, model)
    similarities = cosine_similarity(query_embedding, embeddings_matrix)
    # Ascending argsort reversed gives best-first order; keep the first k.
    top_k_indices = np.argsort(similarities)[::-1][:k]
    return [indexed_chunks[i] for i in top_k_indices]

def generate_answer(query: str, context: str, api_key: str, model: str = "gpt-3.5-turbo"):
    """Ask a chat model to answer ``query`` using only the supplied context.

    Sets ``openai.api_key`` to ``api_key`` and sends one user message that
    contains the context followed by the question, with temperature 0.0.

    Args:
        query: The user's question.
        context: Text placed in the prompt ahead of the question.
        api_key: OpenAI API key assigned to ``openai.api_key`` before the call.
        model: Name of the chat completion model to request.

    Returns:
        The ``"content"`` of the message in the first returned choice.
    """
    openai.api_key = api_key
    prompt = f"Context:\n{context}\n\nAnswer the question based on the above context:\n{query}"
    user_message = {"role": "user", "content": prompt}

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[user_message],
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

def doc_qa_from_pdf(file_path: str, api_key: str, embedding_model: str = "text-embedding-ada-002", completion_model: str = "gpt-3.5-turbo"):
    """Build the question-answering state for one PDF.

    Extracts the PDF text, chunks it with the default chunking parameters and
    embeds every chunk.

    Args:
        file_path: Path of the PDF to index.
        api_key: OpenAI API key used for embedding and stored in the result.
        embedding_model: Embedding model name used for indexing.
        completion_model: Chat model name stored for later answer generation.

    Returns:
        A dict with the keys ``indexed_chunks``, ``embeddings_matrix``,
        ``api_key``, ``embedding_model`` and ``completion_model``.
    """
    chunks = chunk_text(extract_text_from_pdf(file_path))
    indexed_chunks, embeddings_matrix = index_document(chunks, api_key, embedding_model)
    return {
        "indexed_chunks": indexed_chunks,
        "embeddings_matrix": embeddings_matrix,
        "api_key": api_key,
        "embedding_model": embedding_model,
        "completion_model": completion_model,
    }

def answer_question(docqa: Dict, question: str):
    """Answer ``question`` from the document indexed in ``docqa``.

    Retrieves the most similar chunks (using the default ``k`` of
    ``get_top_k_chunks``), joins them with newlines and passes them as context
    to ``generate_answer``.

    Args:
        docqa: State dict with the keys ``indexed_chunks``,
            ``embeddings_matrix``, ``api_key``, ``embedding_model`` and
            ``completion_model``.
        question: The user's question.

    Returns:
        Whatever ``generate_answer`` returns for the question and context.
    """
    api_key = docqa["api_key"]
    relevant_chunks = get_top_k_chunks(
        question,
        docqa["indexed_chunks"],
        docqa["embeddings_matrix"],
        api_key,
        docqa["embedding_model"],
    )
    return generate_answer(
        question,
        "\n".join(relevant_chunks),
        api_key,
        docqa["completion_model"],
    )

def main_loop(file_path: str, api_key: str, embedding_model: str, completion_model: str):
    """Run an interactive question-answering session over one PDF.

    Indexes the PDF, then repeatedly prompts for a question, prints the answer
    and records the pair. Typing ``exit`` (any letter case, surrounding
    whitespace ignored) ends the session. Blank input is ignored.

    The collected pairs are written to ``qa_pairs.json`` in the current working
    directory when the session ends, including when it ends because of an
    exception; the exception is re-raised after the file is written.

    Args:
        file_path: Path of the PDF to index.
        api_key: OpenAI API key used for embeddings and completions.
        embedding_model: Embedding model name.
        completion_model: Chat model name; also stored with every saved pair.
    """
    docqa = doc_qa_from_pdf(file_path, api_key, embedding_model, completion_model)
    qa_pairs = []

    try:
        while True:
            question = input("Please type your question (or 'exit' to finish): ").strip()
            if question.lower() == "exit":
                break
            if not question:
                # A blank line has nothing to embed; prompt again instead of
                # sending an empty string to the API.
                continue
            answer = answer_question(docqa, question)
            print(f"Answer: {answer}")
            qa_pairs.append({"question": question, "answer": answer, "model": completion_model})
    finally:
        # Saving in `finally` keeps the answers gathered so far when an API
        # error or an interrupt ends the session early.
        with open("qa_pairs.json", "w") as f:
            json.dump(qa_pairs, f, indent=4)
        print("All questions and answers have been saved to qa_pairs.json")


##Because the pipeline is modular, you can try other embedding or completion models by changing the model names below. Just make sure the API key you supply has access to them.

##To exit the QA session, type "exit" at the prompt.

##Finally, a JSON file is generated containing each question-answer pair and the name of the model that produced it.

# The key is read from the OPENAI_API_KEY environment variable so that it is
# never stored in the notebook. A missing variable raises KeyError here, before
# any PDF processing or API call is attempted.
main_loop(file_path="GenAI_Handbook.pdf",
          api_key=os.environ["OPENAI_API_KEY"],
          embedding_model="text-embedding-ada-002",
          completion_model="gpt-3.5-turbo")


Please type your question (or 'exit' to finish): What is prompt management? why do we need it?
Answer: Prompt management is the process of storing and testing multiple prompts in order to simplify the prompt evaluation process. It is essential because developers often need to evaluate and make changes to prompts multiple times during the development and testing phases. Without a prompt management process in place, developers may have to recreate past prompts, leading to increased development time and potential errors. By implementing a prompt management process, developers can streamline the testing and evaluation processes, allowing for easier comparison of multiple prompts and variations.
Please type your question (or 'exit' to finish): exit
All questions and answers have been saved to qa_pairs.json
