# Question-Answering Model Using OpenAI Embeddings

## Dependencies

In [1]:
%pip install -q openai
%pip install -q pypdf
%pip install -q tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.1/248.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import openai
import numpy as np
import pandas as pd
import pickle
import tiktoken
from pypdf import PdfReader
import re

In [3]:
from openai_api_key import API_KEY

openai.api_key = API_KEY


## Convert PDF to DataFrame
Takes a pdf document file and converts it to a DataFrame that Open.ai embedding model can process. 

In [4]:
def pdf_preprocessing(file_path):
    
    reader = PdfReader(file_path)

    text = ""
    for page in reader.pages:
        text += page.extract_text()
    
    pattern = re.compile(r'\n')
    text = pattern.sub("", text)
    
    subtexts = dict()

    i = 0
    index = 0
    if len(text) <= i+500:
        subtexts = {index: text}
    else:
        while len(text[i:]) != 0:
            if len(text[i:]) > 500:
                subtext = str(text[i:i+500])
                subtexts[index] = subtext
                i = i + 250
                index += 1
            else:
                subtext = str(text[i:])
                subtexts[index] = subtext
                break
    
    df = pd.DataFrame([(k, val) for k, val in subtexts.items()], columns=['index', 'content'])

    return df

### An Example Document about Construction Contracting Methods

In [5]:
file_path = '/content/Contracting Method.pdf'

df = pdf_preprocessing(file_path)

In [6]:
df.head()

Unnamed: 0,index,content
0,0,A2F05: Committee on Construction Managemen...
1,1,ge industry —particularly highway construction...
2,2,ave occurred in highway construction. The atm...
3,3,ON PRACTICES The highway sector is perhaps t...
4,4,. The traditional approach to contracti ng for...


In [7]:
len(df)

79

## Get Embedding of the Document
* "text-embedding-ada-002" is the embedding model that converts each DataFrame row into an embedding. 
* "text-davinci-003" is the completion model that will generate answers based on the the best matches (cosine similarities) between a question and context. 

In [8]:
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [9]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result["data"][0]["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[list[str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }

In [10]:
document_embeddings = compute_doc_embeddings(df)

In [11]:
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

0 : [0.012909317389130592, 0.0041723293252289295, -0.008508413098752499, -0.010180074721574783, 0.0017373334849253297]... (1536 entries)


## Comparing Similarities between Document and Query

In [12]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

### Example of Document Similarities by Ranks (Highest 5)

In [13]:
order_document_sections_by_query_similarity("What is highly innovative approach to contractor prequalification?", document_embeddings)[:5]


[(0.8675996236852721, 62),
 (0.8460088441999218, 61),
 (0.839840447955439, 59),
 (0.8380562998163616, 60),
 (0.8259830387346908, 72)]

### Constructing Prompt with Sections with Highest Similarities

In [14]:
MAX_SECTION_LEN = 1050
SEPARATOR = "\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [15]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        print(document_section)

        chosen_sections_len += len(document_section.content) + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

### Prompt Example

In [16]:
prompt = construct_prompt(
    "What is highly innovative approach to contractor prequalification?",
    document_embeddings,
    df
)

print("===\n", prompt)

index                                                     62
content    ally been discussed and not enforced. The emph...
Name: 62, dtype: object
index                                                     61
content    tractors must comply for prequalification, ess...
Name: 61, dtype: object
index                                                     59
content    ors. Variations on design–build, such as desig...
Name: 59, dtype: object
Selected 2 document sections:
62
61
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* ally been discussed and not enforced. The emphasis on quality performance, however, is now becoming a major factor in evaluation. The Ontario Ministry of Transportation has begun using a highly innovative approach to contractor prequalification to improve the quality of performance and reduce infractions on its projects. This approach involves evaluating

### Use OpenAI's Completion Model to Get the Answer

In [17]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

In [18]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return print("===\nANSWER: " + response["choices"][0]["text"].strip(" \n"))

### Let's Get the Answers!

In [19]:
answer_query_with_context("What is highly innovative approach to contractor prequalification?", df, document_embeddings)

index                                                     62
content    ally been discussed and not enforced. The emph...
Name: 62, dtype: object
index                                                     61
content    tractors must comply for prequalification, ess...
Name: 61, dtype: object
index                                                     59
content    ors. Variations on design–build, such as desig...
Name: 59, dtype: object
Selected 2 document sections:
62
61
===
ANSWER: The Ontario Ministry of Transportation has begun using a highly innovative approach to contractor prequalification to improve the quality of performance and reduce infractions on its projects. This approach involves evaluating the contractor in four areas: quality, safety, timeliness, and contract execution. Each area is given a different weight in the determination of prequalification.


In [None]:
answer_query_with_context("What are the future challenges?", df, document_embeddings)

In [None]:
answer_query_with_context("Why is it so difficult to find qualified contractors?", df, document_embeddings)