In [None]:
import openai
import chromadb
import os
import textract
import tiktoken
import math
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [None]:
# --- Models ---
# Chat model that answers the question; also selects the tokenizer used for chunking.
GPT_MODEL = "gpt-3.5-turbo"
# Embedding model applied to both the document chunks and the query.
EMBEDDING_MODEL = "text-embedding-ada-002"

# --- Input ---
# Book to index. A relative path resolves against the kernel's working directory.
BOOK_FILE = "Downloads/quantum.html"

# --- Retrieval ---
# Number of tokens per chunk.
CHUNK_SIZE = 400
# Number of chunks retrieved for the query and handed to the chat model.
NUM_DOCUMENTS = 4

In [None]:
# The question to answer from the book's contents.
QUERY = (
    "What are the philosophical implications of quantum mechanics?"
)

In [None]:
# Read the OpenAI API key from the environment so the secret never lives in the notebook.
# Export OPENAI_API_KEY before starting the kernel; the value is None when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# 1) read the text from the file
# textract hands back the extracted content as bytes; decode() (UTF-8 by default)
# turns it into the str the tokenizer works on.
raw_bytes = textract.process(BOOK_FILE)
text = raw_bytes.decode()

In [None]:
# 2) split the text into tokens
# Use the tokenizer associated with GPT_MODEL so chunk sizes are measured in
# the same tokens the chat model counts.
encoding = tiktoken.encoding_for_model(GPT_MODEL)
# By default tiktoken raises ValueError when the input contains the text of a
# special token (e.g. "<|endoftext|>"). The book is plain content, not a prompt,
# so disallowed_special=() makes such strings encode as ordinary text.
tokens = encoding.encode(text, disallowed_special=())

In [None]:
# 3) split the tokens into chunks
# Ceiling division: the final chunk holds the remainder and may be shorter than CHUNK_SIZE.
num_chunks = math.ceil(len(tokens) / CHUNK_SIZE)
chunk_starts = [index * CHUNK_SIZE for index in range(num_chunks)]
# Slicing clamps at the end of the list, so the last chunk needs no explicit upper bound.
token_chunks = [tokens[start:start + CHUNK_SIZE] for start in chunk_starts]

In [None]:
# 4) convert the token chunks into documents
# Each chunk of token ids is decoded back into a text document.
# NOTE(review): chunks are cut on token boundaries, which presumably can split a
# multi-byte character between two documents — confirm how decode() renders that.
documents = list(map(encoding.decode, token_chunks))

In [None]:
# 5) transform the documents into embeddings
# The embeddings endpoint accepts a list of inputs, so the documents are sent in
# batches instead of paying one HTTP round-trip per chunk.
# The batch size is deliberately small so a request stays modest even if
# CHUNK_SIZE is raised; tune it as needed.
EMBEDDING_BATCH_SIZE = 32
embeddings = []
for batch_start in range(0, len(documents), EMBEDDING_BATCH_SIZE):
    document_batch = documents[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    batch_response = openai.Embedding.create(input=document_batch, model=EMBEDDING_MODEL)
    # Every returned item carries the "index" of its input within the request;
    # sorting on it keeps embeddings[i] aligned with documents[i].
    ordered_items = sorted(batch_response["data"], key=lambda item: item["index"])
    embeddings.extend(item["embedding"] for item in ordered_items)

In [None]:
# 6) create a new document for each embedded text
# A Chroma client and a collection that has the OpenAI embedding function attached.
# NOTE(review): create_collection may raise if this cell is re-run against a client
# that already holds 'collection' — confirm for the installed chromadb version.
chroma_client = chromadb.Client()
embedding_function = OpenAIEmbeddingFunction(openai.api_key, EMBEDDING_MODEL)
collection = chroma_client.create_collection(
    name='collection',
    embedding_function=embedding_function,
)

# Ids are the stringified chunk positions: "0", "1", ...
document_ids = list(map(str, range(len(documents))))
# The precomputed embeddings are stored alongside the chunk texts.
collection.add(
    ids=document_ids,
    embeddings=embeddings,
    documents=documents,
)

In [None]:
# 7) convert the query into embedding
# The query is embedded with the same model as the documents.
query_embedding_response = openai.Embedding.create(model=EMBEDDING_MODEL, input=QUERY)
# A single input was sent, so the vector is read from the first entry of "data".
first_embedding_item = query_embedding_response["data"][0]
query_embedding = first_embedding_item["embedding"]

In [None]:
# 8) search the collection for the most similar embeddings
# A short book can yield fewer chunks than NUM_DOCUMENTS, and some chromadb
# releases raise when n_results exceeds the number of stored items, so the
# request is clamped to the collection size (never below 1).
n_results = max(1, min(NUM_DOCUMENTS, collection.count()))
# query_embeddings takes a list of embeddings; the single query vector is wrapped
# so the result holds exactly one list of hits per field.
query_result = collection.query(query_embeddings=[query_embedding], n_results=n_results)

In [None]:
# 9) return the most similar documents in text form
# "documents" holds one list of texts per query; take the first list.
documents_per_query = query_result["documents"]
most_similar_documents = documents_per_query[0]

In [None]:
# 10) give ChatGPT the most similar documents and the query
# The retrieved chunks are joined with newlines and embedded in the system prompt
# between triple backticks.
documents_context = "\n".join(most_similar_documents)
system_message = f'''
You have been given pieces of text from a larger file named {BOOK_FILE}.
You will use the following information from the contents of the file to answer the user's question. The information is contained within triple backticks.
```{documents_context}```
'''
user_message = f"Here is my question: {QUERY}"

# System prompt first (the context), then the user's question.
chat_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

response = openai.ChatCompletion.create(
    model=GPT_MODEL,
    messages=chat_messages,
    temperature=0,
)

In [None]:
# Pull the assistant's reply out of the first completion choice and show it.
first_choice = response.choices[0]
answer = first_choice["message"]["content"]
print(answer)