# Imports and init

In [3]:
import openai
import numpy as np
from getpass import getpass
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import yaml

In [None]:

# Download the NLTK Punkt tokenizer
nltk.download('punkt')


In [4]:
with open('/Users/dstone/.config/autogpt/credentials.yml', 'r') as f:
    yml = yaml.safe_load(f)

In [5]:

# Set up the OpenAI API client
openai.api_key = yml['openai']['api_key']

## Defintions

In [29]:
# def get_embeddings(prompt, model="text-embedding-ada-002"):
#     completions = openai.Completion.create(
#         engine=model,
#         prompt=prompt,
#         n=1,
#         max_tokens=3,
#         temperature=0,
#     )

#     # Extract the embeddings
#     embeddings = completions["choices"][0]["metadata"]["model"]["embedding"]

#     return np.array(embeddings)

def get_embedding(text, model="text-embedding-ada-002") -> np.array:
   text = text.replace("\n", " ")
   emb = openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
   return np.asarray(emb)


In [21]:
def build_semantic_index(paragraphs):
    semantic_index = []

    for paragraph in paragraphs:
        # Obtain the paragraph embeddings
        embedding = get_embedding(paragraph)

        # Add the paragraph and its embedding to the index
        semantic_index.append((paragraph, embedding))

    return semantic_index

In [None]:
def find_relevant_paragraphs(question, semantic_index, top_n=3):
    question_embedding = get_embedding(question)

    # Calculate the cosine similarity between the question and paragraph embeddings
    similarities = [cosine_similarity(question_embedding.reshape(1, -1), emb.reshape(1, -1)) for _, emb in semantic_index]

    # Get the indices of the top_n most relevant paragraphs
    top_indices = np.argsort(similarities, axis=0)[::-1][:top_n].flatten()

    # Return the most relevant paragraphs
    return [semantic_index[i][0] for i in top_indices]

In [None]:
model_engine = "gpt-3.5-turbo"

In [94]:
def ask_with_context(question, relevant_paragraphs, model="gpt-3.5-turbo", tokens_limit=200, previous_questions: list=[], previous_responses: list=[]):
    context = "\n".join(relevant_paragraphs)
    prompt = f"\n\nQ: {question}\nA: "

    # TODO: add in previous context
    # example (from https://platform.openai.com/docs/guides/chat/introduction)
    # messages=[
    #     {"role": "system", "content": "You are a helpful assistant."},
    #     {"role": "user", "content": "Who won the world series in 2020?"},
    #     {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    #     {"role": "user", "content": "Where was it played?"}
    # ]
    # you can fill in the assistant with previous questions and answers to continue the conversation
    # messages = []
    # if len(previous_responses) > 0:
    #     pass
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{'role': 'system', 'content': context},
                  {'role': 'user', 'content': prompt}],
        max_tokens=tokens_limit,
        temperature=0.5, # make as deterministic as possible; I just want a number
    )

    answer = response['choices'][0]['message']['content']
    return answer

In [68]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""

        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()

    return text

# Load texts and embed

## Load text from PDF

In [55]:
import PyPDF2
from glob import glob
from pathlib import Path

In [69]:
pdf_directory = '/Users/dstone/Dropbox/papers/space/terraforming/'

pdf_texts = []
for file_name in glob(f'{pdf_directory.rstrip("/")}/*.pdf'):
    pdf_path = Path(file_name).absolute()
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_texts.append(pdf_text)


In [71]:
all_paragraphs = []
for pdf_text in pdf_texts:
    paragraphs = nltk.tokenize.sent_tokenize(pdf_text)
    all_paragraphs.extend(paragraphs)

## Embed

In [76]:
# takes time-- calling OpenAI API for every tokenized sentence in texts
semantic_index = build_semantic_index(all_paragraphs)

In [79]:
question = "What can you summarize about the role of Nitrogen in terraforming Mars?"
rel_paragraphs = find_relevant_paragraphs(question=question, semantic_index=semantic_index, top_n=20)

In [None]:
response = ask_with_context(question=question, relevant_paragraphs=rel_paragraphs)

In [92]:
print('\n'.join(nltk.tokenize.sent_tokenize(response)))

Nitrogen is a key factor in the feasibility of terraforming Mars for human habitability.
The amount of nitrogen needed to create a breathable atmosphere on Mars is very large, and it is unlikely that the current atmospheric nitrogen is enough.
It is possible that nitrogen is tied up as nitrate in the regolith and subsurface, but more research is needed to assess the amount and location of nitrates on Mars.
Without enough nitrogen, it is not within near-term capabilities of humans to bring it to Mars.
However, nitrogen is unlikely to be limiting for a plant-dominated biosphere.


# Scratch

In [50]:
question_embedding = get_embedding(question)

# Calculate the cosine similarity between the question and paragraph embeddings
similarities = [cosine_similarity(question_embedding.reshape(1, -1), emb.reshape(1, -1)) for _, emb in semantic_index]

# Get the indices of the top_n most relevant paragraphs
top_indices = np.argsort(similarities, axis=0)[::-1][:3].flatten()

In [49]:
cosine_similarity(question_embedding.reshape(1, -1), semantic_index[0][1].reshape(1, -1))

array([[0.77780824]])

In [39]:
similarities[0][0]

array([-1.,  1.,  1., ..., -1., -1., -1.])

In [25]:
question = "Your question here"

# Find the most relevant paragraphs for the question
relevant_paragraphs = find_relevant_paragraphs(question, semantic_index)

# Ask the question with the relevant context
answer = ask_with_context(question, relevant_paragraphs)
print(f"Q: {question}\nA: {answer}")


AttributeError: 'list' object has no attribute 'reshape'

In [8]:
paragraphs

['\nYour large text corpus goes here.']