# Azure OpenAI Assistants - File Search

## Load Azure Configuration

In [9]:
from dotenv import load_dotenv
import os

azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_openai_api_version = "2024-10-01-preview"

## Prepare Files

In [23]:
from openai import AzureOpenAI

# Create a client
client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key
)

# Create a vector store
vector_store = client.beta.vector_stores.create(name="Nasa Books")

# Specify the folder containing the files
folder_path = "../Data/nasabooks/"

# Get all file paths in the folder
file_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path)]

# Open file streams
file_streams = [open(path, "rb") for path in file_paths]

# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
    vector_store_id=vector_store.id, files=file_streams
)

In [27]:
# You can print the status and the file counts of the batch to see the result of this operation.
print(file_batch.status)
print(file_batch.file_counts)
print(vector_store.id)


completed
FileCounts(cancelled=0, completed=83, failed=1, in_progress=0, total=84)
vs_50clGqmf9ophWAj9JEBKtlV3


## Step 1-2:
1. Create an Assistant
2. Create a Thread

In [28]:
# Step 1: Create assistant
assistant = client.beta.assistants.create(
  name="Nasa books Assistant",
  instructions="""
  You are a assistant that provides information. 
   You will answer questions based on files provided to you about information in a NASA Book. 
   You will not provide answers outside of those files.
  """,
  model=azure_openai_deployment,
  tools=[{"type":"file_search"}],
  tool_resources={"file_search":{"vector_store_ids":[vector_store.id]}},
  temperature=1,
  top_p=1
)

# Step 2: Create thread
thread = client.beta.threads.create()
print(thread)

Thread(id='thread_rlvFiaGDEvyzlPU8zQG8LqzQ', created_at=1734252361, metadata={}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=None))


## Step 3-6: 
3. Add a message to the thread
4. Run the Assistant
5. Check the Run Status
6. Display the Assistant's Response

In [29]:
import time

user_question ="""What can I see in the United States?"""

# Step 3: Add a message to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=user_question
)

# Step 4: Run the Assistant
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Step 5: Check the Run Status
# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

  if run.status == 'completed':
    messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  
  elif run.status == 'requires_action':
    pass
  
  else:
    print(run.status)

# Step 6: Display the Assistant's Response
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)

in_progress
in_progress
in_progress
in_progress
in_progress
In the United States, various remarkable attractions and natural wonders can be explored:

1. **Grand Canyon**: One of the deepest canyons in the United States, the Grand Canyon is carved by the Colorado River and showcases over 2 billion years of geological history. John Wesley Powell, who led an expedition through the canyon, remarked on its indescribable beauty【4:1†source】.

2. **Green River**: The Bowknot Bend area of the Green River in eastern Utah is known for its unique river loop that carries rafters around a significant bend, showcasing natural beauty and geological phenomena【4:5†source】.

3. **Diverse Landscapes in Oregon**: Within a short drive in Oregon, visitors can experience a beach, a temperate rainforest, a mountain glacier, and a high desert, highlighting the state's varied ecosystems and climates【4:3†source】.

4. **Great Lakes**: The Great Lakes can experience extended ice seasons, as seen in 2014 when ice r

In [58]:
message_content = messages.data[0].content[0]
annotations = message_content.text.annotations
value = message_content.text.value
print(value)
print(annotations)

In the United States, various remarkable attractions and natural wonders can be explored:

1. **Grand Canyon**: One of the deepest canyons in the United States, the Grand Canyon is carved by the Colorado River and showcases over 2 billion years of geological history. John Wesley Powell, who led an expedition through the canyon, remarked on its indescribable beauty【4:1†source】.

2. **Green River**: The Bowknot Bend area of the Green River in eastern Utah is known for its unique river loop that carries rafters around a significant bend, showcasing natural beauty and geological phenomena【4:5†source】.

3. **Diverse Landscapes in Oregon**: Within a short drive in Oregon, visitors can experience a beach, a temperate rainforest, a mountain glacier, and a high desert, highlighting the state's varied ecosystems and climates【4:3†source】.

4. **Great Lakes**: The Great Lakes can experience extended ice seasons, as seen in 2014 when ice remained for nearly seven months, even as temperatures rose【4

In [73]:
# Extract the message content
message_content = messages.data[0].content[0]
annotations = message_content.text.annotations
citations = []

# Iterate over the annotations and add footnotes
for index, annotation in enumerate(annotations):
    # Replace the text with a footnote
    message_content.text.value = message_content.text.value.replace(annotation.text, f' [{index}]')

    # Gather citations based on annotation attributes
    if (file_citation := getattr(annotation, 'file_citation', None)):
        print(file_citation)
        cited_file = client.files.retrieve(file_citation.file_id)
        print(cited_file)
        #citations.append(f'[{index}] {file_citation.value} from {cited_file.filename}')
    elif (file_path := getattr(annotation, 'file_path', None)):
        cited_file = client.files.retrieve(file_path.file_id)
        citations.append(f'[{index}] Click <here> to download {cited_file.filename}')
        # Note: File download functionality not implemented above for brevity

# Add footnotes to the end of the message before displaying to user
message_content.text.value += '\n' + '\n'.join(citations)



FileCitation(file_id='assistant-aA9YlF7wv0fqVF88fcsmYQJI')
FileObject(id='assistant-aA9YlF7wv0fqVF88fcsmYQJI', bytes=318971, created_at=1734252016, filename='page-95.pdf', object='file', purpose='assistants', status='processed', status_details=None)
FileCitation(file_id='assistant-McQ6nBynbWCbbGHDrZUFoqEn')
FileObject(id='assistant-McQ6nBynbWCbbGHDrZUFoqEn', bytes=35462, created_at=1734252020, filename='page-169.pdf', object='file', purpose='assistants', status='processed', status_details=None)
FileCitation(file_id='assistant-Vq0VTWDLq7zSGIaBnWZJJOLW')
FileObject(id='assistant-Vq0VTWDLq7zSGIaBnWZJJOLW', bytes=318127, created_at=1734252024, filename='page-103.pdf', object='file', purpose='assistants', status='processed', status_details=None)
FileCitation(file_id='assistant-McQ6nBynbWCbbGHDrZUFoqEn')
FileObject(id='assistant-McQ6nBynbWCbbGHDrZUFoqEn', bytes=35462, created_at=1734252020, filename='page-169.pdf', object='file', purpose='assistants', status='processed', status_details=None)

In [64]:
cited_file = client.files.retrieve("assistant-Vq0VTWDLq7zSGIaBnWZJJOLW")
print(cited_file)

FileObject(id='assistant-Vq0VTWDLq7zSGIaBnWZJJOLW', bytes=318127, created_at=1734252024, filename='page-103.pdf', object='file', purpose='assistants', status='processed', status_details=None)


## Delete Assistant

In [18]:
response = client.beta.assistants.delete(assistant.id)