### Setup

In [None]:
!pip install PyPDF2 pandas tqdm -q

In [None]:
!pip install openai==1.66.3

In [None]:
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import concurrent
import PyPDF2
import os
import pandas as pd
import base64

In [None]:
from google.colab import userdata


In [None]:
# OpenAI client; the API key is read through google.colab's `userdata` under
# the name 'openai'.
client = OpenAI(api_key=userdata.get('openai'))

# Local directory that holds the PDFs used throughout the notebook.
dir_pdfs = 'openai_blog_pdfs'

# os.listdir returns every directory entry in arbitrary order, including
# sub-directories (e.g. .ipynb_checkpoints) and non-PDF files. Keep only
# regular *.pdf files and sort them so that pdf_files[0] points at the same
# document on every run.
pdf_files = sorted(
    os.path.join(dir_pdfs, f)
    for f in os.listdir(dir_pdfs)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(dir_pdfs, f))
)

In [None]:
print(pdf_files)

['openai_blog_pdfs/The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf', 'openai_blog_pdfs/Introducing deep research _ OpenAI.pdf', 'openai_blog_pdfs/New tools for building agents _ OpenAI.pdf', 'openai_blog_pdfs/OpenAI GPT-4.5 System Card _ OpenAI.pdf', 'openai_blog_pdfs/Introducing Operator _ OpenAI.pdf']


### Creating Vector Store with our PDFs

- Create a Vector Store on OpenAI's servers
- Upload files to the vector store

In [None]:
def create_vector_store(store_name: str) -> dict:
    """Create a vector store named ``store_name`` and return a summary of it.

    The summary holds the keys ``id``, ``name``, ``created_at`` and
    ``file_count`` (the store's ``file_counts.completed`` value) and is also
    printed. If anything raises along the way, the error is printed and an
    empty dict is returned instead.
    """
    try:
        store = client.vector_stores.create(name=store_name)
        summary = dict(
            id=store.id,
            name=store.name,
            created_at=store.created_at,
            file_count=store.file_counts.completed,
        )
        print("Vector store created:", summary)
    except Exception as e:
        print(f"Error creating vector store: {e}")
        summary = {}
    return summary

In [None]:
store_name = "openai_blog_store"
vector_store_details = create_vector_store(store_name)

Vector store created: {'id': 'vs_67d5f672b158819189c250187c7b5eb2', 'name': 'openai_blog_store', 'created_at': 1742075506, 'file_count': 0}


In [None]:
def upload_single_pdf(file_path: str, vector_store_id: str):
    """Upload one local file to OpenAI and attach it to a vector store.

    Args:
        file_path: Path of the file to upload.
        vector_store_id: ID of the vector store the uploaded file is attached to.

    Returns:
        ``{"file": <basename>, "status": "success"}`` when both the upload and
        the attach call succeed, otherwise
        ``{"file": <basename>, "status": "failed", "error": <message>}``.
        Failures are printed and reported through the return value instead of
        being raised, so a caller processing many files can keep going.
    """
    file_name = os.path.basename(file_path)
    try:
        # The context manager closes the handle as soon as the upload call
        # returns (or raises); a bare open() would leak one descriptor per file.
        with open(file_path, 'rb') as pdf_handle:
            file_response = client.files.create(file=pdf_handle, purpose="assistants")
        client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file": file_name, "status": "success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file": file_name, "status": "failed", "error": str(e)}

In [None]:
def upload_pdf_files_to_vector_store(vector_store_id: str):
    """Upload every PDF found in ``dir_pdfs`` to the given vector store.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    uploaded; sub-directories and other files in ``dir_pdfs`` are ignored.
    Uploads run in a thread pool of up to 10 workers via ``upload_single_pdf``.

    Args:
        vector_store_id: ID of the vector store that receives the files.

    Returns:
        dict with ``total_files``, ``successful_uploads``, ``failed_uploads``
        and ``errors`` (the result dicts of the uploads that failed).
    """
    # os.listdir also yields directories (e.g. .ipynb_checkpoints) and stray
    # non-PDF files; filter them out and sort for a deterministic order.
    pdf_files = sorted(
        os.path.join(dir_pdfs, entry)
        for entry in os.listdir(dir_pdfs)
        if entry.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(dir_pdfs, entry))
    )
    stats = {"total_files": len(pdf_files), "successful_uploads": 0, "failed_uploads": 0, "errors": []}

    print(f"{len(pdf_files)} PDF files to process. Uploading in parallel...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(upload_single_pdf, file_path, vector_store_id): file_path for file_path in pdf_files}
        # as_completed yields each future as soon as its upload finishes, so
        # the progress bar advances in completion order.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(pdf_files)):
            result = future.result()
            if result["status"] == "success":
                stats["successful_uploads"] += 1
            else:
                stats["failed_uploads"] += 1
                stats["errors"].append(result)

    return stats

In [None]:
# upload all files to the vector-store
upload_pdf_files_to_vector_store(vector_store_details["id"])

5 PDF files to process. Uploading in parallel...


100%|██████████| 5/5 [00:01<00:00,  2.81it/s]


{'total_files': 5, 'successful_uploads': 5, 'failed_uploads': 0, 'errors': []}

### Standalone vector search

Now that our vector store is ready, we can query it directly and retrieve the content most relevant to a specific query. With the new vector search API, we can find relevant items in our knowledge base without having to route the query through an LLM.

In [None]:
query = "What's Deep Research?"

search_results = client.vector_stores.search(
    vector_store_id=vector_store_details['id'],
    query=query
)

In [None]:
# One line per search hit: length of the first content chunk, source file name
# and the score reported for the hit.
for result in search_results.data:
    print(f"{len(result.content[0].text)} of character of content from {result.filename} with a relevant score of {result.score}")

3484 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.9770935017954946
3516 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.9590194714345182
3260 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.9427399909585098
3620 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.9249127384831187
3332 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.890895622820891
3474 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.8835218476424277
3376 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.8135896906162389
2772 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.7170967693481712
3183 of character of content from Introducing deep research _ Ope

In [None]:
print(search_results.data[0].content[0].text)

Introducing deep research | OpenAI


February 2, 2025 Release

Introducing deep research
An agent that uses reasoning to synthesize large amounts of
online information and complete multi-step research tasks
for you. Available to Pro users today, Plus and Team next.

Try on ChatGPT

Listen to article 8:56 Share

3/15/25, 2:25 PM Introducing deep research | OpenAI

https://openai.com/index/introducing-deep-research/ 1/38

https://openai.com/research/index/release/
https://chatgpt.com/
https://openai.com/


Today we’re launching deep research in ChatGPT, a new agentic capability that conducts
multi-step research on the internet for complex tasks. It accomplishes in tens of minutes
what would take a human many hours.

Deep research is OpenAI's next agent that can do work for you independently—you give it
a prompt, and ChatGPT will find, analyze, and synthesize hundreds of online sources to
create a comprehensive report at the level of a research analyst. Powered by a version of
the upcomin

In [None]:
query = "What did the court say about Elon's case?"

search_results = client.vector_stores.search(
    vector_store_id=vector_store_details['id'],
    query=query
)

In [None]:
# One line per search hit: length of the first content chunk, source file name
# and the score reported for the hit.
for result in search_results.data:
    print(f"{len(result.content[0].text)} of character of content from {result.filename} with a relevant score of {result.score}")

3065 of character of content from The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf with a relevant score of 0.9919459146628226
2340 of character of content from The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf with a relevant score of 0.9354843499078509
2624 of character of content from The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf with a relevant score of 0.8562886886419987
2919 of character of content from OpenAI GPT-4.5 System Card _ OpenAI.pdf with a relevant score of 0.018677348264400928
3183 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.004278511378690768
2739 of character of content from Introducing deep research _ OpenAI.pdf with a relevant score of 0.004035565093116385
2673 of character of content from Introducing Operator _ OpenAI.pdf with a relevant score of 0.003124790134669153
2601 of character of content from Introducing deep research _ OpenAI.pdf wi

### Integrating search results with LLM in a single API call

In [None]:
query = "What's Deep Research?"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)


In [None]:
# Extract annotations from the response
annotations = response.output[1].content[0].annotations

In [None]:
annotations

[AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=512, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=911, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=1080, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=1080, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=1248, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=1248, type='file_citation', filename='Introducing deep research _ OpenAI.pdf'),
 AnnotationFileCitation(file_id='file-F6LDdDbrP3ydpS12qYtNZM', index=1431, type='file_citation', filename='I

In [None]:
# Get top-k retrieved filenames
retrieved_files = set([result.filename for result in annotations])

print(f'Files used: {retrieved_files}')
print('Response:')
print(response.output[1].content[0].text) # 0 being the filesearch call

Files used: {'Introducing deep research _ OpenAI.pdf'}
Response:
Deep Research is a newly introduced feature by OpenAI that enables users to conduct complex, multi-step research tasks on the internet through ChatGPT. It synthesizes information from various online sources to create comprehensive reports, effectively operating at the level of a research analyst. This capability is especially targeted toward professionals engaged in intensive knowledge work, such as those in finance, science, policy, and engineering, as well as consumers seeking personalized recommendations.

The core functionalities of Deep Research include:

1. **Multi-Step Reasoning**: It can find, analyze, and synthesize large amounts of data quickly, taking significantly less time than a human would require.
2. **Documented Outputs**: Every report generated includes citations and a summary of the reasoning behind the conclusions, making it easy for users to verify and reference the information.
3. **Customization and

In [None]:
def get_response_from_vectorsotre(query: str):
    """Answer ``query`` with gpt-4o-mini plus file_search and print the result.

    Sends the query to the Responses API with the file_search tool pointed at
    ``vector_store_details['id']``, then prints the set of file names cited by
    the answer followed by the answer text.

    Args:
        query: The user question.

    Returns:
        None. The result is only printed.
    """
    response = client.responses.create(
        input=query,
        model="gpt-4o-mini",
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_details['id']],
        }]
    )

    # Do not assume the assistant message sits at output[1]: the model may emit
    # several file_search calls before answering. Select message items, and the
    # text parts inside them, by their `type` instead of by position.
    text_parts = [
        part
        for item in response.output
        if getattr(item, "type", None) == "message"
        for part in (getattr(item, "content", None) or [])
        if getattr(part, "type", None) == "output_text"
    ]

    # Distinct files cited by the answer; annotations that carry no filename
    # are skipped.
    retrieved_files = {
        annotation.filename
        for part in text_parts
        for annotation in (getattr(part, "annotations", None) or [])
        if getattr(annotation, "filename", None)
    }

    print(f'Files used: {retrieved_files}')
    print('Response:')
    print("\n".join(part.text for part in text_parts))


In [None]:
query="What is Deep Researc and What was the court's verdict on Elon's case?"
get_response_from_vectorsotre(query=query)

Files used: {'The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf', 'Introducing deep research _ OpenAI.pdf'}
Response:
### Deep Research
Deep Research is a capability launched by OpenAI that enables users to conduct multi-step research tasks by synthesizing large amounts of information from the web. It is designed to perform complex inquiries much faster than a human could, effectively acting like a research analyst. The system uses advanced reasoning to gather, analyze, and represent data in a comprehensive report, including citations for easy reference. It particularly excels in industries such as finance, science, and policy, providing detailed outputs that are fully documented.

### Court's Verdict on Elon Musk's Case
In the case involving Elon Musk and OpenAI, the court rejected Musk's request for a preliminary injunction, finding that he had not demonstrated a likelihood of success on the merits of his claims. Furthermore, the court dismissed several of Musk

## Evaluating performance

### Generating questions

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages whose extraction yields nothing are skipped. If opening or parsing
    fails, the error is printed and whatever text was collected up to that
    point (possibly the empty string) is returned.
    """
    page_texts = []
    try:
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                # extract_text() may give None or ""; both contribute nothing.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

def generate_questions(pdf_path):
    """Ask gpt-4o for one question that only the PDF at ``pdf_path`` can answer.

    The document text is obtained with ``extract_text_from_pdf`` and embedded
    in the prompt.

    Args:
        pdf_path: Path of the PDF the question should be about.

    Returns:
        The question text returned by the model, stripped of surrounding
        whitespace.
    """
    text = extract_text_from_pdf(pdf_path)

    # Ask explicitly for the bare question: without this the model sometimes
    # prepends chat filler ("Certainly! Here's a question ..."), which would
    # then be used verbatim as the evaluation query.
    prompt = (
        "Can you generate a question that can only be answered from this document? "
        "Reply with the question text only, without any preamble or explanation.\n"
        f"{text}\n\n"
    )

    response = client.responses.create(
        input=prompt,
        model="gpt-4o",
    )

    # No tools are passed, so the first output item is read as the message.
    question = response.output[0].content[0].text

    return question.strip()

In [None]:
generate_questions(pdf_files[0])

"What was the outcome of Elon Musk's request for a preliminary injunction against OpenAI, as mentioned in the document?"

In [None]:
# Generate questions for each PDF and store in a dictionary
questions_dict = {}
for pdf_path in pdf_files:
    questions = generate_questions(pdf_path)
    questions_dict[os.path.basename(pdf_path)] = questions

In [None]:
questions_dict

{'The court rejects Elon’s latest attempt to slow OpenAI down _ OpenAI.pdf': "What was the court's decision regarding Elon Musk's request for a preliminary injunction against OpenAI, as mentioned in the document?",
 'Introducing deep research _ OpenAI.pdf': "What percentage accuracy did the model powering Deep Research achieve on Humanity's Last Exam, and how does this compare to other models mentioned?",
 'New tools for building agents _ OpenAI.pdf': 'What is the purpose of the new Responses API introduced by OpenAI, and how does it differ from the previous Chat Completions and Assistants APIs?',
 'OpenAI GPT-4.5 System Card _ OpenAI.pdf': 'What is the preparedness score for cybersecurity mentioned in the GPT-4.5 system card?',
 'Introducing Operator _ OpenAI.pdf': "Certainly! Here's a question that can only be answered using the document provided:\n\nWhat is the name of the new model that powers the Operator and combines vision capabilities with advanced reasoning?"}

In [None]:
# One evaluation row per document: the generated question plus the source file
# name without its ".pdf" extension (process_query appends the extension back).
rows = []
for filename, query in questions_dict.items():
    # Strip only a trailing ".pdf". str.replace would also delete ".pdf"
    # occurring in the middle of a name, so the rebuilt name would no longer
    # match the real file.
    rows.append({
        "query": query,
        "_id": filename[:-len(".pdf")] if filename.endswith(".pdf") else filename,
    })

# Metrics evaluation parameters
k = 5  # number of results requested from file_search and scored per query
total_queries = len(rows)
correct_retrievals_at_k = 0
reciprocal_ranks = []
average_precisions = []

def _collect_annotations(response):
    """Return all annotations of the response's message items, or None if it has no message."""
    found_message = False
    annotations = []
    for item in getattr(response, "output", None) or []:
        # Select by type, not position: the number of file_search calls that
        # precede the assistant message is not fixed.
        if getattr(item, "type", None) != "message":
            continue
        found_message = True
        for part in getattr(item, "content", None) or []:
            annotations.extend(getattr(part, "annotations", None) or [])
        # Also accept annotations attached directly to the output item.
        annotations.extend(getattr(item, "annotations", None) or [])
    return annotations if found_message else None


def _score_ranking(retrieved_files, expected_filename):
    """Return (hit, reciprocal rank, average precision) of one ranked file list."""
    if expected_filename in retrieved_files:
        rr = 1 / (retrieved_files.index(expected_filename) + 1)
        correct = True
    else:
        rr = 0
        correct = False

    precisions = []
    num_relevant = 0
    for position, fname in enumerate(retrieved_files, start=1):
        if fname == expected_filename:
            num_relevant += 1
            precisions.append(num_relevant / position)
    avg_precision = sum(precisions) / len(precisions) if precisions else 0

    return correct, rr, avg_precision


def process_query(row):
    """Run one evaluation query through file_search and score the retrieval.

    The query is sent to gpt-4o-mini with the file_search tool forced on. The
    files cited in the answer's annotations, de-duplicated and in citation
    order, are treated as the ranked retrieval list and compared with the
    expected file.

    Args:
        row: dict with ``"query"`` (the question) and ``"_id"`` (the expected
            file name without the ``.pdf`` extension).

    Returns:
        Tuple ``(correct, rr, avg_precision)``: whether the expected file is
        among the first ``k`` distinct cited files, its reciprocal rank
        (0 when absent) and the average precision (0 when absent). Returns
        ``(False, 0, 0)`` when the response contains no message to read
        annotations from.
    """
    query = row['query']
    expected_filename = row['_id'] + '.pdf'
    # Call file_search via Responses API
    response = client.responses.create(
        input=query,
        model="gpt-4o-mini",
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_details['id']],
            "max_num_results": k,
        }],
        # Force the file_search call: retrieval is what is being evaluated.
        tool_choice="required"
    )

    annotations = _collect_annotations(response)
    if annotations is None:
        print(f"No annotations for query: {query}")
        return False, 0, 0

    # The same file is often cited several times. Collapse repeats (keeping
    # first-seen order) before cutting to k, so each file holds exactly one
    # rank and repeated citations do not distort the precision figures.
    cited_names = (getattr(annotation, "filename", None) for annotation in annotations)
    retrieved_files = list(dict.fromkeys(name for name in cited_names if name))[:k]

    correct, rr, avg_precision = _score_ranking(retrieved_files, expected_filename)

    if expected_filename not in retrieved_files:
        print("Expected file NOT found in the retrieved files!")

    # Print diagnostics whenever the expected file is not ranked first.
    if retrieved_files and retrieved_files[0] != expected_filename:
        print(f"Query: {query}")
        print(f"Expected file: {expected_filename}")
        print(f"First retrieved file: {retrieved_files[0]}")
        print(f"Retrieved files: {retrieved_files}")
        print("-" * 50)

    return correct, rr, avg_precision

In [None]:
process_query(rows[0])



(True, 1.0, 1.0)

In [None]:
# Score every evaluation row concurrently; each process_query call issues one
# Responses API request.
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_query, rows), total=len(rows)))

correct_retrievals_at_k = 0
reciprocal_ranks = []
average_precisions = []

for correct, rr, avg_precision in results:
    if correct:
        correct_retrievals_at_k += 1
    reciprocal_ranks.append(rr)
    average_precisions.append(avg_precision)

# Take the denominator from the results actually scored in this cell instead of
# a value computed in an earlier cell, which may be stale after re-running.
total_queries = len(results)

if total_queries:
    recall_at_k = correct_retrievals_at_k / total_queries
    mrr = sum(reciprocal_ranks) / total_queries
    map_score = sum(average_precisions) / total_queries
else:
    # Nothing was evaluated: report zeros instead of raising ZeroDivisionError.
    recall_at_k = mrr = map_score = 0.0

# NOTE: this is the hit rate (share of queries whose expected file was found),
# reported under both names. A strict Precision@k would divide the number of
# relevant files by the number of files retrieved for each query.
precision_at_k = recall_at_k  # In this context, same as recall

100%|██████████| 5/5 [00:08<00:00,  1.64s/it]


In [None]:
# Print the metrics with k
print(f"Metrics at k={k}:")
print(f"Recall@{k}: {recall_at_k:.4f}")
print(f"Precision@{k}: {precision_at_k:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")

Metrics at k=5:
Recall@5: 1.0000
Precision@5: 1.0000
Mean Reciprocal Rank (MRR): 1.0000
Mean Average Precision (MAP): 1.0000
