In [21]:
from openai import OpenAI
import numpy as np
from dotenv import load_dotenv

In [22]:
# Load variables from a .env file before the API client is constructed.
# NOTE(review): OpenAI() is created with no arguments, so it presumably picks
# up its API key from the environment populated here — confirm.
load_dotenv()
# Shared module-level client; get_embedding() calls client.embeddings.create.
client = OpenAI()

In [31]:
def get_embedding(texts, model='text-embedding-3-small'):
    """Embed one string or a list of strings with the given model.

    A bare string is treated as a one-element batch. The result is a NumPy
    array built from the ``embedding`` field of each item in the response,
    in the order the response lists them.
    """
    batch = [texts] if isinstance(texts, str) else texts

    response = client.embeddings.create(input=batch, model=model)

    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return np.array(vectors)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Three short sentences: two about cats, one about a dog
sentences = [
    "The cat is sleeping on the couch",
    "A kitten is playing with a toy",
    "The dog is running in the park",
]

embeddings = get_embedding(sentences)

# One-row slices keep each operand 2-D for cosine_similarity; the single
# entry of the resulting 1x1 matrix is the score for that pair
cat_kitten = cosine_similarity(embeddings[:1], embeddings[1:2])[0, 0]
cat_dog = cosine_similarity(embeddings[:1], embeddings[2:3])[0, 0]

print(f"Cat Kitten: {cat_kitten: .3f}")
print(f"Cat Dog: {cat_dog: .3f}")

Cat Kitten:  0.385
Cat Dog:  0.200


In [33]:
def fixed_length_chunking(text, chunk_size=500, overlap=50):
    """Split text into fixed-length, optionally overlapping chunks.

    A window of ``chunk_size`` characters slides over ``text``; each new
    window starts ``chunk_size - overlap`` characters after the previous one.
    Every window is stripped of surrounding whitespace, and windows that are
    empty after stripping are dropped.

    Args:
        text: The string to split.
        chunk_size: Window width in characters; must be positive.
        overlap: Number of characters shared by consecutive windows; must be
            smaller than ``chunk_size``.

    Returns:
        A list of non-empty, stripped chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # The window reached the end of the text: nothing left to cover.
        if end >= text_len:
            break
        # Advance unconditionally; a whitespace-only window must not stall
        # the loop.
        start += step
    return chunks

In [34]:
# 283 identical characters, split with 100-character windows that overlap
# by 20 (each window starts 80 characters after the previous one).
sample = "A" * 283
chunks = fixed_length_chunking(sample, chunk_size=100, overlap=20)
print(f"Original: {len(sample)} chars → {len(chunks)} chunks")

Original: 283 chars → 4 chunks


In [52]:
def vector_search(query, chunks, chunk_embeddings, top_k=3):
    """Find the chunks most similar to the query.

    The query is embedded with ``get_embedding`` and compared against
    ``chunk_embeddings`` using cosine similarity.

    Args:
        query: Query text to embed.
        chunks: Sequence of chunk texts, aligned with ``chunk_embeddings``.
        chunk_embeddings: One embedding per chunk, in the same order.
        top_k: Maximum number of results to return. ``None`` returns every
            chunk; zero or a negative value returns no results.

    Returns:
        A list of ``{'chunk': ..., 'similarity': ...}`` dicts ordered from
        most to least similar. Empty when there are no chunks or ``top_k``
        is not positive.

    Raises:
        AssertionError: If ``chunks`` and ``chunk_embeddings`` differ in
            length.
    """
    assert len(chunks) == len(chunk_embeddings), \
        f"Mismatch: chunks={len(chunks)} embeddings={len(chunk_embeddings)}"

    # Nothing to rank: return before spending an embedding request. This
    # also keeps a negative top_k from being read as a negative slice bound.
    if len(chunks) == 0 or (top_k is not None and top_k <= 0):
        return []

    query_embedding = get_embedding(query)
    # The query is the only row in the first operand, so row 0 holds its
    # similarity to every chunk.
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    top_indices = similarities.argsort()[::-1][:top_k]

    return [
        {'chunk': chunks[idx], 'similarity': similarities[idx]}
        for idx in top_indices
    ]

In [48]:
# A tiny four-document corpus for trying out vector_search
documents = [
    "Python is a programming language",
    "Machine learning uses Python extensively",
    "Cats are popular pets",
    "Deep learning is a subset of machine learning",
]

doc_embeddings = get_embedding(documents)

# top_k=4 matches the corpus size, so every document is printed in ranked order
results = vector_search(
    "Artificial Intelligence",
    documents,
    doc_embeddings,
    top_k=4,
)
for r in results:
    print(f"{r['similarity']:.3f}: {r['chunk']}")

ValueError: Found array with dim 3, while dim <= 2 is required by check_pairwise_arrays.

In [37]:
from tavily import TavilyClient

tavily = TavilyClient()
response = tavily.search(
    "2025 Nobel Prize winners", max_results=10, include_raw_content=True
)

# Keep only the hits that carry raw page content, reduced to the three
# fields used downstream
search_results = []
for result in response['results']:
    if not result.get('raw_content'):
        continue
    search_results.append(
        dict(
            title=result['title'],
            content=result['raw_content'],
            url=result['url'],
        )
    )

In [38]:
import tiktoken

# Tokenizer used for the token counts printed by this notebook
enc = tiktoken.encoding_for_model("gpt-5")

# Concatenate every page into one text, each prefixed with its title
full_text = "\n\n".join(
    f"Title: {r['title']}\n{r['content']}" for r in search_results
)
total_tokens = len(enc.encode(full_text))

print(f"Total characters: {len(full_text)}")
print(f"Total tokens: {total_tokens}")

Total characters: 70182
Total tokens: 18535


In [49]:
# Chunk each page separately so every chunk keeps its source title and URL
all_chunks = []

for result in search_results:
    text = f"Title: {result['title']}\n{result['content']}"
    chunks = fixed_length_chunking(text, chunk_size=500, overlap=50)

    for chunk in chunks:
        all_chunks.append(
            dict(text=chunk, title=result['title'], url=result['url'])
        )

print(f"Total chunks: {len(all_chunks)}")

# Embed all chunk texts in a single get_embedding call
chunk_texts = [entry['text'] for entry in all_chunks]
chunk_embeddings = get_embedding(chunk_texts)

Total chunks: 161


In [53]:
query = "quantum computing"
results = vector_search(query, chunk_texts, chunk_embeddings, top_k=3)

# Header, then each hit with its rank, score and a 300-character preview
print(f"Query: '{query}'\n")
print("=" * 60)
for i, r in enumerate(results, start=1):
    print(f"\n[{i}] Similarity: {r['similarity']:.3f}")
    print(f"{r['chunk'][:300]}...")

Query: 'quantum computing'


[1] Similarity: 0.572
eloping the next generation of quantum technology, including quantum cryptography, quantum computers, and quantum sensors.

[Press release](https://www.nobelprize.org/prizes/physics/2025/press-release/) [Popular information: Quantum properties on a human scale](https://www.nobelprize.org/prizes/phys...

[2] Similarity: 0.515
an electric circuit.”

Quantum tunneling describes the phenomenon of particles penetrating a barrier when they don’t appear to have enough energy to do so. The committee [noted](https://www.nobelprize.org/prizes/physics/2025/popular-information/) that while an everyday object like a ball—which is ma...

[3] Similarity: 0.485
obelprize.org/prizes/physics/2025/press-release/)

A major question in physics is the maximum size of a system that can demonstrate quantum mechanical effects. This year’s Nobel Prize laureates conducted experiments with an electrical circuit in which they demonstrated both quantum mechanical t

In [54]:
# Join the retrieved chunks with the same blank-line separator as full_text
top_chunks = [hit['chunk'] for hit in results]
selected_text = "\n\n".join(top_chunks)
selected_tokens = len(enc.encode(selected_text))

# Compare the retrieved context against the full concatenated text
print(f"Total tokens: {total_tokens}")
print(f"Selected tokens: {selected_tokens}")
print(f"Savings rate: {(1 - selected_tokens / total_tokens) * 100:.1f}%")

Total tokens: 18535
Selected tokens: 311
Savings rate: 98.3%


In [56]:
# Same query as the earlier search, this time with advanced search depth
# and three content chunks requested per source
response = tavily.search(
    "2025 Nobel Prize winners", search_depth="advanced", chunks_per_source=3
)

In [57]:
# Dump the raw search response to inspect its structure
print(response)

{'query': '2025 Nobel Prize winners', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'url': 'https://m.economictimes.com/news/international/world-news/nobel-prize-2025-winners-heres-the-full-list-across-all-categories/articleshow/124336117.cms', 'title': "Nobel Prize 2025 Winners: Here's the full list across all categories", 'content': '|  |  |  |\n --- \n| Category | Winners | Research Area |\n| Physiology or Medicine | Mary Brunkow, Fred Ramsdell and Shimon Sakaguchi | Immune system |\n| Physics | John Clarke, Michel H. Devoret and John M. Martinis | Macroscopic quantum mechanical tunnelling |\n| Chemistry | Susumu Kitagawa, Richard Robson and Omar Yaghi | Development of metal–organic frameworks |\n| Literature | László Krasznahorkai | Compelling and visionary oeuvre |\n| Peace | María Corina Machado | Promoting democratic rights for people of Venezuela |\n| Economics | Joel Mokyr, Philippe Aghion and Peter Howitt | Innovation-led economic growth | [...] NHL 