In [14]:

import os, json, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from openai import OpenAI

# --- Parameters ---
# Input: the chunk table produced in Notebook 02 and the column holding the text to embed.
CHUNKS_CSV = "chunks.csv"
TEXT_COL = "chunk_text"
# Metadata columns that are saved alongside the embeddings.
META_COLS = ["talk_id", "chunk_id", "title", "speaker"]

# Embedding model name, plus a row cap that keeps the run fast.
EMBED_MODEL = "text-embedding-3-small"
MAX_ROWS = 1500

# Output artifacts.
EMBED_OUT_NPY = "embeddings.npy"
EMBED_META_CSV = "embedding_meta.csv"

# Build the API client; the key is looked up in the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
print(" OpenAI client ready (make sure OPENAI_API_KEY is set)")


✔️ OpenAI client ready (make sure OPENAI_API_KEY is set)


In [8]:

# Read only the metadata columns and the text column from the chunk table,
# then show its shape and the first two rows as a sanity check.
df = pd.read_csv(CHUNKS_CSV, usecols=[*META_COLS, TEXT_COL])
print(df.shape)
df.head(2)


(16128, 5)


Unnamed: 0,talk_id,chunk_id,title,speaker,chunk_text
0,0,0,Averting the climate crisis,Al Gore,"Thank you so much, Chris. And it's truly a gre..."
1,0,1,Averting the climate crisis,Al Gore,""" And I thought to myself, in the middle of th..."


In [15]:

# Down-sample to at most MAX_ROWS rows; the fixed seed keeps the subset reproducible.
if MAX_ROWS < len(df):
    df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
print("Using rows:", len(df))


Using rows: 1500


In [16]:

import math, time

def embed_texts(texts, batch_size=64, sleep=0.5):
    """Embed texts in batches and return all vectors as one float32 array.

    Each batch is sent to ``client.embeddings.create`` with ``EMBED_MODEL``
    and the returned vectors are appended in input order, so row ``i`` of the
    result belongs to ``texts[i]``.

    Args:
        texts: Sliceable sequence of strings (e.g. a list) to embed.
        batch_size: Number of texts per API request; must be positive.
        sleep: Seconds to pause between consecutive requests. A falsy value
            (``0`` or ``None``) disables the pause; the last batch is never
            followed by a pause.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per input text.
        Empty input yields an empty array of shape ``(0,)``.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative value would
            otherwise produce an empty ``range`` and silently return no
            embeddings at all.
        RuntimeError: If a response carries a different number of vectors
            than the batch that was sent, which would misalign the rows.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(texts)
    out = []
    for start in range(0, total, batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=EMBED_MODEL, input=batch)

        # Order by each item's `index` field instead of trusting list order,
        # so vectors always line up with the inputs they were computed from.
        ordered = sorted(resp.data, key=lambda item: item.index)
        if len(ordered) != len(batch):
            raise RuntimeError(
                f"Embedding API returned {len(ordered)} vectors for a batch of {len(batch)} texts"
            )
        out.extend(item.embedding for item in ordered)

        # Report progress before pausing so the counter is never stale
        # while the loop sleeps.
        done = min(start + batch_size, total)
        print(f"Embedded {done}/{total}", end="\r")

        # Polite pacing between requests; skipped after the final batch.
        if sleep and done < total:
            time.sleep(sleep)
    print()
    return np.array(out, dtype=np.float32)

# Embed every chunk; missing texts are replaced by a single space before embedding.
emb = embed_texts(df[TEXT_COL].fillna(" ").to_list())
print("Embeddings shape:", emb.shape)

# Persist the embedding matrix and the matching metadata rows (same row order).
np.save(file=EMBED_OUT_NPY, arr=emb)
df[META_COLS].to_csv(path_or_buf=EMBED_META_CSV, index=False)
print("Saved:", EMBED_OUT_NPY, "and", EMBED_META_CSV)


Embedded 1500/1500
Embeddings shape: (1500, 1536)
Saved: embeddings.npy and embedding_meta.csv


In [18]:

def cosine_sim(a, b):
    """Cosine similarity between each row of ``a`` and the vector ``b``.

    Rows of ``a`` are scaled by their own L2 norm (computed along axis 1) and
    ``b`` is scaled by its overall norm; a small epsilon (1e-8) is added to
    every norm so all-zero inputs do not divide by zero. The result is the
    dot product of the scaled inputs.
    """
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)
    unit_rows = a / (row_norms + 1e-8)

    unit_query = b / (np.linalg.norm(b) + 1e-8)

    return np.dot(unit_rows, unit_query)

def search(query, top_k=5):
    """Return the ``top_k`` rows of ``df`` most similar to ``query``.

    The query is embedded with ``EMBED_MODEL``, compared against every row of
    ``emb`` via ``cosine_sim``, and the best-scoring rows of ``df`` are
    returned (highest first) with an added ``score`` column.
    """
    # Embed the query text as a float32 vector.
    response = client.embeddings.create(model=EMBED_MODEL, input=[query])
    query_vec = np.array(response.data[0].embedding, dtype=np.float32)

    # Rank all stored embeddings by similarity, descending.
    scores = cosine_sim(emb, query_vec)
    best = np.argsort(-scores)[:top_k]

    # `assign` returns a new frame, so `df` itself is left untouched.
    return df.iloc[best].assign(score=scores[best])

# Example search: edit the query string below to try other prompts.
query = "hopeful ending with a call to action about climate"
hits = search(query, top_k=5)
# Show the metadata columns and the similarity score for each hit.
hits.loc[:, ["talk_id","chunk_id","title","speaker","score"]]


Unnamed: 0,talk_id,chunk_id,title,speaker,score
994,158,3,The case for optimism,Larry Brilliant,0.580847
113,2117,0,The inside story of the Paris climate agreement,Christiana Figueres,0.567468
906,3722,9,How we can turn the tide on climate,Chris Anderson,0.566015
250,1112,4,Why I must speak out about climate change,James Hansen,0.565192
938,3722,1,How we can turn the tide on climate,Chris Anderson,0.542857


In [19]:

# Print the best hit's identifiers and score, then the first 1200 characters of its text.
if not hits.empty:
    r0 = hits.iloc[0]
    print(f"Top match — talk_id={r0['talk_id']}, chunk_id={r0['chunk_id']}, score={r0['score']:.4f}\n")
    # `hits` is built from full rows of `df`, so the chunk text is already on
    # the hit itself. Reading it here avoids re-scanning `df` by
    # (talk_id, chunk_id), which could also pick a different row if that pair
    # were ever duplicated.
    print(r0[TEXT_COL][:1200])


Top match — talk_id=158, chunk_id=3, score=0.5808

Human beings have always risen to the challenge. You just need to look at the list of Nobel laureates to remind ourselves. We've been here before, paralyzed by fear, paralyzed into inaction, when some — probably one of you in this room — jumped into the breach and created an organization like Physicians for Social Responsibility, which fought against the nuclear threat, Medicins Sans Frontieres, that renewed our commitment to disaster relief, Mohamed ElBaradei, and the tremendous hope and optimism that he brought all of us, and our own Muhammad Yunus. We've seen the eradication of smallpox. We may see the eradication of polio this year. Last year, there were only 2,000 cases in the world. We may see the eradication of guinea worm next year — there are only 35,000 cases left in the world. 20 years ago, there were three and a half million. And we've seen a new disease, not like the 30 novel emerging communicable diseases. This disease is