In [14]:

from pathlib import Path

# Data
DATA_PATH ="ted_talks_clean.csv"

# Columns
ID_COL = "talk_id"
TITLE_COL = "title"
URL_COL  = "url"
TEXT_COLS = ["transcript", "description"]  # prefer transcript first

# Chunking
CHUNK_SIZE = 250
CHUNK_OVERLAP = 50

# Retrieval + Generation
TOP_K = 10
FINAL_CONTEXT = 5
EMBEDDING_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-4o-mini"

# Cost/speed guard for class demos (set a number to limit how many talks to embed)
LIMIT_DOCS = None  # e.g., 400  (None = all)
EMB_CACHE = Path("./embeddings_minimal.jsonl")


In [7]:

import csv

rows = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i, r in enumerate(reader):
        rows.append(r)
        if LIMIT_DOCS and len(rows) >= LIMIT_DOCS:
            break

if not rows:
    raise RuntimeError("No data loaded. Check DATA_PATH.")

print("Loaded rows:", len(rows))
print("Columns:", list(rows[0].keys())[:12], "...")


Loaded rows: 4005
Columns: ['title', 'speaker_1', 'url', 'transcript'] ...


In [8]:

def pick_text_col(headers):
    for c in TEXT_COLS:
        if c in headers:
            return c
    return None

def safe(d, k, default=""):
    v = d.get(k, default)
    return v if v is not None else default

TEXT_COL = pick_text_col(rows[0].keys())
if TEXT_COL is None:
    raise ValueError(f"No candidate text column among {TEXT_COLS}")

# normalize
for r in rows:
    r[TEXT_COL] = safe(r, TEXT_COL, "").strip()
    r[TITLE_COL] = safe(r, TITLE_COL, "").strip()
    r[URL_COL] = safe(r, URL_COL, "").strip()
    if not r.get(ID_COL):
        r[ID_COL] = str(id(r))

def chunk_text(txt, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    words = txt.split()
    if not words: return []
    chunks, i = [], 0
    step = max(1, size - overlap)
    while i < len(words):
        chunks.append(" ".join(words[i:i+size]))
        i += step
    return chunks

chunks = []
for r in rows:
    parts = chunk_text(r[TEXT_COL])
    for j, ch in enumerate(parts):
        chunks.append({
            "doc_id": r[ID_COL],
            "title": r[TITLE_COL],
            "url": r[URL_COL],
            "chunk_id": f"{r[ID_COL]}_{j}",
            "text": ch
        })

print("Total chunks:", len(chunks))


Total chunks: 37968


In [9]:

import os, json, time, urllib.request

def require_key():
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("Set OPENAI_API_KEY in your shell, then restart Jupyter.")

def openai_embed(texts, model=EMBEDDING_MODEL):
    require_key()
    url = "https://api.openai.com/v1/embeddings"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}
    body = {"model": model, "input": texts}
    req = urllib.request.Request(url, data=json.dumps(body).encode("utf-8"), headers=headers, method="POST")
    with urllib.request.urlopen(req) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return [d["embedding"] for d in data["data"]]

# Embed in batches and cache to jsonl
BATCH = 64
if not EMB_CACHE.exists():
    with open(EMB_CACHE, "w", encoding="utf-8") as out:
        for i in range(0, len(chunks), BATCH):
            batch = chunks[i:i+BATCH]
            vecs = openai_embed([c["text"] for c in batch])
            for c, v in zip(batch, vecs):
                out.write(json.dumps({"chunk_id": c["chunk_id"], "doc_id": c["doc_id"], "title": c["title"], "url": c["url"], "vec": v}) + "\n")
            print(f"Embedded {i+len(batch)}/{len(chunks)}")
            time.sleep(0.25)  # polite pacing
else:
    print("Using cached embeddings at", EMB_CACHE)


Embedded 64/37968
Embedded 128/37968
Embedded 192/37968
Embedded 256/37968
Embedded 320/37968
Embedded 384/37968
Embedded 448/37968
Embedded 512/37968
Embedded 576/37968
Embedded 640/37968
Embedded 704/37968
Embedded 768/37968
Embedded 832/37968
Embedded 896/37968
Embedded 960/37968
Embedded 1024/37968
Embedded 1088/37968
Embedded 1152/37968
Embedded 1216/37968
Embedded 1280/37968
Embedded 1344/37968
Embedded 1408/37968
Embedded 1472/37968
Embedded 1536/37968
Embedded 1600/37968
Embedded 1664/37968
Embedded 1728/37968
Embedded 1792/37968
Embedded 1856/37968
Embedded 1920/37968
Embedded 1984/37968
Embedded 2048/37968
Embedded 2112/37968
Embedded 2176/37968
Embedded 2240/37968
Embedded 2304/37968
Embedded 2368/37968
Embedded 2432/37968
Embedded 2496/37968
Embedded 2560/37968
Embedded 2624/37968
Embedded 2688/37968
Embedded 2752/37968
Embedded 2816/37968
Embedded 2880/37968
Embedded 2944/37968
Embedded 3008/37968
Embedded 3072/37968
Embedded 3136/37968
Embedded 3200/37968
Embedded 3264/37

In [10]:

import json, math

emb_index = []
with open(EMB_CACHE, "r", encoding="utf-8") as f:
    for line in f:
        emb_index.append(json.loads(line))

def dot(a, b): return sum(x*y for x, y in zip(a, b))
def norm(a): return math.sqrt(sum(x*x for x in a))
def cosine_sim(a, b):
    na, nb = norm(a), norm(b)
    return 0.0 if na == 0 or nb == 0 else dot(a,b)/(na*nb)

def search(query, top_k=TOP_K):
    qv = openai_embed([query])[0]
    scored = [(cosine_sim(qv, rec["vec"]), rec) for rec in emb_index]
    scored.sort(reverse=True, key=lambda x: x[0])
    out, seen = [], set()
    for s, rec in scored:
        if rec["chunk_id"] in seen: 
            continue
        seen.add(rec["chunk_id"])
        out.append({"title": rec["title"], "url": rec["url"], "doc_id": rec["doc_id"], "chunk_id": rec["chunk_id"], "score": s})
        if len(out) >= top_k:
            break
    return out


In [11]:

def build_context(retrieved, limit_words=120):
    blocks = []
    for i, r in enumerate(retrieved, start=1):
        # get chunk text from original 'chunks' list
        txt = next((c["text"] for c in chunks if c["chunk_id"] == r["chunk_id"]), "")
        words = txt.split()
        snip = " ".join(words[:limit_words]) + (" ..." if len(words) > limit_words else "")
        blocks.append(f"[{i}] {r['title']}\n{snip}")
    return "\n\n".join(blocks)

def make_prompt(question, ctx):
    return (
        "Answer the question using ONLY the context. Cite sources as [number]. "
        "If unsure, say you don't know.\n\n"
        "Context:\n" + ctx + "\n\n"
        "Question: " + question + "\n\nAnswer:"
    )

def openai_chat(prompt, model=CHAT_MODEL, temperature=0.2, max_tokens=300):
    require_key()
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}
    body = {"model": model, "messages":[{"role":"system","content":"You are a precise assistant."},{"role":"user","content":prompt}], "temperature": temperature, "max_tokens": max_tokens}
    req = urllib.request.Request(url, data=json.dumps(body).encode("utf-8"), headers=headers, method="POST")
    with urllib.request.urlopen(req) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return data["choices"][0]["message"]["content"].strip()

def ask(question, top_k=TOP_K, final_k=FINAL_CONTEXT):
    retrieved = search(question, top_k=top_k)[:final_k]
    ctx = build_context(retrieved)
    prompt = make_prompt(question, ctx)
    answer = openai_chat(prompt)
    cites = [(i+1, r["title"], r["url"]) for i, r in enumerate(retrieved)]
    return {"answer": answer, "citations": cites, "retrieved": retrieved}


In [12]:

question = "How do TED speakers describe the role of failure in innovation?"
out = ask(question)
print("\n=== Answer ===\n", out["answer"])
print("\n=== Citations ===")
for i, (idx, title, url) in enumerate(out["citations"], start=1):
    print(f"[{idx}] {title} — {url}")



=== Answer ===
 TED speakers describe the role of failure in innovation as a necessary part of the process. Myshkin Ingawale, for example, emphasized that he created technology to test for anemia and faced multiple failures before achieving success, stating, "And it didn't work. And then I made it 32 more times, and then it worked" [1]. Additionally, the idea that removing the fear of failure allows for the possibility of achieving the impossible is highlighted, suggesting that failure should not be feared but embraced as part of the innovation journey [2]. Joe Kraus also points out that persistence through failure is crucial for success, indicating that one must "persist through failure" and other challenges [3]. Overall, failure is seen as an integral component of learning and growth in the innovation process.

=== Citations ===
[1] Listening to shame — https://www.ted.com/talks/brene_brown_listening_to_shame/
[2] From mach-20 glider to hummingbird drone — https://www.ted.com/talks/