In [5]:
import os
import pandas as pd
import ast
import numpy as np
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

In [2]:
# 1) Read CSV
# Dataset location is resolved in this order:
#   1. TED_CSV_PATH environment variable (used as-is whenever it is set),
#   2. ted_talks_en.csv in the current working directory, which is what the
#      error message below tells the user to do,
#   3. the original absolute default, kept so the existing setup keeps working.
# NOTE(review): "next to this script" assumes the kernel's working directory is
# the notebook's folder — confirm for your Jupyter setup.
local_csv_path = "ted_talks_en.csv"
legacy_csv_path = r"C:\Users\shale\Downloads\ted_talks_en.csv"

csv_path = os.getenv("TED_CSV_PATH")
if not csv_path:
    csv_path = local_csv_path if os.path.exists(local_csv_path) else legacy_csv_path

# Fail early with an actionable message instead of a raw pandas error.
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"CSV not found at: {csv_path}. Set TED_CSV_PATH or put the file next to this script."
    )

df = pd.read_csv(csv_path)
print("Rows:", len(df))
print("Columns:", list(df.columns))

Rows: 4005
Columns: ['talk_id', 'title', 'speaker_1', 'all_speakers', 'occupations', 'about_speakers', 'views', 'recorded_date', 'published_date', 'event', 'native_lang', 'available_lang', 'comments', 'duration', 'topics', 'related_talks', 'url', 'description', 'transcript']


In [3]:
def safe_str(x) -> str:
    """
    Convert a cell value to a stripped string, mapping missing values to "".

    Missing means None or any scalar that pandas treats as null: float NaN,
    NumPy floating NaN of any width (e.g. np.float32), pd.NA and pd.NaT.
    Non-scalar values (lists, dicts, ...) are never treated as missing and are
    passed through str().

    Args:
        x: Any value, typically a cell taken from a DataFrame row.

    Returns:
        "" for missing values, otherwise str(x) with surrounding whitespace removed.
    """
    if x is None:
        return ""
    # pd.isna returns an element-wise array for containers, so restrict the
    # check to scalars, where it yields a plain boolean.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return ""
    return str(x).strip()


def safe_int(x, default=0):
    """
    Best-effort conversion of a value to int.

    Args:
        x: Value to convert.
        default: Returned when x is None, a float NaN, or int(x) raises.

    Returns:
        int(x) when the conversion succeeds, otherwise default.
    """
    is_nan = isinstance(x, float) and np.isnan(x)
    if x is None or is_nan:
        return default
    try:
        converted = int(x)
    except Exception:
        # Anything int() rejects (text, infinities, unsupported types) maps to the default.
        return default
    return converted

def safe_float(x, default=0.0):
    """
    Best-effort conversion of a value to float, never returning NaN.

    The NaN test is applied to the converted value, so inputs that only become
    NaN through conversion (np.float32 NaN, the string "nan") are also mapped
    to the default rather than leaking NaN to the caller.

    Args:
        x: Value to convert.
        default: Returned when x is None, float(x) raises, or the result is NaN.

    Returns:
        float(x) when it is a valid non-NaN number, otherwise default.
    """
    if x is None:
        return default
    try:
        value = float(x)
    except Exception:
        # Anything float() rejects (free text, unsupported types) maps to the default.
        return default
    if np.isnan(value):
        return default
    return value


def parse_list_str(value) -> list[str]:
    """
    Turn a CSV cell holding a Python list literal (e.g. "['a','b']") into list[str].

    Args:
        value: None, float NaN, an actual list, or anything whose str() is a
            list literal.

    Returns:
        The non-empty items, each passed through safe_str. An empty list is
        returned for missing input, blank text, text that cannot be parsed as
        a literal, or a literal that is not a list.
    """
    def _clean(items) -> list[str]:
        # Drop entries that are empty once normalised by safe_str.
        return [safe_str(item) for item in items if safe_str(item)]

    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    if is_missing:
        return []
    if isinstance(value, list):
        return _clean(value)

    text = str(value).strip()
    if text == "":
        return []

    try:
        # literal_eval only evaluates Python literals, never arbitrary code.
        literal = ast.literal_eval(text)
    except Exception:
        return []

    return _clean(literal) if isinstance(literal, list) else []


def build_embed_text(row: dict) -> str:
    """
    Assemble the text that gets chunked and embedded for one talk.

    The text is a labelled block built from title, speaker_1, event, topics
    (joined with ", "), description and transcript, in that order. Missing
    fields render as empty values after their label.

    Args:
        row: One talk as a dict keyed by CSV column name.

    Returns:
        The labelled text with leading/trailing whitespace stripped.
    """
    def field(name: str) -> str:
        return safe_str(row.get(name))

    topics_txt = ", ".join(parse_list_str(row.get("topics")))

    sections = (
        f"Title: {field('title')}\n",
        f"Speaker: {field('speaker_1')}\n",
        f"Event: {field('event')}\n",
        f"Topics: {topics_txt}\n",
        f"Description: {field('description')}\n",
        f"Transcript:\n{field('transcript')}",
    )
    return "".join(sections).strip()


def build_metadata(row: dict, chunk_index: int) -> dict:
    """
    Build the per-chunk metadata dict stored alongside a vector (not embedded).

    Args:
        row: One talk as a dict keyed by CSV column name.
        chunk_index: Position of the chunk within the talk's chunk list.

    Returns:
        Dict with string fields (talk_id, title, speaker_1, event, url,
        published_date, recorded_date, native_lang), the parsed topics list,
        integer duration and views, float comments, and chunk_index.
    """
    # String identity fields first, in the order they appear in the stored dict.
    metadata = {
        key: safe_str(row.get(key))
        for key in ("talk_id", "title", "speaker_1", "event", "url")
    }
    metadata["topics"] = parse_list_str(row.get("topics"))  # parsed list, not the raw literal

    for date_key in ("published_date", "recorded_date"):
        metadata[date_key] = safe_str(row.get(date_key))

    for count_key in ("duration", "views"):
        metadata[count_key] = safe_int(row.get(count_key))

    metadata["comments"] = safe_float(row.get("comments"))
    metadata["native_lang"] = safe_str(row.get("native_lang"))
    metadata["chunk_index"] = chunk_index
    return metadata


In [4]:
# Quick dataframe sanity stats
# Read-only overview of `df`: shape, a preview of key columns, null counts,
# transcript length distribution, topic frequencies and per-column cardinality.
print("\n=== DF SHAPE ===")
print("rows, cols:", df.shape)

print("\n=== HEAD (key columns) ===")
key_cols = ["talk_id", "title", "speaker_1", "event", "published_date", "topics"]
# Keep only the columns that are actually present so the preview does not fail
# on a CSV with a different schema.
existing = [c for c in key_cols if c in df.columns]
print(df[existing].head(5).to_string(index=False))

print("\n=== NULLS (top 12 columns) ===")
# Missing-value count per column, largest first.
null_counts = df.isna().sum().sort_values(ascending=False)
print(null_counts.head(12).to_string())

print("\n=== SAMPLE TRANSCRIPT LENGTHS ===")
if "transcript" in df.columns:
    # Character length per transcript; missing transcripts count as length 0.
    lens = df["transcript"].fillna("").astype(str).str.len()
    print("min:", int(lens.min()), "median:", int(lens.median()), "p90:", int(lens.quantile(0.90)), "max:", int(lens.max()))

print("\n=== TOP TOPICS (approx, raw string) ===")
if "topics" in df.columns:
    # Counts identical raw cell strings (whole list literals), not individual topics.
    top_topics_raw = df["topics"].fillna("").astype(str).value_counts().head(10)
    print(top_topics_raw.to_string())

print("\n=== UNIQUE COUNTS (selected) ===")
for c in ["talk_id", "title", "speaker_1", "event", "native_lang"]:
    if c in df.columns:
        print(f"{c}: {df[c].nunique(dropna=True)} unique")

# Optional: per-topic frequency table. Each cell is parsed with parse_list_str
# and the resulting lists are flattened into one Series before counting.
if "topics" in df.columns:
    topics_series = df["topics"].apply(parse_list_str)
    flat = pd.Series([t for lst in topics_series for t in lst if t])
    print("\n=== TOP TOPICS (parsed) ===")
    print(flat.value_counts().head(15).to_string())



=== DF SHAPE ===
rows, cols: (4005, 19)

=== HEAD (key columns) ===
 talk_id                           title        speaker_1   event published_date                                                                                                                                     topics
       1     Averting the climate crisis          Al Gore TED2006     2006-06-27     ['alternative energy', 'cars', 'climate change', 'culture', 'environment', 'global issues', 'science', 'sustainability', 'technology']
      92 The best stats you've ever seen     Hans Rosling TED2006     2006-06-27 ['Africa', 'Asia', 'Google', 'demo', 'economics', 'global issues', 'health', 'statistics', 'global development', 'visualizations', 'math']
       7                Simplicity sells      David Pogue TED2006     2006-06-27                ['computers', 'entertainment', 'interface design', 'media', 'music', 'performance', 'simplicity', 'software', 'technology']
      53             Greening the ghetto    Majora 

In [5]:
# =========================
# Hyperparameters (report later in /api/stats)
# =========================
# The "<=" notes are upper bounds recorded by the author.
# NOTE(review): presumably limits imposed by the assignment — confirm.
CHUNK_SIZE = 1024          # chunk_size handed to the text splitter; <= 2048
OVERLAP_RATIO = 0.15      # fraction of CHUNK_SIZE shared by neighbouring chunks; <= 0.3
TOP_K = 12                # default number of matches retrieved per question; <= 30 (used later at query time)

# Model name passed to the embeddings client.
EMBED_MODEL = "RPRTHPB-text-embedding-3-small"

In [6]:
# =========================
# Chunking
# =========================
# Overlap is derived from the ratio and truncated to a whole number
# (int(1024 * 0.15) == 153 with the values above).
chunk_overlap = int(CHUNK_SIZE * OVERLAP_RATIO)
# Separators are listed from coarsest (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ". ", " ", ""],
)

In [7]:
# Smoke test on the first talk only: build its embed text and split it into chunks.
row0 = df.iloc[0].to_dict()
text0 = build_embed_text(row0)
chunks0 = splitter.split_text(text0)

In [8]:
# Report the smoke-test result: text size, chunk count and a preview of the first chunk.
print("\nTalk 0 talk_id:", row0.get("talk_id"))
print("Embedded text length (chars):", len(text0))
print("Chunks produced:", len(chunks0))
print("\nFirst chunk preview (first 400 chars):")
print(chunks0[0][:400])

# Example of the metadata stored with chunk 0 of this talk.
meta0 = build_metadata(row0, chunk_index=0)
print("\nMetadata example:")
print(meta0)


Talk 0 talk_id: 1
Embedded text length (chars): 12327
Chunks produced: 14

First chunk preview (first 400 chars):
Title: Averting the climate crisis
Speaker: Al Gore
Event: TED2006
Topics: alternative energy, cars, climate change, culture, environment, global issues, science, sustainability, technology
Description: With the same humor and humanity he exuded in "An Inconvenient Truth," Al Gore spells out 15 ways that individuals can address climate change immediately, from buying a hybrid to inventing a new, h

Metadata example:
{'talk_id': '1', 'title': 'Averting the climate crisis', 'speaker_1': 'Al Gore', 'event': 'TED2006', 'url': 'https://www.ted.com/talks/al_gore_averting_the_climate_crisis/', 'topics': ['alternative energy', 'cars', 'climate change', 'culture', 'environment', 'global issues', 'science', 'sustainability', 'technology'], 'published_date': '2006-06-27', 'recorded_date': '2006-02-25', 'duration': 977, 'views': 3523392, 'comments': 272.0, 'native_lang': 'en', 'chunk_

In [9]:
# llmod_api_key = api_key
# # llmod_base_url = 'https://api.llmod.ai'
# llmod_base_url = "https://api.llmod.ai/v1"

# Credentials and endpoint come from the environment so nothing secret is stored in the notebook.
llmod_api_key = os.getenv("LLMSTUDIO_API_KEY")
llmod_base_url = os.getenv("LLMSTUDIO_BASE_URL")

# Both values are required; stop here rather than fail later inside an API call.
if not (llmod_api_key and llmod_base_url):
    raise RuntimeError("Missing LLMSTUDIO environment variables")

In [3]:
# llmod_api_key

In [11]:
# Display the resolved base URL (not the key) to confirm the environment is wired up.
llmod_base_url

'https://api.llmod.ai/v1'

In [10]:
# Embeddings client pointed at the configured endpoint and model.
embeddings = OpenAIEmbeddings(
    model=EMBED_MODEL,
    api_key=llmod_api_key,
    base_url=llmod_base_url,
)

# One-off probe: embed a short string to check connectivity and see the vector size.
# The index created later hard-codes dimension=1536, so the printed dimension must agree with it.
v = embeddings.embed_query("test")
print("vector dim:", len(v))
print("first 5:", v[:5])

vector dim: 1536
first 5: [-0.009876498021185398, 0.0015304419212043285, 0.015627983957529068, -0.05478791892528534, -0.00641245674341917]


# Creating the Pinecone index

In [6]:
# Read the Pinecone key from the environment; the value is deliberately not displayed.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
# pinecone_api_key

In [11]:
from pinecone import Pinecone, ServerlessSpec
import os

# picone_key_api = pine_cone_api
# pc = Pinecone(api_key=picone_key_api)

# Read the Pinecone key from the environment and fail fast when it is missing.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError("Missing PINECONE_API_KEY env var")

pc = Pinecone(api_key=pinecone_api_key)

index_name = "ted-talks-index"

# Create the index only when no index with this name is listed, so re-running
# the cell reuses an existing index instead of trying to create it again.
# NOTE(review): this rebinds `existing`, which the sanity-stats cell used for
# the list of dataframe columns.
existing = [i["name"] for i in pc.list_indexes()]
if index_name not in existing:
    pc.create_index(
        name=index_name,
        dimension=1536,          # must match text-embedding-3-small (the probe cell printed "vector dim: 1536")
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"    # or the region Pinecone suggests
        )
    )

# Handle used by the upsert and query cells below.
index = pc.Index(index_name)
print("Index ready:", index_name)

Index ready: ted-talks-index


In [12]:
# # 1) pick 1 talk, build text, chunk
# row0 = df.iloc[0].to_dict()
# talk_id0 = safe_str(row0.get("talk_id"))

# text0 = build_embed_text(row0)
# chunks0 = splitter.split_text(text0)

# print("talk_id:", talk_id0, "chunks:", len(chunks0))

In [13]:
# # 2) embed only first 5 chunks (budget-safe)
# sample_chunks = chunks0[:5]
# sample_vectors = embeddings.embed_documents(sample_chunks)

# print("sample vectors:", len(sample_vectors), "dim:", len(sample_vectors[0]))

In [14]:
# embed all chunks
# Batch sizes and the module-level buffers shared by the flush helpers and the ingest loop.
from tqdm import tqdm

BATCH_EMBED = 64     # embedding batch size
BATCH_UPSERT = 200   # upsert batch size (metadata heavy, keep moderate)

# The three embed_* buffers are parallel lists: position i in each refers to the same chunk.
upsert_buffer = []          # holds pinecone vector dicts
embed_text_buffer = []      # holds chunk texts waiting for embedding
embed_meta_buffer = []      # holds metadata dicts waiting for embedding (parallel to embed_text_buffer)
embed_id_buffer = []        # holds IDs waiting for embedding

def flush_embed_and_build_vectors():
    """
    Embed every text waiting in embed_text_buffer and queue the results for upsert.

    For each pending chunk a dict {"id", "values", "metadata"} is appended to
    upsert_buffer, with the chunk text stored under metadata["chunk"]. The
    three embed_* buffers are emptied afterwards. Does nothing when no text is
    pending.

    Raises:
        RuntimeError: if the number of returned vectors, ids, metadata dicts
            and texts differ. The buffers are left untouched in that case.
    """
    global upsert_buffer, embed_text_buffer, embed_meta_buffer, embed_id_buffer
    if not embed_text_buffer:
        return

    vecs = embeddings.embed_documents(embed_text_buffer)

    # zip() stops at its shortest input, so a short embedding response or
    # out-of-sync buffers would otherwise drop chunks without any error.
    expected = len(embed_text_buffer)
    if not (len(vecs) == len(embed_id_buffer) == len(embed_meta_buffer) == expected):
        raise RuntimeError(
            "Embedding batch is out of sync: "
            f"{expected} texts, {len(vecs)} vectors, "
            f"{len(embed_id_buffer)} ids, {len(embed_meta_buffer)} metadata dicts"
        )

    for _id, ch, md, vec in zip(embed_id_buffer, embed_text_buffer, embed_meta_buffer, vecs):
        md["chunk"] = ch  # required for /api/prompt context[]
        upsert_buffer.append({"id": _id, "values": vec, "metadata": md})

    # Rebind to fresh lists; the ingest loop looks the names up again on each append.
    embed_text_buffer = []
    embed_meta_buffer = []
    embed_id_buffer = []

def flush_upsert():
    """
    Send every vector queued in upsert_buffer to the index, then empty the buffer.

    Vectors are sent in consecutive slices of at most BATCH_UPSERT items.
    Does nothing when the buffer is empty.
    """
    global upsert_buffer
    pending = upsert_buffer
    if len(pending) == 0:
        return

    start = 0
    while start < len(pending):
        index.upsert(vectors=pending[start:start + BATCH_UPSERT])
        start += BATCH_UPSERT

    # Only reached when every slice was sent without raising.
    upsert_buffer = []

# Full ingest: for every talk, build the embed text, split it into chunks and
# push each chunk through the embed buffer and then the upsert buffer.
total_chunks = 0

for _, row in tqdm(df.iterrows(), total=len(df), desc="Chunking+Embedding+Upserting"):
    rowd = row.to_dict()
    talk_id = safe_str(rowd.get("talk_id"))

    text = build_embed_text(rowd)
    chunks = splitter.split_text(text)

    for chunk_index, ch in enumerate(chunks):
        # Vector id is "<talk_id>:<chunk_index>", so it is the same on every run.
        _id = f"{talk_id}:{chunk_index}"
        md = build_metadata(rowd, chunk_index=chunk_index)

        # Append to all three parallel buffers together so positions stay aligned.
        embed_text_buffer.append(ch)
        embed_meta_buffer.append(md)
        embed_id_buffer.append(_id)
        total_chunks += 1

        # When we have enough texts, embed them
        if len(embed_text_buffer) >= BATCH_EMBED:
            flush_embed_and_build_vectors()

        # When we have enough vectors, upsert them
        if len(upsert_buffer) >= BATCH_UPSERT:
            flush_upsert()

# flush remaining buffers (the last partial embed batch first, then whatever is queued for upsert)
flush_embed_and_build_vectors()
flush_upsert()

print("DONE. Total chunks processed:", total_chunks)

Chunking+Embedding+Upserting: 100%|██████████████████████████████████████████████| 4005/4005 [1:01:48<00:00,  1.08it/s]


DONE. Total chunks processed: 51869


In [14]:
# # 3) upsert to Pinecone

# records = []
# for i, (ch, vec) in enumerate(zip(sample_chunks, sample_vectors)):
#     md = build_metadata(row0, chunk_index=i)
#     md["chunk"] = ch  # <-- required for your context output

#     records.append({
#         "id": f"{talk_id0}:{i}",
#         "values": vec,
#         "metadata": md
#     })

# index.upsert(vectors=records)
# print("upserted:", len(records))

upserted: 5


In [15]:
# # 4) query test
# qvec = embeddings.embed_query("climate change solutions individuals can do")
# res = index.query(vector=qvec, top_k=3, include_metadata=True)

In [16]:
# print("matches:")
# for m in res["matches"]:
#     md = m.get("metadata", {})
#     print("score:", m["score"], "talk_id:", md.get("talk_id"), "title:", md.get("title"), "chunk_index:", md.get("chunk_index"))

matches:
score: 0.477155685 talk_id: 1 title: Averting the climate crisis chunk_index: 0
score: 0.117155075 talk_id: 1 title: Averting the climate crisis chunk_index: 3
score: 0.0764770508 talk_id: 1 title: Averting the climate crisis chunk_index: 1


In [17]:
# def format_context_from_matches(matches, max_chars_per_chunk=1200):
#     context_items = []
#     context_text_parts = []

#     for m in matches:
#         md = m.get("metadata", {})
#         chunk_text = md.get("chunk")  # if you stored it, else leave empty
#         # If you did not store chunk text in metadata, see note below.

#         context_items.append({
#             "talk_id": md.get("talk_id"),
#             "title": md.get("title"),
#             "chunk": (chunk_text or "")[:max_chars_per_chunk],
#             "score": float(m.get("score", 0.0)),
#         })

#         context_text_parts.append(
#             f"Talk ID: {md.get('talk_id')}\n"
#             f"Title: {md.get('title')}\n"
#             f"Chunk:\n{(chunk_text or '')[:max_chars_per_chunk]}\n"
#         )

#     return "\n---\n".join(context_text_parts).strip(), context_items

# def answer_with_rag(question: str, matches: list, top_k: int):
#     ctx_text, ctx_items = format_context_from_matches(matches)

#     user_prompt = (
#         f"Question:\n{question}\n\n"
#         f"TED dataset context (top {top_k} retrieved chunks):\n{ctx_text}\n\n"
#         "Answer using only the context above."
#     )

#     resp = client.chat.completions.create(
#         model="RPRTHPB-gpt-5-mini",
#         messages=[
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {"role": "user", "content": user_prompt},
#         ],
#         temperature=1,
#     )

#     final_answer = resp.choices[0].message.content

#     return final_answer, ctx_items, {"System": SYSTEM_PROMPT, "User": user_prompt}

In [17]:
# Chat-completions client; uses the same key and base URL as the embeddings client.
client = OpenAI(api_key=llmod_api_key, base_url=llmod_base_url)

# System message sent with every question. It restricts the model to the
# retrieved context and fixes the exact refusal sentence to use when the
# context is insufficient.
SYSTEM_PROMPT = (
    "You are a TED Talk assistant that answers questions strictly and "
    "only based on the TED dataset context provided to you (metadata "
    "and transcript passages). You must not use any external "
    "knowledge, the open internet, or information that is not explicitly "
    "contained in the retrieved context. If the answer cannot be "
    "determined from the provided context, respond: “I don’t know "
    "based on the provided TED data.” Always explain your answer "
    "using the given context, quoting or paraphrasing the relevant "
    "transcript or metadata when helpful."
)

def format_context_from_matches(matches, max_chars_per_chunk=1200):
    """
    Convert retrieval matches into prompt text plus a structured context list.

    Args:
        matches: Iterable of match objects exposing .get(); "metadata" and
            "score" are read from each. None is treated as no matches.
        max_chars_per_chunk: Chunk text is truncated to this many characters.

    Returns:
        (context_text, context_items) where context_text joins one
        "Talk ID / Title / Chunk" section per match with "\\n---\\n" and is
        stripped, and context_items is a list of dicts with keys talk_id,
        title, chunk and score (float).

    A match whose metadata is missing or None yields None for talk_id/title
    and an empty chunk; a missing or None score becomes 0.0.
    """
    context_items = []
    context_text_parts = []

    for m in matches or []:
        # `or {}` also covers an explicit None, which .get(key, {}) would pass through.
        md = m.get("metadata") or {}
        talk_id = md.get("talk_id")
        title = md.get("title")
        chunk_text = (md.get("chunk") or "")[:max_chars_per_chunk]
        score = float(m.get("score") or 0.0)

        context_items.append({
            "talk_id": talk_id,
            "title": title,
            "chunk": chunk_text,
            "score": score,
        })

        context_text_parts.append(
            f"Talk ID: {talk_id}\n"
            f"Title: {title}\n"
            f"Chunk:\n{chunk_text}\n"
        )

    return "\n---\n".join(context_text_parts).strip(), context_items

def answer_with_rag(question: str, top_k: int = TOP_K):
    """
    Answer a question from retrieved TED chunks (retrieve, build prompt, generate).

    Args:
        question: The user's question; it is embedded and used as the query vector.
        top_k: Number of matches requested from the index.

    Returns:
        (answer, context_items, augmented_prompt): the model's message content,
        the structured context list from format_context_from_matches, and a
        dict holding the "System" and "User" prompt texts that were sent.
    """
    # Retrieval
    query_vector = embeddings.embed_query(question)
    query_result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    context_text, context_items = format_context_from_matches(query_result.get("matches", []))

    # Prompt assembly
    user_prompt = (
        f"Question:\n{question}\n\n"
        f"TED dataset context (top {top_k} retrieved chunks):\n{context_text}\n\n"
        "Answer using only the context above."
    )
    chat_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    # Generation
    completion = client.chat.completions.create(
        model="RPRTHPB-gpt-5-mini",
        messages=chat_messages,
        temperature=1,
    )

    augmented_prompt = {"System": SYSTEM_PROMPT, "User": user_prompt}
    return completion.choices[0].message.content, context_items, augmented_prompt



In [20]:
# Minimal tests aligned to assignment question types
# Only the uncommented question runs; each run performs one embedding call,
# one index query and one chat completion.
tests = [
    # "Find a TED talk about data visualization. Provide the title and speaker.",
    # "Which TED talks focus on culture or global issues? Return a list of exactly 3 talk titles."
    "Find a TED talk about simplicity and why it matters. Provide the title and a short summary of the key idea.",
    # "I’m looking for a TED talk about climate change and what individuals can do in daily life. Which talk would you recommend and why?",
]

for q in tests:
    # `aug` (the prompts that were sent) is returned but not printed here.
    ans, ctx, aug = answer_with_rag(q, top_k=TOP_K)
    print("\n==============================")
    print("Q:", q)
    print("A:", ans)
    # First context item only, to spot-check what retrieval returned.
    print("Top context item:", ctx[0] if ctx else None)


Q: Find a TED talk about simplicity and why it matters. Provide the title and a short summary of the key idea.
A: Title: Designing for simplicity (John Maeda, TED2007)

Short summary of the key idea (from the provided TED data):
John Maeda argues that simplicity is achieved by paring things down to basics and that it matters because it improves life — “simplicity is about living life with more enjoyment and less pain.” The talk comes from Maeda’s perspective at the intersection of technology and art, where learning to reduce complexity can make experiences clearer and more meaningful (description: “Here he talks about paring down to basics”; transcript: “simplicity is about living life with more enjoyment and less pain.”).
Top context item: {'talk_id': '1006', 'title': 'Simplifying complexity', 'chunk': "Title: Simplifying complexity\nSpeaker: Eric Berlow\nEvent: TEDGlobal 2010\nTopics: TED Fellows, complexity, design, science, simplicity\nDescription: Ecologist Eric Berlow doesn't fe

In [19]:
# question = "Recommend a TED talk about climate change and what individuals can do."
# qvec = embeddings.embed_query(question)
# res = index.query(vector=qvec, top_k=TOP_K, include_metadata=True)

# final_answer, ctx_items, aug_prompt = answer_with_rag(question, res["matches"], TOP_K)

# print(final_answer)
# print(ctx_items[:1])

I recommend "Averting the climate crisis" by Al Gore (TED2006). The talk is explicitly about climate change and "spells out 15 ways that individuals can address climate change immediately" — for example "from buying a hybrid to inventing a new, hotter brand name for global warming," according to the talk description in the TED data. The metadata also lists relevant topics (alternative energy, cars, climate change, environment, sustainability, etc.), so it directly addresses what individuals can do.
[{'talk_id': '1', 'title': 'Averting the climate crisis', 'chunk': 'Title: Averting the climate crisis\nSpeaker: Al Gore\nEvent: TED2006\nTopics: alternative energy, cars, climate change, culture, environment, global issues, science, sustainability, technology\nDescription: With the same humor and humanity he exuded in "An Inconvenient Truth," Al Gore spells out 15 ways that individuals can address climate change immediately, from buying a hybrid to inventing a new, hotter brand name for glo

In [None]:
# def format_context_from_matches(matches, max_chars_per_chunk=1200):
#     context_items = []
#     context_text_parts = []

#     for m in matches:
#         md = m.get("metadata", {})
#         chunk_text = md.get("chunk")  # if you stored it, else leave empty
#         # If you did not store chunk text in metadata, see note below.

#         context_items.append({
#             "talk_id": md.get("talk_id"),
#             "title": md.get("title"),
#             "chunk": (chunk_text or "")[:max_chars_per_chunk],
#             "score": float(m.get("score", 0.0)),
#         })

#         context_text_parts.append(
#             f"Talk ID: {md.get('talk_id')}\n"
#             f"Title: {md.get('title')}\n"
#             f"Chunk:\n{(chunk_text or '')[:max_chars_per_chunk]}\n"
#         )

#     return "\n---\n".join(context_text_parts).strip(), context_items

# def answer_with_rag(question: str, matches: list, top_k: int):
#     ctx_text, ctx_items = format_context_from_matches(matches)

#     user_prompt = (
#         f"Question:\n{question}\n\n"
#         f"TED dataset context (top {top_k} retrieved chunks):\n{ctx_text}\n\n"
#         "Answer using only the context above."
#     )

#     resp = client.chat.completions.create(
#         model="RPRTHPB-gpt-5-mini",
#         messages=[
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {"role": "user", "content": user_prompt},
#         ],
#         temperature=1,
#     )

#     final_answer = resp.choices[0].message.content

#     return final_answer, ctx_items, {"System": SYSTEM_PROMPT, "User": user_prompt}

# Validation with a bigger sample (but still small)

In [32]:
# N_TALKS = 3
# N_CHUNKS = 10

# for t in range(50,53):
#     row = df.iloc[t].to_dict()
#     talk_id = safe_str(row.get("talk_id"))

#     text = build_embed_text(row)
#     chunks = splitter.split_text(text)[:N_CHUNKS]
#     vecs = embeddings.embed_documents(chunks)

#     records = []
#     for i, (ch, vec) in enumerate(zip(chunks, vecs)):
#         md = build_metadata(row, chunk_index=i)
#         md["chunk"] = ch
#         records.append({"id": f"{talk_id}:{i}", "values": vec, "metadata": md})

#     index.upsert(vectors=records)
# print("done sample upload")

done sample upload


In [23]:
# def debug_retrieval(question: str, k: int = 10):
#     qvec = embeddings.embed_query(question)
#     res = index.query(vector=qvec, top_k=k, include_metadata=True)

#     print("\nQUESTION:", question)
#     print("TOP MATCHES:")
#     for m in res["matches"]:
#         md = m.get("metadata", {})
#         print(
#             "score:", round(float(m.get("score", 0.0)), 3),
#             "| talk_id:", md.get("talk_id"),
#             "| title:", md.get("title"),
#             "| chunk_index:", md.get("chunk_index")
#         )
#     return res

# def top_unique_titles(matches, n=3):
#     seen = set()
#     titles = []
#     for m in matches:
#         md = m.get("metadata", {})
#         tid = md.get("talk_id")
#         title = md.get("title")
#         if tid and tid not in seen:
#             seen.add(tid)
#             titles.append(title)
#         if len(titles) == n:
#             break
#     return titles
    

In [25]:
# res = debug_retrieval("Find a TED talk about photography. Provide the title and speaker.", k=2)

# res = debug_retrieval(
#     "Find a TED talk about statistics and data visualization. Provide the title and speaker.",
#     k=3
# )


QUESTION: Find a TED talk about computers. Provide the title and speaker.
TOP MATCHES:
score: 0.498 | talk_id: 7 | title: Simplicity sells | chunk_index: 0
score: 0.415 | talk_id: 7 | title: Simplicity sells | chunk_index: 2


In [48]:
# query1 = ''
# res = debug_retrieval(
#     "Find a TED talk that discusses preconceived ideas. Provide the title and speaker.",
#     k=8
# )

# answer, ctx_items, _ = answer_with_rag(
#     "Find a TED talk that discusses preconceived ideas. Provide the title and speaker.",
#     res["matches"],
#     TOP_K
# )

# print(answer)


QUESTION: Find a TED talk that discusses preconceived ideas. Provide the title and speaker.
TOP MATCHES:
score: 0.386 | talk_id: 1 | title: Averting the climate crisis | chunk_index: 0
score: 0.37 | talk_id: 1 | title: Averting the climate crisis | chunk_index: 6
score: 0.36 | talk_id: 92 | title: The best stats you've ever seen | chunk_index: 0
score: 0.352 | talk_id: 7 | title: Simplicity sells | chunk_index: 0
score: 0.343 | talk_id: 62 | title: Global priorities bigger than climate change | chunk_index: 0
score: 0.325 | talk_id: 69 | title: Dreams from endangered cultures | chunk_index: 0
score: 0.322 | talk_id: 7 | title: Simplicity sells | chunk_index: 2
score: 0.321 | talk_id: 34 | title: Photos of endangered cultures | chunk_index: 0
Title: "The best stats you've ever seen"
Speaker: Hans Rosling

Explanation: The talk's description says Hans Rosling "debunks myths about the so-called 'developing world,'" which addresses and overturns preconceived ideas (from the provided TED d

In [2]:
# def rag_test(question: str, k: int = TOP_K, show_matches: int = 5):
#     # retrieval
#     qvec = embeddings.embed_query(question)
#     res = index.query(vector=qvec, top_k=k, include_metadata=True)
#     matches = res.get("matches", [])

#     # generation
#     answer, ctx_items, aug = answer_with_rag(question, matches, k)

#     # output
#     print("\n==============================")
#     print("QUESTION:", question)
#     print("\nANSWER:\n", answer)

#     # minimal behind-the-scenes validation
#     print("\nTOP MATCHES (debug):")
#     for m in matches[:show_matches]:
#         md = m.get("metadata", {})
#         print(
#             "score:", round(float(m.get("score", 0.0)), 3),
#             "| talk_id:", md.get("talk_id"),
#             "| title:", md.get("title"),
#             "| chunk_index:", md.get("chunk_index"),
#         )

#     return {"answer": answer, "context": ctx_items, "augmented_prompt": aug, "matches": matches}


In [1]:
# q1 = "Find a TED talk about data visualization. Provide the title and speaker."
# out1 = rag_test(q1, k=8)
 
# q2 = "Which TED talks focus on culture or global issues? Return a list of exactly 3 talk titles."
# out2 = rag_test(q2, k=8)

# q3 = "Find a TED talk about simplicity and why it matters. Provide the title and a short summary of the key idea."
# out3 = rag_test(q3, k=8)

# q4 = "I’m looking for a TED talk about climate change and what individuals can do in daily life. Which talk would you recommend and why?"
# out4 = rag_test(q4, k=8)

In [7]:
import requests

# End-to-end check against the deployed /api/prompt endpoint.
# The URL must not carry surrounding whitespace; the previous literal ended
# with a space, which becomes part of the requested path.
url = 'https://ted-rag-vercel.vercel.app/api/prompt'
payload = {
    "question": "I’m looking for a TED talk about climate change and what individuals can do in their daily lives. give me 3 ted talks that most relevant that talks about this subject. include the titles and author of each of them."
}

# Bound the wait so an unresponsive deployment cannot hang the kernel indefinitely.
res = requests.post(url, json=payload, timeout=120)

print("Status code:", res.status_code)
print("Response JSON:")
print(res.json())


Status code: 200
Response JSON:


In [9]:
# Show only the generated answer text from the last API response.
res.json()['response']

'- The most important thing you can do to fight climate change: talk about it\n- Averting the climate crisis\n- How to transform apocalypse fatigue into action on global warming'

In [10]:
import requests

# End-to-end check against the deployed /api/prompt endpoint.
url = "https://ted-rag-vercel.vercel.app/api/prompt"
payload = {
    "question": "I’m looking for a TED talk about climate change and what individuals can do in their daily lives. give me 3 ted talks that most relevant that talks about this subject. include the titles and author of each of them."
}

# Bound the wait so an unresponsive deployment cannot hang the kernel indefinitely.
res = requests.post(url, json=payload, timeout=120)

print("Status code:", res.status_code)
print("Response JSON:")
print(res.json())


Status code: 200
Response JSON:
{'response': 'Here are three TED Talks from the provided TED data that most directly address climate change and actions individuals can take in daily life:\n\n1) "Averting the climate crisis" — Al Gore  \n- Why it’s relevant (from the provided description/transcript): Gore “spells out 15 ways that individuals can address climate change immediately, from buying a hybrid to inventing a new…brand name for global warming.” This talk explicitly focuses on what individuals can do.\n\n2) "100 solutions to reverse global warming" — Chad Frischmann  \n- Why it’s relevant (from the provided description): Frischmann “shares solutions to climate change that exist today — conventional tactics like the use of renewable energy and better land management as well as … changes to food production, better family planning and the education of girls,” i.e., practical solutions people and communities can adopt.\n\n3) "The most important thing you can do to fight climate change

In [12]:
# print() renders the "\n" escapes in the answer as real line breaks.
print(res.json()['response'])

Here are three TED Talks from the provided TED data that most directly address climate change and actions individuals can take in daily life:

1) "Averting the climate crisis" — Al Gore  
- Why it’s relevant (from the provided description/transcript): Gore “spells out 15 ways that individuals can address climate change immediately, from buying a hybrid to inventing a new…brand name for global warming.” This talk explicitly focuses on what individuals can do.

2) "100 solutions to reverse global warming" — Chad Frischmann  
- Why it’s relevant (from the provided description): Frischmann “shares solutions to climate change that exist today — conventional tactics like the use of renewable energy and better land management as well as … changes to food production, better family planning and the education of girls,” i.e., practical solutions people and communities can adopt.

3) "The most important thing you can do to fight climate change: talk about it" — Katharine Hayhoe  
- Why it’s relev