In [2]:
%pip install -qU langchain-pinecone pinecone-notebooks
%pip install --upgrade --quiet langchain-text-splitters tiktoken
%pip install langchain-openai
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Connect to OpenAI

### Retrieve Transcripts Dataframe

### Setup and load environments

In [7]:
# 1. Setup
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore  # ✅ NEW correct import
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
from getpass import getpass

# Load environment variables
_ = load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")


### Chunk data

In [41]:
import math
from langchain.schema import Document

# 1. Load the entire dataframe (no head())
df = pd.read_pickle("/Users/test/Desktop/ironhack_labs/YouTube_ChatBot_Final/datasets/dataframe.pkl")
print(f"✅ Loaded dataframe with {len(df)} rows")
print(df.head(2))  # preview first 2 rows

# 2. Prepare documents and sources
texts = df["text"].tolist()
sources = df["source_file"].tolist()
print(f"✅ Prepared {len(texts)} texts and sources")

# 3. Split texts into chunks with metadata
splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=200)
documents = [Document(page_content=t, metadata={"source_file": s}) for t, s in zip(texts, sources)]
print(f"✅ Created {len(documents)} Document objects")

split_docs = splitter.split_documents(documents)
print(f"✅ After splitting, got {len(split_docs)} document chunks")
print(f"Sample chunk content:\n{split_docs[0].page_content}")
print(f"Sample chunk metadata:\n{split_docs[0].metadata}")



✅ Loaded dataframe with 77214 rows
                                       text  \
0  so planets become more interesting moons   
1       become places to go and revisit but   

                                         source_file  
0  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
1  40 - Neil deGrasse Tyson and Bill Nye Catch Up...  
✅ Prepared 77214 texts and sources
✅ Created 77214 Document objects
✅ After splitting, got 77214 document chunks
Sample chunk content:
so planets become more interesting moons
Sample chunk metadata:
{'source_file': '40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt'}


### Create Pinecone index, embeddings and vectors

In [43]:
# 4. Initialize Pinecone client and index
index_name = "youtube-transcripts"
dimension = 1536

pc = Pinecone(api_key=PINECONE_API_KEY)

if index_name not in [index["name"] for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"✅ Created index: {index_name}")
else:
    print(f"✅ Index '{index_name}' already exists.")

index = pc.Index(index_name)
print(f"✅ Connected to Pinecone index: {index_name}")

# 5. Initialize OpenAI embeddings
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
print("✅ Initialized OpenAI embeddings")

# 6. Create PineconeVectorStore instance
vectordb = PineconeVectorStore(index, embedding)

# 7. Upload in smaller batches manually to avoid request size limit errors
from tqdm import tqdm
import math
batch_size = 30  # Adjust batch size if needed to avoid errors
num_batches = math.ceil(len(split_docs) / batch_size)

print(f"Total document chunks: {len(split_docs)}")
print(f"Uploading in batches of {batch_size}, total batches: {num_batches}")

for i in tqdm(range(num_batches), desc="Uploading batches"):
    batch_docs = split_docs[i * batch_size : (i + 1) * batch_size]
    vectordb.add_documents(batch_docs)

print(f"✅ Successfully stored all {len(split_docs)} document chunks in Pinecone")


✅ Index 'youtube-transcripts' already exists.
✅ Connected to Pinecone index: youtube-transcripts
✅ Initialized OpenAI embeddings
Total document chunks: 77214
Uploading in batches of 30, total batches: 2574


Uploading batches: 100%|██████████| 2574/2574 [2:06:04<00:00,  2.94s/it]  

✅ Successfully stored all 77214 document chunks in Pinecone





### Set up retriever

In [44]:
# 4. Set up retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 5})
print("✅ Retriever initialized")

# 5. Memory & Tools
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=OPENAI_API_KEY)
print("✅ Initialized LLM")

# Define RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)
print("✅ RetrievalQA chain created")


✅ Retriever initialized
✅ Initialized LLM
✅ RetrievalQA chain created


### Define function for answering questions with sources

In [45]:

def answer_with_sources(input_text: str):
    print(f"\n📝 Query: {input_text}")
    result = qa_chain({"query": input_text})
    answer = result["result"]
    sources = list(set(doc.metadata.get("source_file", "Unknown") for doc in result["source_documents"]))
    print(f"🗒️ Retrieved {len(sources)} source documents")
    return f"{answer}\n\nSources:\n" + "\n".join(sources)

### Setup tools and agent with Langsmith tracing

In [46]:
from langsmith import traceable
import os
# !pip install -U langsmith openai

# Set LangSmith env vars
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "LANGCHAIN_API_KEY"
os.environ["LANGCHAIN_PROJECT"] = "youtube-rag"

from langchain.agents import initialize_agent

# Define tool
tools = [
    Tool(
        name="YouTubeTranscriptQA",
        func=answer_with_sources,
        description="Useful for answering questions about YouTube video transcripts. Input should be a fully formed question."
    )
]
print("✅ Tools defined")

# Conversation memory
memory = ConversationBufferMemory(memory_key="chat_history")
print("✅ Conversation memory initialized")

# 6. Initialize Agent with tools and memory
agent = initialize_agent(
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    memory=memory,
    max_iterations=3
)


print("✅ Agent initialized and ready")


@traceable(name="YouTube RAG Trace")
def run_agent():
    agent.run("Who was Albert Einstein?")

result = run_agent()

✅ Tools defined
✅ Conversation memory initialized
✅ Agent initialized and ready


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: Albert Einstein was a renowned physicist who is best known for his theory of relativity, one of the two pillars of modern physics (the other being quantum mechanics). He was born in Germany in 1879 and later became a Swiss citizen. His work revolutionized our understanding of the basic laws of the universe.

Einstein's most famous equation, E=mc^2, demonstrates the equivalence of energy (E) and mass (m), with "c" being the speed of light in a vacuum. This equation has profound implications for the understanding of energy and matter.

In 1921, Einstein was awarded the Nobel Prize in Physics for his explanation of the photoelectric effect, which demonstrated the particle-like properties of light. This work was a foundational part of the development of quantum mechanics.

Einstein was also known for his views 

### Prompt agent

In [None]:
# def ask_agent_freeform(agent, question: str):
#     print(f"User question:\n{question}\n")
#     response = agent.run(question)
#     return response

# Prompt templates for multi-query RAG system

personality_intro = "Answer as if you are Neil deGrasse Tyson, the astrophysicist known for being charismatic, cheeky, sometimes sarcastic, insightful, and eloquent."


prompt_templates = {
    "summary": "Please provide a concise summary of the following topic: '{}'",
    "source": "Please provide the video source for the following topic: '{}'",
    "explanation": "Explain in detail: '{}'",
    "compare": "Compare and contrast these two concepts: '{}' and '{}'",
    "timeline": "Give me a timeline of events related to: '{}'",
    "faq": "What are the most frequently asked questions about '{}', and their answers?",
    "key_points": "List the key points covered in: '{}'",
    "step_by_step": "Provide a step-by-step guide on how to: '{}'",
    "examples": "Give me examples related to '{}'",
    "pros_cons": "What are the pros and cons of '{}'",
    "common_mistakes": "What are the common mistakes people make regarding '{}', and how to avoid them?",
    
}

def ask_agent(agent, prompt_type, *args):
    if prompt_type not in prompt_templates:
        raise ValueError(f"Prompt type '{prompt_type}' not supported.")
    prompt = prompt_templates[prompt_type].format(*args)
    print(f"Prompt to model:\n{prompt}\n")
    response = agent.run(prompt)
    return response



In [48]:
# user_question = input("Who is Bill Nye?")
# result = ask_agent_freeform(agent, user_question)
# print(result)

result = ask_agent(agent, "compare", "string theory", "pinpoint theory")
print(result)

Prompt to model:
Compare and contrast these two concepts: 'string theory' and 'pinpoint theory'



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: It seems there might be a bit of confusion in your question. While 'string theory' is a well-known theoretical framework in physics, 'pinpoint theory' is not a recognized term in the field. 

String theory is a theoretical framework in which the point-like particles of particle physics are replaced by one-dimensional objects called strings. It describes how these strings propagate through space and interact with each other. The theory has the potential to provide a unified description of gravity and particle physics, as it inherently includes quantum gravity.

On the other hand, 'pinpoint theory' doesn't appear to be a recognized term in physics. If you're referring to a specific concept with this term, could you please provide more context or details?[0m

[1m> Finished chain.[0m
It seem

In [None]:
result = ask_agent(agent, "explanation", )
print(result)

In [49]:
result = ask_agent(agent, "source", "string theory")
print(result)

Prompt to model:
Please provide the video source for the following topic: 'string theory'



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: YouTubeTranscriptQA
Action Input: 'string theory'[0m
📝 Query: 'string theory'
🗒️ Retrieved 1 source documents

Observation: [36;1m[1;3mString theory is a theoretical framework in which the point-like particles of particle physics are replaced by one-dimensional objects called strings. It describes how these strings propagate through space and interact with each other. The idea behind string theory is to reconcile quantum physics and general relativity, and it has been proposed as a theory of everything. It's a complex and highly mathematical theory that has yet to be fully proven or disproven.

Sources:
01 - Is The Universe Made of Tiny Vibrating Strings？ With Lara Anderson.en.txt[0m
Thought:[32;1m[1;3mDo I need to use a tool? No
AI: Based on the information from the video transcript, 

In [50]:
#evaluate with BLEU
# !pip install nltk
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')  # download punkt tokenizer

def compute_bleu(reference_text, candidate_text):
    """
    Compute BLEU score between a reference answer and candidate answer.
    """
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    candidate_tokens = nltk.word_tokenize(candidate_text.lower())
    smoothing = SmoothingFunction().method1
    score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
    return score

# Update your answer function to accept a reference answer
def answer_with_bleu(input_text: str, reference_answer: str = None):
    print(f"\n📝 Query: {input_text}")
    result = qa_chain({"query": input_text})
    answer = result["result"]
    sources = list(set(doc.metadata.get("source_file", "Unknown") for doc in result["source_documents"]))
    print(f"🗒️ Retrieved {len(sources)} source documents")
    
    output = f"{answer}\n\nSources:\n" + "\n".join(sources)
    
    if reference_answer:
        bleu_score = compute_bleu(reference_answer, answer)
        print(f"🔵 BLEU score: {bleu_score:.4f}")
        output += f"\n\nBLEU score compared to reference: {bleu_score:.4f}"
        
    return output


[nltk_data] Downloading package punkt to /Users/test/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [51]:
query = "What would happen if we find evidence of life on another planet?"

reference = "it is\
very reasonable that maybe in my\
lifetime but in your kids is's lifetime\
somebody's going to find evidence of\
Life on another world and because if we\
found such a signal it would dare I say\
it change change the world the day we\
discover Life Will signal a change in\
the human condition that we cannot\
foresee."

response = answer_with_bleu(query, reference)
print(response)


📝 Query: What would happen if we find evidence of life on another planet?
🗒️ Retrieved 4 source documents
🔵 BLEU score: 0.0080
The text doesn't provide a specific answer to what would happen if we find evidence of life on another planet.

Sources:
49 - Answering Fan Questions About Photons, Fire & Gravity Waves.en.txt
30 - Discussing the Frontier of Particle Physics with Brian Cox.en.txt
57 - Scientists Discuss NASA’s Strategy For Finding Alien Life in the Universe.en.txt
36 - Talking Aliens with NASA UAP Chair, David Spergel.en.txt

BLEU score compared to reference: 0.0080


In [None]:
#voice input
# !pip install openai-whisper pyaudio pyttsx3
# !pip install sounddevice wavio

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pyaudio
  Downloading PyAudio-0.2.14.tar.gz (47 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Collecting more-itertools (from openai-whisper)
  Downloading more_itertools-10.7.0-py3-none-any.whl.metadata (37 kB)
Collecting pyobjc>=2.4 (from pyttsx3)
  Downloading pyobjc-11.1-py3-none-any.whl.metadata (25 kB)
Collecting pyobjc-core==11.1 (from pyobjc>=2.4->pyttsx3)
  Downloading py

### Creates user audio recording functionality

In [27]:
import whisper
import pyttsx3
import sounddevice as sd
import numpy as np
import wavio

# Initialize Whisper model
model = whisper.load_model("base")

# Initialize TTS engine
tts_engine = pyttsx3.init()

def record_audio(duration=5, fs=16000):
    print("🎙️ Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    recording = np.squeeze(recording)
    wavio.write("user_input.wav", recording, fs, sampwidth=2)
    print("🎙️ Recording finished and saved as user_input.wav")

def transcribe_audio(filepath="user_input.wav"):
    print("📝 Transcribing audio with Whisper...")
    result = model.transcribe(filepath)
    text = result["text"]
    print(f"🗣️ Transcription: {text}")
    return text

import threading

def speak_text(text):
    def run():
        tts_engine.say(text)
        tts_engine.runAndWait()
    thread = threading.Thread(target=run)
    thread.start()


def voice_rag_interaction():
    record_audio(duration=5)
    user_question = transcribe_audio()
    answer = answer_with_sources(user_question)  # your RAG QA function
    print(f"Answer:\n{answer}")
    speak_text(answer)

while True:
    voice_rag_interaction()
    cont = input("Ask another question? (y/n): ")
    if cont.lower() != "y":
        break


🎙️ Recording...
🎙️ Recording finished and saved as user_input.wav
📝 Transcribing audio with Whisper...




🗣️ Transcription:  Who is Bill Nye?

📝 Query:  Who is Bill Nye?
🗒️ Retrieved 1 source documents
Answer:
Bill Nye is a popular American science communicator, television presenter, and mechanical engineer. He is best known for his television show "Bill Nye the Science Guy," which aimed to teach science to a preteen audience.

Sources:
40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
🎙️ Recording...
🎙️ Recording finished and saved as user_input.wav
📝 Transcribing audio with Whisper...




🗣️ Transcription:  What is the source of this information about Bill 9?

📝 Query:  What is the source of this information about Bill 9?


Exception in thread 

🗒️ Retrieved 1 source documents
Answer:
The provided context does not give any source of information about Bill 9.

Sources:
40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt


Thread-8 (run):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/hq/l56ghxv518j9wg6pgqkbbvd80000gn/T/ipykernel_85788/578627724.py", line 33, in run
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/pyttsx3/engine.py", line 180, in runAndWait
    raise RuntimeError('run loop already started')
RuntimeError: run loop already started


🎙️ Recording...
🎙️ Recording finished and saved as user_input.wav
📝 Transcribing audio with Whisper...




🗣️ Transcription:  What is the source of your answer about

📝 Query:  What is the source of your answer about


Exception in thread Thread-9 (run):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 1016, in _bootstrap_inner


🗒️ Retrieved 1 source documents
Answer:
I'm sorry, but your question is unclear. Could you please provide more details?

Sources:
40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt


    self.run()
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/hq/l56ghxv518j9wg6pgqkbbvd80000gn/T/ipykernel_85788/578627724.py", line 33, in run
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/pyttsx3/engine.py", line 180, in runAndWait
    raise RuntimeError('run loop already started')
RuntimeError: run loop already started


🎙️ Recording...
🎙️ Recording finished and saved as user_input.wav
📝 Transcribing audio with Whisper...




🗣️ Transcription:  What is the source to your access to

📝 Query:  What is the source to your access to


Exception in thread 

🗒️ Retrieved 1 source documents
Answer:
The text doesn't provide information about the source of any access.

Sources:
40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt


Thread-10 (run):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/hq/l56ghxv518j9wg6pgqkbbvd80000gn/T/ipykernel_85788/578627724.py", line 33, in run
  File "/opt/anaconda3/envs/cleanenv/lib/python3.10/site-packages/pyttsx3/engine.py", line 180, in runAndWait
    raise RuntimeError('run loop already started')
RuntimeError: run loop already started
