## Vectorizing and Embedding

In [None]:
# %pip install -qU langchain-pinecone pinecone-notebooks
# %pip install --upgrade --quiet langchain-text-splitters tiktoken
# %pip install langchain-openai
# %pip install datasets


In [20]:
import os
from getpass import getpass
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
print(OPENAI_API_KEY)

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


In [2]:
#retrieve dataframe
import pandas as pd
dataframe = pd.read_pickle("dataframe.pkl")

print(dataframe["text"])


0        so planets become more interesting moons
1             become places to go and revisit but
2           there was a whole other goal and that
3             was the search for intelligent life
4           still is in the universe oh man it is
                           ...                   
77209     All right. This has been Star Talk, the
77210                             Einstein Crumbs
77211       edition. Neil deGrasse Tyson here. As
77212       always, I bid you to keep looking up.
77213                                     [Music]
Name: text, Length: 77214, dtype: object


### Initialize embedding model

In [None]:
import os
import joblib
from langchain_openai import OpenAIEmbeddings
from tqdm.auto import tqdm

# ==== Remove corrupted file to force regeneration if needed ====
if os.path.exists("embeddings_data.joblib"):
    os.remove("embeddings_data.joblib")
    print("❌ Corrupted file deleted. You can now regenerate embeddings.")

# ==== Step 1: Define embedding model ====
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

# ==== Step 2: Extract texts and metadata from DataFrame ====
# Make sure your dataframe has both columns: "text" and "source_file"
texts = dataframe["text"].tolist()
metadatas = [{"source_file": source, "text": text} for text, source in zip(dataframe["text"], dataframe["source_file"])]

# ==== Step 3: Try loading saved embeddings ====
try:
    texts_loaded, embeddings, metadatas_loaded = joblib.load("embeddings_data.joblib")
    # Sanity check to avoid mismatch:
    if len(texts_loaded) != len(texts) or len(metadatas_loaded) != len(metadatas):
        raise ValueError("Saved embeddings data does not match current dataframe length.")
    print("✅ Loaded embeddings from file.")
except (FileNotFoundError, ValueError):
    print("⏳ Embedding texts...")

    embeddings = []
    batch_size = 100
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[i:i + batch_size]
        batch_embeddings = embed.embed_documents(batch)
        embeddings.extend(batch_embeddings)

    # ==== Step 4: Save all to file ====
    joblib.dump((texts, embeddings, metadatas), "embeddings_data.joblib")
    print("✅ Saved embeddings to embeddings_data.joblib")

# ==== Final check and print ====
missing_source_files = sum(1 for meta in metadatas if 'source_file' not in meta or meta['source_file'] is None)
if missing_source_files > 0:
    print(f"⚠️ Warning: {missing_source_files} metadata entries missing 'source_file'")
else:
    print("✅ All metadata entries have 'source_file'")

print(f"📏 Embedded {len(embeddings)} chunks, each of dimension {len(embeddings[0])}")



⏳ Embedding texts...


Embedding batches: 100%|██████████| 773/773 [10:49<00:00,  1.19it/s]


✅ Saved embeddings to embeddings_data.joblib
✅ All metadata entries have 'source_file'
📏 Embedded 77214 chunks, each of dimension 1536


### Check if embedding_data.joblib file saved correctly

In [5]:
import joblib

# Load from file
metadatas,texts, res = joblib.load("embeddings_data.joblib")

# Check lengths
print(f"Number of texts loaded: {len(texts)}")
print(f"Number of embeddings loaded: {len(res)}")
print(f"metadata {len(metadatas)}" )

# Inspect a sample text and its embedding vector shape
print("Sample text:", texts[0])
print("Sample embedding vector length:", len(res[0]))



Number of texts loaded: 77214
Number of embeddings loaded: 77214
metadata 77214
Sample text: [0.013711318373680115, -0.008615541271865368, 0.004324187058955431, -0.0385335311293602, -0.022760262712836266, 0.04042474925518036, -0.03362162411212921, -0.009508615359663963, -0.01168876700103283, -0.02836824394762516, 0.002201494062319398, 0.011399831622838974, 0.011550866067409515, -0.01120939664542675, -0.011662499979138374, 0.005706482566893101, 0.02715996652841568, -0.0015004962915554643, -0.004488355480134487, -0.0037266153376549482, 0.007991702295839787, -0.009672783315181732, 0.018596960231661797, -0.026148691773414612, 0.005509480834007263, 0.006941026542335749, 0.022405659779906273, -0.024717146530747414, -0.02432314306497574, -0.002322978340089321, 0.026174958795309067, -0.013514316640794277, -0.015563134104013443, -0.027448903769254684, 0.0040582348592579365, -0.0063631548546254635, 0.01120939664542675, -0.010007685981690884, -0.026621496304869652, -0.0011467140866443515, -0.0039

In [13]:
import joblib

# Load the saved data
texts, res, metadatas = joblib.load("embeddings_data.joblib")

print(f"Total chunks: {len(metadatas)}")

# Print first 10 source files in metadata
print("First 10 source_file entries:")
for i, meta in enumerate(metadatas[:10]):
    print(f"{i+1}: {meta.get('source_file')}")

# Count unique source files
unique_sources = set(meta.get('source_file') for meta in metadatas)
print(f"\nNumber of unique source files: {len(unique_sources)}")

# Optionally, print a few unique source files
print("Sample unique source files:")
for src in list(unique_sources)[:10]:
    print("-", src)


Total chunks: 77214
First 10 source_file entries:
1: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
2: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
3: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
4: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
5: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
6: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
7: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
8: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
9: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt
10: 40 - Neil deGrasse Tyson and Bill Nye Catch Up.en.txt

Number of unique source files: 59
Sample unique source files:
- 50 - Neil & Sara Imari Walker Discuss New Theories on The Origins of Life in the Universe.en.txt
- 41 - The Science of Interstellar with Science Advisor, Kip Thorne.en.txt
- 44 - Neil deGrasse Tyson and Terry Crews Answer Your Questions.en.txt
- 30 - Discussing the Frontier of Particle Physics with Brian Cox.en.t

### Initializing Pinecone vector database

In [None]:
#initializing Pinecone vector database
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass("Enter your OpenAI API key: ")
print(PINECONE_API_KEY)

pinecone_api_key = os.environ.get(PINECONE_API_KEY)

# configure client
pc = Pinecone(api_key=pinecone_api_key)

#define the cloud provider and region where we want to deploy our index
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

### Create Pinecone index

In [17]:
import time
#initialize the index.
index_name = "langchain-retrieval-augmentation"

# List existing indexes
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Delete if exists
if index_name in existing_indexes:
    pc.delete_index(name=index_name)
    print(f"Deleted existing index: {index_name}")
else:
    print(f"Index '{index_name}' not found. Skipping delete.")

# Recreate the index
pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)
print(f"Index '{index_name}' created.")

# Reconnect the index object
index = pc.Index(index_name)

Deleted existing index: langchain-retrieval-augmentation
Index 'langchain-retrieval-augmentation' created.


### Tokenize the data

In [18]:
#tokenize data
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(texts):
    tokens = tokenizer.encode(
        texts,
        disallowed_special=()
    )
    return len(tokens)

#test tiktoken len
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

### Chunk and vectorize the embeddings

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
from uuid import uuid4
import joblib  # for saving/loading data efficiently
import os  # optional, for file operations

# Maximum number of chunks to process before embedding and uploading in batch
batch_limit = 100

# List to accumulate text chunks before embedding
text_chunks = []

# List to accumulate metadata dicts corresponding to each text chunk
metadatas = []

# List to store all (id, vector, metadata) tuples for saving/uploading
saved_vectors = []

# Initialize the text splitter to chunk texts based on token length and separators
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,          # max tokens per chunk
    chunk_overlap=20,        # tokens overlapping between chunks for context
    length_function=tiktoken_len,  # function to count tokens
    separators=["\n\n", "\n", " ", ""]  # splitting priority order
)

# Loop through all texts and their corresponding source file names
for i, (text, source_file) in enumerate(tqdm(zip(texts, dataframe["source_file"]), total=len(texts))):
    # Metadata for the whole record, includes chunk index and source file name
    metadata = {"chunk": i, "source_file": source_file}

    # Split the current text into smaller chunks using the text splitter
    record_texts = text_splitter.split_text(text)

    # Create metadata for each chunk, including chunk index, chunk text, and source_file
    record_metadatas = [
        {"chunk": j, "text": text_chunk, "source_file": source_file}
        for j, text_chunk in enumerate(record_texts)
    ]

    # Add the chunk texts and their metadata to the batch accumulators
    text_chunks.extend(record_texts)
    metadatas.extend(record_metadatas)

    # Once batch_limit chunks accumulated, embed and upload in batch
    if len(text_chunks) >= batch_limit:
        # Generate unique IDs for each chunk in this batch
        ids = [str(uuid4()) for _ in range(len(text_chunks))]

        # Get embeddings for all chunks in this batch
        embeds = embed.embed_documents(text_chunks)

        # Combine ids, embeddings, and metadata into tuples for Pinecone upload
        batch = list(zip(ids, embeds, metadatas))

        # Add the batch tuples to the saved vectors list for local saving
        saved_vectors.extend(batch)

        # Upload batch to Pinecone index
        index.upsert(vectors=batch)

        # Reset chunk and metadata accumulators for next batch
        text_chunks = []
        metadatas = []

# After loop, if there are any remaining chunks not uploaded yet
if len(text_chunks) > 0:
    # Generate unique IDs for remaining chunks
    ids = [str(uuid4()) for _ in range(len(text_chunks))]

    # Embed the remaining chunks
    embeds = embed.embed_documents(text_chunks)

    # Prepare batch tuples for upload
    batch = list(zip(ids, embeds, metadatas))

    # Add to saved vectors list
    saved_vectors.extend(batch)

    # Upload remaining vectors to Pinecone index
    index.upsert(vectors=batch)

# Save all accumulated vectors and metadata to disk using joblib for faster I/O
joblib.dump(saved_vectors, "vectors.joblib")

# Print Pinecone index statistics to verify upload status
print(index.describe_index_stats())


100%|██████████| 77214/77214 [1:39:38<00:00, 12.91it/s]  


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 77214}},
 'total_vector_count': 77214,
 'vector_type': 'dense'}


In [33]:
print(record_metadatas)
print(text_chunks)

[{'chunk': 77213, 'text': '[Music]'}]
['filling in those gaps and taking us to', 'the next step.', "And Harrison, you're on the road with", 'your your routine. Yes, I have my comedy', "magic show. Uh, we've been off Broadway.", "I'm taking it on the road and I'm doing", 'a stand up all over the country. And', 'Harrison Harrison', "Greenbomb.com. We'll we'll look for it.", 'All right. This has been Star Talk, the', 'Einstein Crumbs', 'edition. Neil deGrasse Tyson here. As', 'always, I bid you to keep looking up.', '[Music]']


### Creating a vector store and querying

In [None]:
# Import the PineconeVectorStore from the langchain_pinecone module.
# This class is used to interact with a Pinecone index to store and search embeddings.
from langchain_pinecone import PineconeVectorStore

# Create an instance of the Pinecone vector store.
# - `index`: your initialized Pinecone index object (must be defined beforehand)
# - `embedding`: the embedding model used to embed queries and documents
# - `text_key`: the key in your metadata that stores the original text
vector_store = PineconeVectorStore(index=index, embedding=embed, text_key="text")

# Define a natural language search query
query = "Why was Einstein famous?"

# Search the vector store for the top 3 most similar documents to the query
# This uses vector similarity (like cosine similarity) to find close matches
results = vector_store.similarity_search(query, k=3)

# Print the top 3 search results
print(results)

# Store the `results` variable into Jupyter/IPython’s interactive namespace
# so it can be accessed later across notebook cells
%store results


[Document(id='1d859c41-f7db-4c48-86dd-0eccccab45ca', metadata={'chunk': 77077.0}, page_content='which Einstein talked about the Einstein'), Document(id='364059e3-4c7b-4265-b70b-66f1fdcab101', metadata={'chunk': 8298.0}, page_content="many people don't know that Einstein"), Document(id='d87313b0-222c-4c95-aa9d-583744669233', metadata={'chunk': 76361.0}, page_content='you know, at the time Einstein was first')]
Stored 'results' (list)


  db[ 'autorestore/' + arg ] = obj


In [21]:
# Import the ChatOpenAI class from the langchain_openai module.
# This class is used to interact with OpenAI's chat models (like GPT-3.5 or GPT-4).
from langchain_openai import ChatOpenAI

# Create an instance of the ChatOpenAI language model with specific settings:
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",  # Specify which OpenAI model to use
    temperature=0,               # Set temperature to 0 for deterministic (non-random) output
    openai_api_key=OPENAI_API_KEY
    )


### Create Retriever

In [17]:
from langchain_pinecone import PineconeVectorStore

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [18]:
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(), 
    llm=llm
)


In [None]:
from langchain.agents.agent_toolkits import create_retriever_tool

retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="youtube_qa_retriever",
    description="Speak in the voice of Niel Degrass Tyson the acclaimed Astrophysicist. Use this voice to answer questions about YouTube video transcripts."
)


In [20]:
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(
    memory_key="chat_history", 
    return_messages=True
)


  memory = ConversationBufferMemory(


In [21]:
from langchain.agents import initialize_agent

tools = [retriever_tool]

agent_executor = initialize_agent(
    tools=tools,
    llm=llm,
    agent="chat-conversational-react-description",
    memory=memory,
    verbose=True,
    return_intermediate_steps=True
)


  agent_executor = initialize_agent(


In [22]:
result = agent_executor.invoke({"input": "What does the video say about Einstein?"})

print("Final Answer:", result['output'])
print("Steps:")
for step in result['intermediate_steps']:
    print(step)




[1m> Entering new AgentExecutor chain...[0m


python(45702) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[32;1m[1;3m```json
{
    "action": "youtube_qa_retriever",
    "action_input": "Einstein"
}
```[0m
Observation: [36;1m[1;3mtheory of relativity

Einstein's general theory of relativity

Einstein's general theory relativity

physics with relativity quantum

paper. He just wrote E= MC.

that too. Well, E= MC², right? Implicit

you think of Einstein's famous equation

words electromagnetic force. So Einstein

a brilliant physicist won the Nobel

Nobel Prize in physics for their work in

brilliant Nobel winning prize astronomer

Prize he wrote a very successful physics[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The video mentions topics related to Einstein such as the theory of relativity, E=mc^2, his Nobel Prize in physics, and his work as a physicist."
}
```[0m

[1m> Finished chain.[0m
Final Answer: The video mentions topics related to Einstein such as the theory of relativity, E=mc^2, his Nobel Prize in physics, and his work as a physic



In [23]:
result = agent_executor.invoke({"input": "Name the source of the information"})

print("Final Answer:", result['output'])
print("Steps:")
for step in result['intermediate_steps']:
    print(step)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The source of the information is a video."
}
```[0m

[1m> Finished chain.[0m
Final Answer: The source of the information is a video.
Steps:




In [24]:
result = agent_executor.invoke({"input": "what is the video title?"})

print("Final Answer:", result['output'])
print("Steps:")
for step in result['intermediate_steps']:
    print(step)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "youtube_qa_retriever",
    "action_input": "What is the video title?"
}
```[0m
Observation: [36;1m[1;3mthe name of the movie the title of the

video version and you can get that in

video version and you can get that in

that you know the title of that book yes

term but there is a very popular video

Dimensions how does he know the title of[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The video mentions topics related to Einstein such as the theory of relativity, E=mc^2, his Nobel Prize in physics, and his work as a physicist."
}
```[0m

[1m> Finished chain.[0m
Final Answer: The video mentions topics related to Einstein such as the theory of relativity, E=mc^2, his Nobel Prize in physics, and his work as a physicist.
Steps:
(AgentAction(tool='youtube_qa_retriever', tool_input='What is the video title?', log='```json\n{\n    "action": "youtube_qa_retrieve



In [27]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)

query = "which video from the vectorset talks about einstein?"
result = qa_chain.invoke({"query": query})


print("Answer:", result["result"])
for doc in result["source_documents"]:
    print(f"Source: {doc.metadata}, Text: {doc.page_content}")
# Manually inspect what the retriever returns
# retrieved_docs = vector_store.as_retriever(search_kwargs={"k": 3}).get_relevant_documents(query)

# print(f"Retrieved {len(retrieved_docs)} documents.\n")
# for i, doc in enumerate(retrieved_docs):
#     print(f"--- Document {i+1} ---")
#     print(f"Metadata: {doc.metadata}")
#     print(f"Content (first 300 chars):\n{doc.page_content[:300]}")
#     print()

Answer: I don't have access to specific video content or the ability to browse the internet. I recommend checking the Vectorset website or contacting their customer support for information on which video discusses Einstein.
Source: {'chunk': 77077.0}, Text: which Einstein talked about the Einstein
Source: {'chunk': 25445.0}, Text: there's some serious einsteinian physics
Source: {'chunk': 12445.0}, Text: which is in Einstein's theory this idea


In [28]:
result = agent_executor.invoke({"input": "Is the universe made of tiny strings?"})

print("Final Answer:", result['output'])
print("Steps:")
for step in result['intermediate_steps']:
    print(step)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The theory of string theory suggests that the fundamental building blocks of the universe are tiny, vibrating strings rather than point-like particles. This theory aims to reconcile quantum mechanics and general relativity, but it is still a topic of ongoing research and debate in theoretical physics."
}
```[0m

[1m> Finished chain.[0m
Final Answer: The theory of string theory suggests that the fundamental building blocks of the universe are tiny, vibrating strings rather than point-like particles. This theory aims to reconcile quantum mechanics and general relativity, but it is still a topic of ongoing research and debate in theoretical physics.
Steps:




In [29]:
result = agent_executor.invoke({"input": "How does it aim to reconcile quntum mechanics and general relativity?"})

print("Final Answer:", result['output'])
print("Steps:")
for step in result['intermediate_steps']:
    print(step)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "String theory aims to reconcile quantum mechanics and general relativity by proposing that the fundamental building blocks of the universe are tiny, vibrating strings rather than point-like particles. This theory attempts to provide a framework that can unify the principles of quantum mechanics, which govern the behavior of particles at the smallest scales, and general relativity, which describes the force of gravity on a cosmic scale."
}
```[0m

[1m> Finished chain.[0m
Final Answer: String theory aims to reconcile quantum mechanics and general relativity by proposing that the fundamental building blocks of the universe are tiny, vibrating strings rather than point-like particles. This theory attempts to provide a framework that can unify the principles of quantum mechanics, which govern the behavior of particles at the smallest scales, and general relativity, wh

