In [None]:
import os

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [2]:
# 1. Our "Knowledge Base": a handful of raw facts, each wrapped in a Document
documents = [
    Document(page_content=fact)
    for fact in (
        "The Super-Widget 3000 battery lasts 48 hours.",
        "To reset the widget, hold the red button for 5 seconds.",
        "The CEO of TechCorp is Jane Doe.",
        "Apples are a type of fruit.",
    )
]

In [None]:
# Sanity check: confirm the folder that will hold the vector store is writable
# before asking Chroma to persist anything there.
folder_path = "/Volumes/vibecoding/RAG-Complete Cook Book/"
file_path = os.path.join(folder_path, "test_write_permission.txt")

try:
    with open(file_path, "w") as f:
        f.write("If you see this, the drive is writable.")
except OSError as e:
    # Only a failed open/write means the drive is not writable.
    print("FAILURE: Cannot write to this drive.")
    print(f"Reason: {e}")
else:
    print("SUCCESS: The drive is writable.")
    # Clean up the probe file. A failure here is reported separately so it is
    # not mistaken for a write failure (the write above already succeeded).
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"WARNING: Could not remove the probe file {file_path}: {e}")

✅ SUCCESS: The drive is writable.


In [4]:
# 2. Create the Vector Database
# This step automatically:
#   a) Calls the Embedding Model for every document
#   b) Stores the vectors in the local folder given by persist_directory
#      ("my_vector_db")
# Note: from_documents embeds each Document as-is; it does not split long
# texts into chunks. Use a text splitter beforehand if chunking is needed.
# OpenAIEmbeddings() with no arguments expects the API key to be supplied by
# the environment (OPENAI_API_KEY) -- never hardcode it in the notebook.
#
# Deterministic ids make this cell safe to re-run: because the store is
# persisted on disk, omitting ids would add the same documents again under
# fresh random ids on every run, and those duplicates would fill the top
# search results. With fixed ids, a re-run overwrites the existing entries.
# (Ids are positional; if the list shrinks, delete stale entries or the folder.)
db = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    ids=[f"doc-{i}" for i in range(len(documents))],
    persist_directory="/Volumes/vibecoding/RAG-Complete Cook Book/my_vector_db",
)

In [5]:
# 3. Retrieval: pose a natural-language question to the vector store
query = "How long does the battery last?"
docs = db.similarity_search(query=query)

# 4. Display the text of the first (top-ranked) result
top_hit = docs[0]
print(top_hit.page_content)

The Super-Widget 3000 battery lasts 48 hours.


In [6]:
# 3. Retrieval again, this time with a single keyword instead of a question
query = "battery"
docs = db.similarity_search(query=query)

# 4. Display the text of the first (top-ranked) result
top_hit = docs[0]
print(top_hit.page_content)

The Super-Widget 3000 battery lasts 48 hours.


In [7]:
# 3. Retrieval with a query that carries no real meaning ("the")
query = "the"
docs = db.similarity_search(query=query)

# 4. Display the text of the first (top-ranked) result -- something is
#    always returned, even when nothing is a genuinely good match
top_hit = docs[0]
print(top_hit.page_content)

Apples are a type of fruit.


The "Least Worst" Winner
Vector search does not look for "matches" (like a keyword search). It looks for distance. It calculates the distance from your query to the documents you have stored and returns the closest ones, ranked nearest-first; the code above prints only the first (closest) result.

When you search for "the":

The embedding model converts "the" into a vector (let's call it Point A).

It calculates the distance to the "Super-Widget 3000" vector (Point B).

It calculates the distance to the "Apple" vector (Point C).

Since "the" carries almost no meaning of its own, none of the documents is a genuinely good match, so all the points are comparatively far away. However, in the complex 1,536-dimensional space of OpenAI's model, Point A ("the") happens to be mathematically—perhaps by a tiny fraction—closer to Point C ("Apple") than to Point B.

It’s like asking, "Which is closer to the North Pole: Sydney or Melbourne?" Both are very far away, but one is technically closer. The database doesn't say "Neither"; it just gives you the one that is slightly closer.