# Semantic Search

In [2]:
import os
import pprint
import time

from dotenv import load_dotenv
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

# Load environment variables (presumably from a local .env file) so the
# os.getenv lookups in the next cell can find the API key and connection string
load_dotenv()

True

In [3]:
# Load API keys and connection strings from the environment.
# OPENAI_API_KEY is not passed anywhere explicitly in this notebook;
# presumably OpenAIEmbeddings picks it up from the environment itself.
OPENAI_API_KEY: str | None = os.getenv(key="OPENAI_API_KEY", default=None)
MONGODB_CONNECTION_STRING: str | None = os.getenv(
    key="MONGODB_CONNECTION_STRING", default=None
)

# Fail fast with a clear message: a missing connection string would otherwise
# be passed to MongoClient as host=None and silently target the default host.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("MONGODB_CONNECTION_STRING", MONGODB_CONNECTION_STRING),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Initialize MongoDB client.
# TLS certificate verification is left enabled: tlsAllowInvalidCertificates=True
# would accept any server certificate and expose the connection (including
# credentials) to man-in-the-middle attacks. If verification fails locally,
# fix the CA bundle (e.g. tlsCAFile=certifi.where()) rather than disabling it.
client: MongoClient = MongoClient(host=MONGODB_CONNECTION_STRING, tls=True)

In [4]:
# Names of the target database, collection, and vector search index
db_name: str = "embeddings"
collection_name: str = "text"
vector_search_index = "vector_index"

# Handle to the collection that stores each text next to its embedding
coll = client.get_database(db_name).get_collection(collection_name)

# Empty the collection (an empty filter matches every document) so each run
# of the notebook starts from a clean slate
coll.delete_many(filter={})

DeleteResult({'n': 3, 'electionId': ObjectId('7fffffff0000000000000209'), 'opTime': {'ts': Timestamp(1736583135, 5), 't': 521}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1736583135, 5), 'signature': {'hash': b'\\\xeb\x10\xc4\xf4\xb5\nO\xd8\xa0\x8b\xdb\x87\xd3\xfd\xad>\x05\xd8}', 'keyId': 7416969346901082113}}, 'operationTime': Timestamp(1736583135, 5)}, acknowledged=True)

In [5]:
# Sample texts: three short plot summaries that are embedded and stored,
# then searched by the semantic queries further down
texts: list[str] = [
    "A martial artist agrees to spy on a reclusive crime lord using his invitation to a tournament there as cover.",
    "A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.",
    "When a boy wishes to be big at a magic wish machine, he wakes up the next morning and finds himself in an adult body.",
]


In [6]:
# Initialize embedding model
# dimensions=768 must stay in sync with the numDimensions value declared in
# the vector search index definition later in this notebook.
# NOTE(review): disallowed_special=() presumably lets text containing
# special-token strings be embedded as plain text — confirm in the
# langchain_openai documentation.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=768,
    disallowed_special=(),
)

In [7]:
# Generate embeddings for the sample texts; the next cell pairs texts[i]
# with embeddings[i], so the result is expected to be in the same order
embeddings = embedding_model.embed_documents(texts=texts)

In [8]:
# Build one document per sample text, pairing it with its embedding vector.
# strict=True raises ValueError if the two lists differ in length instead of
# silently dropping or misaligning entries.
docs = [
    {"text": text, "embedding": embedding}
    for text, embedding in zip(texts, embeddings, strict=True)
]


In [9]:
# Insert the prepared documents into the collection.
try:
    coll.insert_many(documents=docs)
except Exception as e:
    print(f"An error occurred during insertion: {e}")
    # Re-raise so the notebook stops here: every later cell assumes the
    # documents are present in the collection.
    raise
else:
    # Success messages are printed only when the insert actually succeeded.
    print("Documents inserted successfully")
    print("Documents embedded and inserted successfully")


Documents inserted successfully
Documents embedded and inserted successfully


In [10]:
# Fixed 10-second pause after the insert.
# NOTE(review): the search index is checked for and (if missing) created in
# later cells, so this pause does not cover a newly created index.
time.sleep(10)  # Allow time for indexing

In [11]:
# Semantic queries: free-text descriptions that are run against the stored
# sample texts by the vector search loop at the end of the notebook
semantic_queries: list[str] = [
    "Secret Agent captures underworld boss.",
    "Awkward team of space defenders.",
    "A magical tale of growing up.",
]

In [12]:
# Fetch the collection's current search indexes and look for ours by name
existing_indexes = list(coll.list_search_indexes())
index_exists = vector_search_index in (
    index["name"] for index in existing_indexes
)

In [13]:
if not index_exists:
    # Define search index model
    search_index_model = SearchIndexModel(
        definition={
            "fields": [
                {
                    "type": "vector",
                    "path": "embedding",
                    "similarity": "dotProduct",
                    # Must match the `dimensions` value passed to OpenAIEmbeddings
                    "numDimensions": 768,
                }
            ]
        },
        name=vector_search_index,
        type="vectorSearch",
    )
    coll.create_search_index(model=search_index_model)
    print("Vector search index created successfully.")

    # The index is built asynchronously: creation returns before the index can
    # serve queries. Poll until it reports itself queryable so the search
    # cells below do not run against an index that is still building.
    index_ready_timeout_s = 120
    index_poll_interval_s = 5
    index_ready_deadline = time.monotonic() + index_ready_timeout_s
    while True:
        index_status = list(coll.list_search_indexes(vector_search_index))
        if index_status and index_status[0].get("queryable") is True:
            break
        if time.monotonic() >= index_ready_deadline:
            raise TimeoutError(
                f"Search index '{vector_search_index}' was not queryable "
                f"after {index_ready_timeout_s} seconds"
            )
        time.sleep(index_poll_interval_s)
    print("Vector search index is ready to query.")
else:
    print("Vector search index already exists.")


Vector search index already exists.


In [14]:
# Initialize vector search: a vector store over `coll` that uses
# `embedding_model` for embedding and the index named by `vector_search_index`.
# NOTE(review): no field names are passed here, so this presumably relies on
# the store's defaults matching the "text" / "embedding" keys used when the
# documents were built — confirm against the langchain_mongodb docs.
vector_search = MongoDBAtlasVectorSearch(
    collection=coll,
    embedding=embedding_model,
    index_name=vector_search_index,
)


In [15]:
# Run each query through the vector store and print its scored matches
top_k = 3  # number of matches requested per query
for query in semantic_queries:
    result = vector_search.similarity_search_with_score(query=query, k=top_k)
    print("SEMANTIC QUERY:", query)
    print("RANKED RESULTS:")
    pprint.pprint(result)
    print("\n")


SEMANTIC QUERY: Secret Agent captures underworld boss.
RANKED RESULTS:
[(Document(metadata={'_id': '67822827001471bb2c2d8029'}, page_content='A martial artist agrees to spy on a reclusive crime lord using his invitation to a tournament there as cover.'),
  0.7542420625686646),
 (Document(metadata={'_id': '67822827001471bb2c2d802a'}, page_content='A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.'),
  0.6448467969894409),
 (Document(metadata={'_id': '67822827001471bb2c2d802b'}, page_content='When a boy wishes to be big at a magic wish machine, he wakes up the next morning and finds himself in an adult body.'),
  0.591801106929779)]


SEMANTIC QUERY: Awkward team of space defenders.
RANKED RESULTS:
[(Document(metadata={'_id': '67822827001471bb2c2d802a'}, page_content='A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.'),
  0.7935056686