# 8. Vector Search

1. Write a simple Python function to accept search terms or phrases and pass it through the embeddings API again to get a Query Vector.

2. Take the resultant query vector and use it to perform a Vector Search query using the $vectorSearch operator in the MongoDB aggregation pipeline.

3. Pre-filter the documents using meta information to narrow the search accross your dataset, thereby speeding up the performance of the Vector Search results while retaining accuracy.

4. Further, post-filter the retrieved documents that are semantically similar (based on relevancy score), if you want to demonstrate a higher degree of control over the semantic search behavior.

In [10]:
import getpass, os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient
from dotenv import load_dotenv

load_dotenv()

True

In [11]:
# Load Environment Variables
OPENAI_API_KEY: str | None = os.getenv(
    key="OPENAI_API_KEY",
    default=None,
)
MONGODB_CONNECTION_STRING: str | None = os.getenv(
    key="MONGODB_CONNECTION_STRING",
    default=None,
)

In [12]:
import pandas as pd
import s3fs

In [13]:
df = pd.read_json(
    path_or_buf="https://ashwin-partner-bucket.s3.eu-west-1.amazonaws.com/movies_sample_dataset.jsonl",
    orient="records",
    lines=True,
)
df.to_json(path_or_buf="./movies_sample_dataset.jsonl", orient="records", lines=True)

print(df.shape)
df[:3]

(44435, 10)


Unnamed: 0,overview,title,release_date,vote_average,vote_count,adult,year,month,day,text
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story,1995-10-30,7.7,5415,False,1995,10,30,"Title: Toy Story Genres: Animation,Comedy,Fam..."
1,When siblings Judy and Peter discover an encha...,Jumanji,1995-12-15,6.9,2413,False,1995,12,15,"Title: Jumanji Genres: Animation,Comedy,Famil..."
2,A family wedding reignites the ancient feud be...,Grumpier Old Men,1995-12-22,6.5,92,False,1995,12,22,"Title: Grumpier Old Men Genres: Animation,Com..."


In [14]:
import numpy as np
from tqdm import tqdm
import openai

df["final"] = df["text"] + "    Overview: " + df["overview"]
df["final"][:5]

0    Title: Toy Story  Genres: Animation,Comedy,Fam...
1    Title: Jumanji  Genres: Animation,Comedy,Famil...
2    Title: Grumpier Old Men  Genres: Animation,Com...
3    Title: Waiting to Exhale  Genres: Animation,Co...
4    Title: Father of the Bride Part II  Genres: An...
Name: final, dtype: object

In [15]:
step = int(np.ceil(df["final"].shape[0] / 100))
step

445

In [16]:
embeddings_t = []
lines = []

# Note that we must split the dataset into smaller batches to not exceed the rate limits imposed by OpenAI APIs.
for x, y in list(map(lambda x: (x, x + step), list(range(0, df.shape[0], step)))):
    lines += [df.final.values[x:y].tolist()]

print(len(lines), len(lines[0]))

100 445


In [17]:
lines[0]

["Title: Toy Story  Genres: Animation,Comedy,FamilyLed by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.    Overview: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 "Title: Jumanji  Genres: Animation,Comedy,FamilyWhen siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which p

In [None]:
# openai.api_type = "openai"
# for i in tqdm(lines):
#     embeddings_t += openai.embeddings.create(
#         input=i,
#         model="text-embedding-3-large",
#     ).data

# print(len(embeddings_t), embeddings_t[0])

100%|██████████| 100/100 [06:30<00:00,  3.90s/it]

44435 Embedding(embedding=[-0.041719112545251846, 0.030147964134812355, -0.011400600895285606, 0.0001532283640699461, 0.020623410120606422, -0.007760017644613981, 0.006671122275292873, -0.015244531445205212, 0.04620588570833206, 0.02678944356739521, 0.003410996636375785, -0.03552684187889099, -0.024401744827628136, 0.013578390702605247, 0.004755716770887375, 0.0046376436948776245, -0.0024811720941215754, 0.03998737782239914, -0.010088678449392319, -0.05452347174286842, 0.04864606261253357, -0.029990533366799355, 0.012679724022746086, 0.021869735792279243, 0.01013459637761116, 0.01075119897723198, -0.003215848235413432, -0.004021040163934231, -0.033795107156038284, -0.003981682471930981, -0.009905009530484676, -0.00871772039681673, -0.005473993718624115, -0.023850739002227783, -0.03138117119669914, -0.002687799744307995, 0.012679724022746086, -0.010554411448538303, -0.060348402708768845, 0.04248002916574478, 0.012154955416917801, -0.04394938051700592, -0.00021380225371103734, -0.0163859




In [22]:
# out = []
# for element in embeddings_t:
#     out += [element.embedding]
# df["embedding"] = out
# df[:5]

Unnamed: 0,overview,title,release_date,vote_average,vote_count,adult,year,month,day,text,final,embedding
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story,1995-10-30,7.7,5415,False,1995,10,30,"Title: Toy Story Genres: Animation,Comedy,Fam...","Title: Toy Story Genres: Animation,Comedy,Fam...","[-0.041719112545251846, 0.030147964134812355, ..."
1,When siblings Judy and Peter discover an encha...,Jumanji,1995-12-15,6.9,2413,False,1995,12,15,"Title: Jumanji Genres: Animation,Comedy,Famil...","Title: Jumanji Genres: Animation,Comedy,Famil...","[-0.011309921741485596, 0.016116127371788025, ..."
2,A family wedding reignites the ancient feud be...,Grumpier Old Men,1995-12-22,6.5,92,False,1995,12,22,"Title: Grumpier Old Men Genres: Animation,Com...","Title: Grumpier Old Men Genres: Animation,Com...","[-0.011394227854907513, -0.007453804835677147,..."
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,1995-12-22,6.1,34,False,1995,12,22,"Title: Waiting to Exhale Genres: Animation,Co...","Title: Waiting to Exhale Genres: Animation,Co...","[-0.04893782362341881, 0.022149519994854927, -..."
4,Just when George Banks has recovered from his ...,Father of the Bride Part II,1995-02-10,5.7,173,False,1995,2,10,Title: Father of the Bride Part II Genres: An...,Title: Father of the Bride Part II Genres: An...,"[-0.008648938499391079, 0.023403789848089218, ..."


In [37]:
df

Unnamed: 0,overview,title,release_date,vote_average,vote_count,adult,year,month,day,text,final,embedding
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story,1995-10-30,7.7,5415,False,1995,10,30,"Title: Toy Story Genres: Animation,Comedy,Fam...","Title: Toy Story Genres: Animation,Comedy,Fam...","[-0.041719112545251846, 0.030147964134812355, ..."
1,When siblings Judy and Peter discover an encha...,Jumanji,1995-12-15,6.9,2413,False,1995,12,15,"Title: Jumanji Genres: Animation,Comedy,Famil...","Title: Jumanji Genres: Animation,Comedy,Famil...","[-0.011309921741485596, 0.016116127371788025, ..."
2,A family wedding reignites the ancient feud be...,Grumpier Old Men,1995-12-22,6.5,92,False,1995,12,22,"Title: Grumpier Old Men Genres: Animation,Com...","Title: Grumpier Old Men Genres: Animation,Com...","[-0.011394227854907513, -0.007453804835677147,..."
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,1995-12-22,6.1,34,False,1995,12,22,"Title: Waiting to Exhale Genres: Animation,Co...","Title: Waiting to Exhale Genres: Animation,Co...","[-0.04893782362341881, 0.022149519994854927, -..."
4,Just when George Banks has recovered from his ...,Father of the Bride Part II,1995-02-10,5.7,173,False,1995,2,10,Title: Father of the Bride Part II Genres: An...,Title: Father of the Bride Part II Genres: An...,"[-0.008648938499391079, 0.023403789848089218, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
44430,"Yet another version of the classic epic, with ...",Robin Hood,1991-05-13,5.7,26,False,1991,5,13,"Title: Robin Hood Genres: Animation,Comedy,Fa...","Title: Robin Hood Genres: Animation,Comedy,Fa...","[-0.032743096351623535, 0.023562563583254814, ..."
44431,An artist struggles to finish his work while a...,Century of Birthing,2011-11-17,9.0,3,False,2011,11,17,"Title: Century of Birthing Genres: Animation,...","Title: Century of Birthing Genres: Animation,...","[-0.02307790145277977, 0.014248800463974476, -..."
44432,"When one of her hits goes wrong, a professiona...",Betrayal,2003-08-01,3.8,6,False,2003,8,1,"Title: Betrayal Genres: Animation,Comedy,Fami...","Title: Betrayal Genres: Animation,Comedy,Fami...","[-0.015640441328287125, -0.0010361447930335999..."
44433,"In a small town live two brothers, one a minis...",Satan Triumphant,1917-10-21,0.0,0,False,1917,10,21,"Title: Satan Triumphant Genres: Animation,Com...","Title: Satan Triumphant Genres: Animation,Com...","[-0.004601084161549807, 0.01596786081790924, -..."


In [38]:
df.to_json(path_or_buf="./movies_sample_dataset.json", index=False)

In [39]:
import pandas as pd

saved_df = pd.read_json(
    path_or_buf="./movies_sample_dataset.json",
)
saved_df

Unnamed: 0,overview,title,release_date,vote_average,vote_count,adult,year,month,day,text,final,embedding
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story,1995-10-30,7.7,5415,False,1995,10,30,"Title: Toy Story Genres: Animation,Comedy,Fam...","Title: Toy Story Genres: Animation,Comedy,Fam...","[-0.0417191125, 0.0301479641, -0.0114006009, 0..."
1,When siblings Judy and Peter discover an encha...,Jumanji,1995-12-15,6.9,2413,False,1995,12,15,"Title: Jumanji Genres: Animation,Comedy,Famil...","Title: Jumanji Genres: Animation,Comedy,Famil...","[-0.011309921700000001, 0.0161161274, -0.02531..."
2,A family wedding reignites the ancient feud be...,Grumpier Old Men,1995-12-22,6.5,92,False,1995,12,22,"Title: Grumpier Old Men Genres: Animation,Com...","Title: Grumpier Old Men Genres: Animation,Com...","[-0.0113942279, -0.0074538048, -0.0191780198, ..."
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,1995-12-22,6.1,34,False,1995,12,22,"Title: Waiting to Exhale Genres: Animation,Co...","Title: Waiting to Exhale Genres: Animation,Co...","[-0.0489378236, 0.02214952, -0.023071851600000..."
4,Just when George Banks has recovered from his ...,Father of the Bride Part II,1995-02-10,5.7,173,False,1995,2,10,Title: Father of the Bride Part II Genres: An...,Title: Father of the Bride Part II Genres: An...,"[-0.0086489385, 0.0234037898, -0.0214832742, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...
44430,"Yet another version of the classic epic, with ...",Robin Hood,1991-05-13,5.7,26,False,1991,5,13,"Title: Robin Hood Genres: Animation,Comedy,Fa...","Title: Robin Hood Genres: Animation,Comedy,Fa...","[-0.032743096400000005, 0.0235625636, -0.03533..."
44431,An artist struggles to finish his work while a...,Century of Birthing,2011-11-17,9.0,3,False,2011,11,17,"Title: Century of Birthing Genres: Animation,...","Title: Century of Birthing Genres: Animation,...","[-0.0230779015, 0.0142488005, -0.0230779015, -..."
44432,"When one of her hits goes wrong, a professiona...",Betrayal,2003-08-01,3.8,6,False,2003,8,1,"Title: Betrayal Genres: Animation,Comedy,Fami...","Title: Betrayal Genres: Animation,Comedy,Fami...","[-0.015640441300000002, -0.0010361448, -0.0096..."
44433,"In a small town live two brothers, one a minis...",Satan Triumphant,1917-10-21,0.0,0,False,1917,10,21,"Title: Satan Triumphant Genres: Animation,Com...","Title: Satan Triumphant Genres: Animation,Com...","[-0.0046010842, 0.0159678608, -0.0208270475, -..."


### Push data to MongoDB Atlas

In [24]:
client = MongoClient(
    host=MONGODB_CONNECTION_STRING, tls=True, tlsAllowInvalidCertificates=True
)

client.list_database_names()

['CA-FOUNDATION',
 'Education',
 'ecommerce',
 'embeddings',
 'lrs',
 'meramaster',
 'mm_ai',
 'mmdev',
 'tanuja',
 'updated',
 'your_database_name',
 'admin',
 'local']

In [25]:
subrata_db = client.get_database(name="subrata")

In [27]:
movies_col = subrata_db.get_collection(name="movies")
movies_col

Collection(Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'subrata'), 'movies')

In [43]:
small_df = df[:2000]
small_df

Unnamed: 0,overview,title,release_date,vote_average,vote_count,adult,year,month,day,text,final,embedding
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story,1995-10-30,7.7,5415,False,1995,10,30,"Title: Toy Story Genres: Animation,Comedy,Fam...","Title: Toy Story Genres: Animation,Comedy,Fam...","[-0.041719112545251846, 0.030147964134812355, ..."
1,When siblings Judy and Peter discover an encha...,Jumanji,1995-12-15,6.9,2413,False,1995,12,15,"Title: Jumanji Genres: Animation,Comedy,Famil...","Title: Jumanji Genres: Animation,Comedy,Famil...","[-0.011309921741485596, 0.016116127371788025, ..."
2,A family wedding reignites the ancient feud be...,Grumpier Old Men,1995-12-22,6.5,92,False,1995,12,22,"Title: Grumpier Old Men Genres: Animation,Com...","Title: Grumpier Old Men Genres: Animation,Com...","[-0.011394227854907513, -0.007453804835677147,..."
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,1995-12-22,6.1,34,False,1995,12,22,"Title: Waiting to Exhale Genres: Animation,Co...","Title: Waiting to Exhale Genres: Animation,Co...","[-0.04893782362341881, 0.022149519994854927, -..."
4,Just when George Banks has recovered from his ...,Father of the Bride Part II,1995-02-10,5.7,173,False,1995,2,10,Title: Father of the Bride Part II Genres: An...,Title: Father of the Bride Part II Genres: An...,"[-0.008648938499391079, 0.023403789848089218, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,A boy preacher named Isaac goes to a town in N...,Children of the Corn,1984-03-09,5.6,243,False,1984,3,9,Title: Children of the Corn Genres: Animation...,Title: Children of the Corn Genres: Animation...,"[-0.002523684175685048, 0.004665933083742857, ..."
1996,When a casino owning dog named Charlie is murd...,All Dogs Go to Heaven,1989-11-17,6.3,244,False,1989,11,17,Title: All Dogs Go to Heaven Genres: Animatio...,Title: All Dogs Go to Heaven Genres: Animatio...,"[-0.018675176426768303, 0.04013621807098389, -..."
1997,Uncle Fester has been missing for 25 years. An...,The Addams Family,1991-11-22,6.7,871,False,1991,11,22,"Title: The Addams Family Genres: Animation,Co...","Title: The Addams Family Genres: Animation,Co...","[-0.003055789042264223, 0.04421965405344963, -..."
1998,"A unique 16th century woman, Danielle possesse...",Ever After: A Cinderella Story,1998-07-31,6.8,408,False,1998,7,31,Title: Ever After: A Cinderella Story Genres:...,Title: Ever After: A Cinderella Story Genres:...,"[-0.03046552650630474, 0.018499640747904778, -..."


In [45]:
# Upload documents along with vector embeddings to MongoDB Atlas Collection
if movies_col.count_documents(filter={}) > 0:
    print(
        f"Deleted all documents in collection: {movies_col.count_documents(filter={})}"
    )
    movies_col.delete_many(filter={})

movies_col.insert_many(documents=small_df.to_dict(orient="records"))
print("Done!!!")

Deleted all documents in collection: 2000
Done!!!


In [46]:
small_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   overview      2000 non-null   object 
 1   title         2000 non-null   object 
 2   release_date  2000 non-null   object 
 3   vote_average  2000 non-null   float64
 4   vote_count    2000 non-null   int64  
 5   adult         2000 non-null   bool   
 6   year          2000 non-null   int64  
 7   month         2000 non-null   int64  
 8   day           2000 non-null   int64  
 9   text          2000 non-null   object 
 10  final         2000 non-null   object 
 11  embedding     2000 non-null   object 
dtypes: bool(1), float64(1), int64(4), object(6)
memory usage: 174.0+ KB


### Create Vector Search Index

In [47]:
# Create Index
from pymongo.operations import SearchIndexModel

# Define search index model
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 3072,
                "similarity": "cosine",
            },
            {
                "type": "filter",
                "path": "year",
            },
            {
                "type": "filter",
                "path": "vote_average",
            },
            {
                "type": "filter",
                "path": "vote_count",
            },
        ]
    },
    name="movies_search_index",
    type="vectorSearch",
)

movies_col.create_search_index(model=search_index_model)
print("Vector search index created successfully.")

Vector search index created successfully.


### Query MongoDB with Vector Search
> With MongoDB you can use Vector Search + Search Filters + Aggregations to find similar documents.

In [None]:
def query_vector_search(q, prefilter={}, postfilter={}, path="embedding", topK=2):
    # 1) Create an embedding from your input query `q` using OpenAI
    ele = openai.embeddings.create(
        model="text-embedding-3-large",
        dimensions=3072,
        input=q,
    ).data

    # We assume ele is a list of embedding data. So we grab the first result.
    query_embedding = ele[0].embedding

    # 2) Build the $vectorSearch stage
    vs_query = {
        "index": "movies_search_index",  # name of the vector index in MongoDB
        "path": path,  # field in your documents where embeddings are stored
        "queryVector": query_embedding,
        "numCandidates": 10,  # how many results to consider from the index
        "limit": topK,  # final number of results we want from the search stage
    }
    if len(prefilter) > 0:
        vs_query["filter"] = prefilter
    new_search_query = {"$vectorSearch": vs_query}
    project = {
        "$project": {
            "score": {"$meta": "vectorSearchScore"},
            "_id": 0,
            "title": 1,
            "release_date": 1,
            "final": 1,
            "year": 1,
        }
    }
    if len(postfilter.keys()) > 0:
        postFilter = {"$match": postfilter}
        res = list(movies_col.aggregate([new_search_query, project, postFilter]))
    else:
        res = list(movies_col.aggregate([new_search_query, project]))
    return res


query_vector_search(
    q="I like Christmas movies, any recommendations for movies released after 1990?",
    prefilter={"year": {"$gt": 1990}},
    topK=5,
)

[{'title': 'The Santa Clause',
  'release_date': '1994-11-10',
  'year': 1994,
  'final': "Title: The Santa Clause  Genres: Animation,Comedy,FamilyScott Calvin is an ordinary man, who accidentally causes Santa Claus to fall from his roof on Christmas Eve and is knocked unconscious. When he and his young son finish Santa's trip and deliveries, they go to the North Pole, where Scott learns he must become the new Santa and convince those he loves that he is indeed, Father Christmas.    Overview: Scott Calvin is an ordinary man, who accidentally causes Santa Claus to fall from his roof on Christmas Eve and is knocked unconscious. When he and his young son finish Santa's trip and deliveries, they go to the North Pole, where Scott learns he must become the new Santa and convince those he loves that he is indeed, Father Christmas.",
  'score': 0.7195157408714294},
 {'title': 'Miracle on 34th Street',
  'release_date': '1994-11-18',
  'year': 1994,
  'final': "Title: Miracle on 34th Street  Ge

In [54]:
movies_col

Collection(Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'subrata'), 'movies')

[]