In [47]:
# Installation

!pip install --quiet datasets pandas nomic sentence-transformers einops pymongo


In [48]:
# Imports

import os
from dotenv import load_dotenv

import pandas as pd
from pathlib import Path

from nomic import embed
from sentence_transformers import SentenceTransformer

import pymongo
from tqdm.notebook import tqdm

In [49]:
# Get secrets

load_dotenv()

mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')

# Loading the data

In [50]:
# Define the path to the CSV file in an OS-agnostic way
data_path = Path("../data/netflix_titles.csv")

# Convert to pandas DataFrame
df = pd.read_csv(data_path)
print(df.head())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

# Eploring the data

In [51]:
print(df.columns)

df_subset = df[["title", "description"]]
print(df_subset.head())

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')
                   title                                        description
0   Dick Johnson Is Dead  As her father nears the end of his life, filmm...
1          Blood & Water  After crossing paths at a party, a Cape Town t...
2              Ganglands  To protect his family from a powerful drug lor...
3  Jailbirds New Orleans  Feuds, flirtations and toilet talk go down amo...
4           Kota Factory  In a city of coaching centers known to train I...


# Create Text Embedding

In [52]:
# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1")
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Enable tqdm with pandas
tqdm.pandas()

def get_embedding(text):
   """Generates vector embeddings for the given text."""
   embedding = model.encode(text)
   
   return embedding.tolist()

# Creates embeddings and stores them as a new field
df["description_embedding"] = df["description"].progress_apply(get_embedding)
print(df[["description", "description_embedding"]].head())

<All keys matched successfully>


  0%|          | 0/8807 [00:00<?, ?it/s]

                                         description  \
0  As her father nears the end of his life, filmm...   
1  After crossing paths at a party, a Cape Town t...   
2  To protect his family from a powerful drug lor...   
3  Feuds, flirtations and toilet talk go down amo...   
4  In a city of coaching centers known to train I...   

                               description_embedding  
0  [-0.009882214479148388, 0.0220181941986084, -0...  
1  [0.02989545837044716, -0.01546398364007473, -0...  
2  [0.019135063514113426, -0.004051054362207651, ...  
3  [0.04911187291145325, 0.031073004007339478, -0...  
4  [-0.05711847543716431, 0.057069506496191025, -...  


# Store the data in Atlas

In [53]:
# Get the vector size of the embeddings
vector_size = len(df['description_embedding'].iloc[0])

print(f"The vector size of the embeddings is: {vector_size}")

The vector size of the embeddings is: 768


In [54]:
# Connect to your Atlas cluster
mongo_client = pymongo.MongoClient(mongo_connection_string)

# Ingest data into Atlas
db = mongo_client["netflix_titles"]
collection = db["embedded_titles"]
documents = df.to_dict("records")
collection.insert_many(documents)

InsertManyResult([ObjectId('66948844c8b9da377d7c2074'), ObjectId('66948844c8b9da377d7c2075'), ObjectId('66948844c8b9da377d7c2076'), ObjectId('66948844c8b9da377d7c2077'), ObjectId('66948844c8b9da377d7c2078'), ObjectId('66948844c8b9da377d7c2079'), ObjectId('66948844c8b9da377d7c207a'), ObjectId('66948844c8b9da377d7c207b'), ObjectId('66948844c8b9da377d7c207c'), ObjectId('66948844c8b9da377d7c207d'), ObjectId('66948844c8b9da377d7c207e'), ObjectId('66948844c8b9da377d7c207f'), ObjectId('66948844c8b9da377d7c2080'), ObjectId('66948844c8b9da377d7c2081'), ObjectId('66948844c8b9da377d7c2082'), ObjectId('66948844c8b9da377d7c2083'), ObjectId('66948844c8b9da377d7c2084'), ObjectId('66948844c8b9da377d7c2085'), ObjectId('66948844c8b9da377d7c2086'), ObjectId('66948844c8b9da377d7c2087'), ObjectId('66948844c8b9da377d7c2088'), ObjectId('66948844c8b9da377d7c2089'), ObjectId('66948844c8b9da377d7c208a'), ObjectId('66948844c8b9da377d7c208b'), ObjectId('66948844c8b9da377d7c208c'), ObjectId('66948844c8b9da377d7c20

# Querying the embeddings

In [56]:
# Generate embedding for the search query
query_embedding = get_embedding("romantic comedy fantasy")

# Sample vector search pipeline
pipeline = [
   {
      "$vectorSearch": {
            "index": "netflix_titles_description_vector_index",
            "queryVector": query_embedding,
            "path": "description_embedding",
            "numCandidates": 100,
            "limit": 5
      }
   },
   {
      "$project": {
         "_id": 0,
         "title": 1,
         "description": 1,
         "score": {
            "$meta": "vectorSearchScore"
         }
      }
   }
]
# Execute the search
results = collection.aggregate(pipeline)
# Print results
for i in results:
   print(i)

{'title': 'Galavant', 'description': 'In a time of legend, Galavant the knight embarks on a quest to save his fair lady and become a hero in this tongue-in-cheek musical comedy.', 'score': 0.7886205315589905}
{'title': 'One More Time', 'description': 'This fantasy drama follows an indie band singer who repeatedly undergoes unwanted time slips and the girlfriend he must save from an unlucky fate.', 'score': 0.7854629755020142}
{'title': 'Pek Yakında', 'description': 'An ex-movie extra plans to win his wife back by making a fantasy film, but neither he nor his crew has a knack for filmmaking.', 'score': 0.7843626737594604}
{'title': 'Kaake Da Viyah', 'description': 'In this zany comedy, a man is torn between the girl he loves and the respective women his warring mother and grandmother have chosen for him to marry.', 'score': 0.7805248498916626}
{'title': 'Maine Pyaar Kyun Kiya', 'description': 'After lying to his girlfriend, orthopedic surgeon Samir must conjure up a pretend wife in this