# MongoDB Atlas + OpenAI RAG Example

In [None]:
!pip install llama-index
!pip install llama-index-vector-stores-mongodb
!pip install llama-index-embeddings-openai
!pip install pymongo
!pip install datasets
!pip install pandas

In [None]:
import os
from getpass import getpass

# Ask for the key at run time instead of writing it into the notebook, so the
# secret is never saved with the file. A key that is already set is reused.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
from datasets import load_dataset
import pandas as pd

# Hugging Face dataset of movie records:
# https://huggingface.co/datasets/AIatMongoDB/embedded_movies
DATASET_NAME = "AIatMongoDB/embedded_movies"
dataset = load_dataset(DATASET_NAME)

# Materialise the "train" split as a pandas DataFrame
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Preview the first five rows
dataset_df.head(5)

Unnamed: 0,awards,metacritic,rated,fullplot,title,writers,languages,plot,plot_embedding,runtime,countries,genres,directors,cast,type,imdb,poster,num_mflix_comments
0,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,Young Pauline is left a lot of money when her ...,The Perils of Pauline,"[Charles W. Goddard (screenplay), Basil Dickey...",[English],Young Pauline is left a lot of money when her ...,"[0.00072939653, -0.026834568, 0.013515796, -0....",199.0,[USA],[Action],"[Louis J. Gasnier, Donald MacKenzie]","[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",movie,"{'id': 4465, 'rating': 7.6, 'votes': 744}",https://m.media-amazon.com/images/M/MV5BMzgxOD...,0
1,"{'nominations': 1, 'text': '1 nomination.', 'w...",,TV-G,As a penniless man worries about how he will m...,From Hand to Mouth,[H.M. Walker (titles)],[English],A penniless young man tries to save an heiress...,"[-0.022837115, -0.022941574, 0.014937485, -0.0...",22.0,[USA],"[Comedy, Short, Action]","[Alfred J. Goulding, Hal Roach]","[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",movie,"{'id': 10146, 'rating': 7.0, 'votes': 639}",https://m.media-amazon.com/images/M/MV5BNzE1OW...,0
2,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,"Michael ""Beau"" Geste leaves England in disgrac...",Beau Geste,"[Herbert Brenon (adaptation), John Russell (ad...",[English],"Michael ""Beau"" Geste leaves England in disgrac...","[0.00023330493, -0.028511643, 0.014653289, -0....",101.0,[USA],"[Action, Adventure, Drama]",[Herbert Brenon],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",movie,"{'id': 16634, 'rating': 6.9, 'votes': 222}",,0
3,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,A nobleman vows to avenge the death of his fat...,The Black Pirate,"[Douglas Fairbanks (story), Jack Cunningham (a...",,"Seeking revenge, an athletic young man joins t...","[-0.005927917, -0.033394486, 0.0015323418, -0....",88.0,[USA],"[Adventure, Action]",[Albert Parker],"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",movie,"{'id': 16654, 'rating': 7.2, 'votes': 1146}",https://m.media-amazon.com/images/M/MV5BMzU0ND...,1
4,"{'nominations': 1, 'text': '1 nomination.', 'w...",,PASSED,"The Uptown Boy, J. Harold Manners (Lloyd) is a...",For Heaven's Sake,"[Ted Wilde (story), John Grey (story), Clyde B...",[English],An irresponsible young millionaire changes his...,"[-0.0059373598, -0.026604708, -0.0070914757, -...",58.0,[USA],"[Action, Comedy, Romance]",[Sam Taylor],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",movie,"{'id': 16895, 'rating': 7.6, 'votes': 918}",https://m.media-amazon.com/images/M/MV5BMTcxMT...,0


In [None]:
# Remove data points where the fullplot column is missing
dataset_df = dataset_df.dropna(subset=["fullplot"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Remove the plot_embedding from each data point in the dataset, as we are going to
# create new embeddings with the OpenAI embedding model "text-embedding-3-small".
# errors="ignore" keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop() would raise KeyError.
dataset_df = dataset_df.drop(columns=["plot_embedding"], errors="ignore")

dataset_df.head(5)


Number of missing values in each column after removal:
awards                  0
metacritic            893
rated                 279
fullplot                0
title                   0
writers                13
languages               1
plot                    0
plot_embedding          1
runtime                14
countries               0
genres                  0
directors              12
cast                    1
type                    0
imdb                    0
poster                 78
num_mflix_comments      0
dtype: int64


Unnamed: 0,awards,metacritic,rated,fullplot,title,writers,languages,plot,runtime,countries,genres,directors,cast,type,imdb,poster,num_mflix_comments
0,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,Young Pauline is left a lot of money when her ...,The Perils of Pauline,"[Charles W. Goddard (screenplay), Basil Dickey...",[English],Young Pauline is left a lot of money when her ...,199.0,[USA],[Action],"[Louis J. Gasnier, Donald MacKenzie]","[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",movie,"{'id': 4465, 'rating': 7.6, 'votes': 744}",https://m.media-amazon.com/images/M/MV5BMzgxOD...,0
1,"{'nominations': 1, 'text': '1 nomination.', 'w...",,TV-G,As a penniless man worries about how he will m...,From Hand to Mouth,[H.M. Walker (titles)],[English],A penniless young man tries to save an heiress...,22.0,[USA],"[Comedy, Short, Action]","[Alfred J. Goulding, Hal Roach]","[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",movie,"{'id': 10146, 'rating': 7.0, 'votes': 639}",https://m.media-amazon.com/images/M/MV5BNzE1OW...,0
2,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,"Michael ""Beau"" Geste leaves England in disgrac...",Beau Geste,"[Herbert Brenon (adaptation), John Russell (ad...",[English],"Michael ""Beau"" Geste leaves England in disgrac...",101.0,[USA],"[Action, Adventure, Drama]",[Herbert Brenon],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",movie,"{'id': 16634, 'rating': 6.9, 'votes': 222}",,0
3,"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,,A nobleman vows to avenge the death of his fat...,The Black Pirate,"[Douglas Fairbanks (story), Jack Cunningham (a...",,"Seeking revenge, an athletic young man joins t...",88.0,[USA],"[Adventure, Action]",[Albert Parker],"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",movie,"{'id': 16654, 'rating': 7.2, 'votes': 1146}",https://m.media-amazon.com/images/M/MV5BMzU0ND...,1
4,"{'nominations': 1, 'text': '1 nomination.', 'w...",,PASSED,"The Uptown Boy, J. Harold Manners (Lloyd) is a...",For Heaven's Sake,"[Ted Wilde (story), John Grey (story), Clyde B...",[English],An irresponsible young millionaire changes his...,58.0,[USA],"[Action, Comedy, Romance]",[Sam Taylor],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",movie,"{'id': 16895, 'rating': 7.6, 'votes': 918}",https://m.media-amazon.com/images/M/MV5BMTcxMT...,0


In [None]:
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model configuration.
# NOTE(review): the Atlas vector search index presumably has to be created with
# the same number of dimensions — confirm against the index definition.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
EMBEDDING_DIMENSIONS = 256

embed_model = OpenAIEmbedding(
    model=EMBEDDING_MODEL_NAME,
    dimensions=EMBEDDING_DIMENSIONS,
)
# No arguments: the LLM is created with the library's defaults
llm = OpenAI()

# Store both models on the shared Settings object
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON to obtain one plain dictionary per row
documents_json = dataset_df.to_json(orient="records")
documents_list = json.loads(documents_json)

# Value for metadata must be one of (str, int, float, None), so each of these
# fields is replaced by its JSON string representation.
JSON_ENCODED_FIELDS = (
    "writers",
    "languages",
    "genres",
    "cast",
    "directors",
    "countries",
    "imdb",
    "awards",
)

# Metadata keys hidden from the LLM and from the embedding model respectively
LLM_EXCLUDED_KEYS = ("fullplot", "metacritic")
EMBED_EXCLUDED_KEYS = (
    "fullplot",
    "metacritic",
    "poster",
    "num_mflix_comments",
    "runtime",
    "rated",
)

llama_documents = []

for document in documents_list:
    for field in JSON_ENCODED_FIELDS:
        document[field] = json.dumps(document[field])

    # The full plot is the document text; the whole record travels as metadata.
    # Fresh lists are passed so no two documents share the same list object.
    llama_document = Document(
        text=document["fullplot"],
        metadata=document,
        excluded_llm_metadata_keys=list(LLM_EXCLUDED_KEYS),
        excluded_embed_metadata_keys=list(EMBED_EXCLUDED_KEYS),
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    llama_documents.append(llama_document)

# Show, for the first document, the exact text each consumer receives
first_document = llama_documents[0]
print(
    "\nThe LLM sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.EMBED),
)


The LLM sees this: 
 Metadata: awards=>{"nominations": 0, "text": "1 win.", "wins": 1}
rated=>None
title=>The Perils of Pauline
writers=>["Charles W. Goddard (screenplay)", "Basil Dickey (screenplay)", "Charles W. Goddard (novel)", "George B. Seitz", "Bertram Millhauser"]
languages=>["English"]
plot=>Young Pauline is left a lot of money when her wealthy uncle dies. However, her uncle's secretary has been named as her guardian until she marries, at which time she will officially take ...
runtime=>199.0
countries=>["USA"]
genres=>["Action"]
directors=>["Louis J. Gasnier", "Donald MacKenzie"]
cast=>["Pearl White", "Crane Wilbur", "Paul Panzer", "Edward Jos\u00e8"]
type=>movie
imdb=>{"id": 4465, "rating": 7.6, "votes": 744}
poster=>https://m.media-amazon.com/images/M/MV5BMzgxODk1Mzk2Ml5BMl5BanBnXkFtZTgwMDg0NzkwMjE@._V1_SY1000_SX677_AL_.jpg
num_mflix_comments=>0
-----
Content: Young Pauline is left a lot of money when her wealthy uncle dies. However, her uncle's secretary has been named as

In [None]:
# Display the first Document object built above to inspect it
llama_documents[0]

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

# Split every document into nodes
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)

# Build the text for each node with MetadataMode.EMBED so that the keys listed in
# excluded_embed_metadata_keys are actually left out. With metadata_mode="all" the
# exclusions are ignored and the full plot is embedded twice (once as the
# "fullplot" metadata entry and once as the content).
node_texts = [node.get_content(metadata_mode=MetadataMode.EMBED) for node in nodes]

# Embed in batches rather than issuing one API request per node
node_embeddings = embed_model.get_text_embedding_batch(node_texts, show_progress=True)

for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding

Ensure your database, collection, and vector search index are set up on MongoDB Atlas for the collection; otherwise the following steps won't work.


 - For assistance with database cluster setup and obtaining the URI, refer to this [guide](https://www.mongodb.com/docs/guides/atlas/cluster/) for setting up a MongoDB cluster, and this [guide](https://www.mongodb.com/docs/guides/atlas/connection-string/) to get your connection string. 

 - Once you have successfully created a cluster, create the database and collection within the MongoDB Atlas cluster by clicking “+ Create Database”. The database will be named movies, and the collection will be named movies_records.

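 - Creating a vector search index within the movies_records collection is essential for efficient document retrieval from MongoDB into our development environment. To achieve this, refer to the official [guide](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/) on vector search index creation. Name the index `vector_index` and set its number of dimensions to 256, so that it matches the index name and the embedding size used in this notebook's code.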



In [None]:
import pymongo
from google.colab import userdata


def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the server is reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        The ``pymongo.MongoClient`` once a ping has succeeded, or ``None``
        when a ``ConnectionFailure`` occurs.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Constructing MongoClient does not contact the server, so the
        # connection is only proven by actually running a command.
        client.admin.command("ping")
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        # Release the half-initialised client instead of leaking it
        if client is not None:
            client.close()
        return None


# Read the connection string from the Colab secrets store
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Stop here: pymongo treats a missing host as localhost, so continuing
    # would silently target the wrong server.
    raise ValueError("MONGO_URI is not set in the Colab secrets (userdata)")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client already printed the reason; fail with a clear error
    # instead of a TypeError on the subscript below.
    raise ConnectionError(
        "Could not connect to MongoDB; check MONGO_URI and network access"
    )

DB_NAME = "movies"
COLLECTION_NAME = "movies_records"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

Connection to MongoDB successful


In [None]:
# To ensure we are working with a fresh collection,
# delete any existing records in the collection.
# The empty filter {} matches every document; as the cell's final expression,
# the returned DeleteResult is displayed.
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000000a'), 'opTime': {'ts': Timestamp(1708000722, 1), 't': 10}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1708000722, 1), 'signature': {'hash': b'\xd8\x1a\xaci\xf5EN+\xe2\xd1\xb3y8.${u5P\xf3', 'keyId': 7320226449804230661}}, 'operationTime': Timestamp(1708000722, 1)}, acknowledged=True)

In [None]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Name of the vector search index passed to the vector store
VECTOR_INDEX_NAME = "vector_index"

vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name=VECTOR_INDEX_NAME,
)

# Add the nodes, with the embeddings assigned earlier, to the vector store
vector_store.add(nodes)

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext

# Create the index directly from the existing vector store; no documents are passed here
index = VectorStoreIndex.from_vector_store(vector_store)

In [None]:
import pprint
from llama_index.core.response.notebook_utils import display_response

# Value passed as similarity_top_k when building the query engine
SIMILARITY_TOP_K = 3

query = "Recommend a romantic movie suitable for the christmas season and justify your selecton"

query_engine = index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)
response = query_engine.query(query)

# Render the answer, then dump the source nodes attached to the response
display_response(response)
pprint.pprint(response.source_nodes)

**`Final Response:`** The movie "Romancing the Stone" would be a suitable romantic movie for the Christmas season. It is a romantic adventure film that follows a romance writer who sets off on a dangerous adventure to rescue her kidnapped sister. The movie has elements of romance, adventure, and comedy, making it an entertaining choice for the holiday season. Additionally, the movie has received positive reviews and has been nominated for awards, indicating its quality.

[NodeWithScore(node=TextNode(id_='c6bbc236-e21d-49ab-b43d-db920b4946e6', embedding=None, metadata={'awards': '{"nominations": 2, "text": "Nominated for 1 Oscar. Another 6 wins & 2 nominations.", "wins": 7}', 'metacritic': None, 'rated': 'PG', 'fullplot': "Joan Wilder, a mousy romance novelist, receives a treasure map in the mail from her recently murdered brother-in-law. Meanwhile, her sister Elaine is kidnapped in Colombia and the two criminals responsible demand that she travel to Colombia to exchange the map for her sister. Joan does, and quickly becomes lost in the jungle after being waylayed by Zolo, a vicious and corrupt Colombian cop who will stop at nothing to obtain the map. There, she meets an irreverent soldier-of-fortune named Jack Colton who agrees to bring her back to civilization. Together, they embark upon an adventure that could be straight out of Joan's novels.", 'title': 'Romancing the Stone', 'writers': '["Diane Thomas"]', 'languages': '["English", "Spanish", "Frenc