In [None]:
# Notebook dependencies: LlamaIndex plus its MongoDB vector-store and Fireworks
# integrations (the Fireworks embeddings package is pinned to 0.1.2), followed by
# the MongoDB driver and the data-loading libraries. `-q` keeps pip's output quiet.
!pip install -q llama-index llama-index-vector-stores-mongodb llama-index-embeddings-fireworks==0.1.2 llama-index-llms-fireworks
!pip install -q pymongo datasets pandas



In [None]:
# set up Fireworks.ai Key
import os
import getpass

# Reuse a key that is already present in the environment (e.g. when this cell is
# re-run, or when the notebook is executed non-interactively); otherwise prompt
# for it without echoing the value to the notebook output.
fw_api_key = os.environ.get("FIREWORKS_API_KEY") or getpass.getpass(
    "Fireworks API Key:"
)

# An empty key can never authenticate, so fail here rather than with a less
# obvious error when the first Fireworks request is made.
if not fw_api_key:
    raise ValueError("A Fireworks API key is required")

# Export the key so libraries that read FIREWORKS_API_KEY can find it.
os.environ["FIREWORKS_API_KEY"] = fw_api_key

In [None]:
from datasets import load_dataset
import pandas as pd

# Dataset card: https://huggingface.co/datasets/AIatMongoDB/whatscooking.restaurants
DATASET_NAME = "AIatMongoDB/whatscooking.restaurants"
dataset = load_dataset(DATASET_NAME)

# Materialise the "train" split as a pandas DataFrame for inspection and
# further processing.
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Preview the first five rows.
dataset_df.head(n=5)

Unnamed: 0,restaurant_id,attributes,cuisine,DogsAllowed,embedding,OutdoorSeating,borough,address,_id,name,menu,TakeOut,location,PriceRange,HappyHour,review_count,sponsored,stars
0,40366661,"{'Alcohol': ''none'', 'Ambience': '{'romantic'...",Tex-Mex,,"[-0.14520384, 0.018315623, -0.018330636, -0.10...",True,Manhattan,"{'building': '627', 'coord': [-73.975980999999...",{'$oid': '6095a34a7c34416a90d3206b'},Baby Bo'S Burritos,,True,"{'coordinates': [-73.97598099999999, 40.745132...",1.0,,10,,2.5
1,40367442,"{'Alcohol': ''beer_and_wine'', 'Ambience': '{'...",American,True,"[-0.11977468, -0.02157107, 0.0038846824, -0.09...",True,Staten Island,"{'building': '17', 'coord': [-74.1350211, 40.6...",{'$oid': '6095a34a7c34416a90d3209e'},Buddy'S Wonder Bar,"[Grilled cheese sandwich, Baked potato, Lasagn...",True,"{'coordinates': [-74.1350211, 40.6369042], 'ty...",2.0,,62,,3.5
2,40364610,"{'Alcohol': ''none'', 'Ambience': '{'touristy'...",American,,"[-0.1004329, -0.014882699, -0.033005167, -0.09...",True,Staten Island,"{'building': '37', 'coord': [-74.138263, 40.54...",{'$oid': '6095a34a7c34416a90d31ff6'},Great Kills Yacht Club,"[Mozzarella sticks, Mushroom swiss burger, Spi...",True,"{'coordinates': [-74.138263, 40.546681], 'type...",1.0,,72,,4.0
3,40365288,"{'Alcohol': None, 'Ambience': '{'touristy': Fa...",American,,"[-0.11735515, -0.0397448, -0.0072645755, -0.09...",True,Manhattan,"{'building': '842', 'coord': [-73.970637000000...",{'$oid': '6095a34a7c34416a90d32017'},Keats Restaurant,"[French fries, Chicken pot pie, Mac & cheese, ...",True,"{'coordinates': [-73.97063700000001, 40.751495...",2.0,True,149,,4.0
4,40363151,"{'Alcohol': None, 'Ambience': None, 'BYOB': No...",Bakery,,"[-0.096541286, -0.009661355, 0.04402167, -0.12...",True,Manhattan,"{'building': '120', 'coord': [-73.9998042, 40....",{'$oid': '6095a34a7c34416a90d31fbd'},Olive'S,"[doughnuts, chocolate chip cookies, chocolate ...",True,"{'coordinates': [-73.9998042, 40.7251256], 'ty...",1.0,,7,,5.0


In [None]:
# NOTE(review): everything in this cell is commented out and has no effect.
# It appears to be carried over from a movie-dataset version of this notebook:
# the "fullplot" and "plot_embedding" columns it refers to are not among the
# columns of the restaurants DataFrame displayed above.

# Remove data points where the fullplot column is missing
# dataset_df = dataset_df.dropna(subset=["fullplot"])
# print("\nNumber of missing values in each column after removal:")
# print(dataset_df.isnull().sum())

# # Remove the plot_embedding from each data point in the dataset as we are going to create new embeddings with the new OpenAI embedding model "text-embedding-3-small"
# dataset_df = dataset_df.drop(columns=["plot_embedding"])

# dataset_df.head(5)

In [None]:
from llama_index.core.settings import Settings
from llama_index.llms.fireworks import Fireworks
from llama_index.embeddings.fireworks import FireworksEmbedding

# Chat model served by Fireworks, configured with temperature=0.
llm = Fireworks(
    model="accounts/fireworks/models/mixtral-8x7b-instruct",
    temperature=0,
    api_key=fw_api_key,
)

# Embedding model served by Fireworks; texts are sent in batches of up to 1024.
embed_model = FireworksEmbedding(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    embed_batch_size=1024,
    api_key=fw_api_key,
)

# Register both models on the global LlamaIndex Settings object.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

def _to_metadata_value(value):
    """Return `value` in a form that is valid as a flat metadata value.

    Strings, numbers, booleans and None are passed through unchanged, so a
    string such as a cuisine name is not wrapped in an extra pair of quotes
    and None does not become the string "null". Any other value (the nested
    dicts and lists in the records) is serialised to a JSON string.

    Args:
        value: A value taken from a JSON-decoded record.

    Returns:
        `value` itself if it is a str, int, float, bool or None; otherwise its
        JSON string representation.
    """
    if value is None or isinstance(value, (str, int, float)):
        return value
    return json.dumps(value)


# Convert the DataFrame to a JSON string representation
documents_json = dataset_df.to_json(orient="records")
# Load the JSON string into a Python list of dictionaries
documents_list = json.loads(documents_json)

llama_documents = []

for document in documents_list:
    # Drop the embedding stored in the record; tolerate records where the
    # column is absent instead of raising KeyError.
    document.pop("embedding", None)

    # Value for metadata must be one of (str, int, float, None), so serialise
    # every nested value. This is applied to all fields rather than a fixed
    # list so that no dict/list field is left unconverted.
    for key, value in list(document.items()):
        document[key] = _to_metadata_value(value)

    # The record is used both as the document text (as one JSON string) and as
    # the metadata, rendered through the templates below.
    llama_document = Document(
        text=json.dumps(document),
        metadata=document,
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )

    llama_documents.append(llama_document)

# Show the first document's content as rendered for the LLM and for the
# embedding model, which may differ depending on the metadata mode.
first_document = llama_documents[0]

llm_view = first_document.get_content(metadata_mode=MetadataMode.LLM)
print("\nThe LLM sees this: \n", llm_view)

embed_view = first_document.get_content(metadata_mode=MetadataMode.EMBED)
print("\nThe Embedding model sees this: \n", embed_view)


The LLM sees this: 
 Metadata: restaurant_id=>40366661
attributes=>{"Alcohol": "'none'", "Ambience": "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}", "BYOB": null, "BestNights": null, "BikeParking": null, "BusinessAcceptsBitcoin": null, "BusinessAcceptsCreditCards": null, "BusinessParking": "None", "Caters": "True", "DriveThru": null, "GoodForDancing": null, "GoodForKids": "True", "GoodForMeal": null, "HasTV": "True", "Music": null, "NoiseLevel": "'average'", "RestaurantsAttire": "'casual'", "RestaurantsDelivery": "True", "RestaurantsGoodForGroups": "True", "RestaurantsReservations": "True", "RestaurantsTableService": "False", "WheelchairAccessible": "True", "WiFi": "'free'"}
cuisine=>"Tex-Mex"
DogsAllowed=>None
OutdoorSeating=>True
borough=>"Manhattan"
address=>{"building": "627", "coord": [-73.975981, 40.745132], "street": "2 Avenue", "zipcode": "10016"}
_id=>{'$oid': '

In [None]:
# Inspect the first Document object; as the cell's last expression, its repr is
# displayed as the cell output.
llama_documents[0]

Document(id_='a4e02dc9-3370-4bbd-8207-b7cb84f802ea', embedding=None, metadata={'restaurant_id': '40366661', 'attributes': '{"Alcohol": "\'none\'", "Ambience": "{\'romantic\': False, \'intimate\': False, \'classy\': False, \'hipster\': False, \'divey\': False, \'touristy\': False, \'trendy\': False, \'upscale\': False, \'casual\': False}", "BYOB": null, "BestNights": null, "BikeParking": null, "BusinessAcceptsBitcoin": null, "BusinessAcceptsCreditCards": null, "BusinessParking": "None", "Caters": "True", "DriveThru": null, "GoodForDancing": null, "GoodForKids": "True", "GoodForMeal": null, "HasTV": "True", "Music": null, "NoiseLevel": "\'average\'", "RestaurantsAttire": "\'casual\'", "RestaurantsDelivery": "True", "RestaurantsGoodForGroups": "True", "RestaurantsReservations": "True", "RestaurantsTableService": "False", "WheelchairAccessible": "True", "WiFi": "\'free\'"}', 'cuisine': '"Tex-Mex"', 'DogsAllowed': None, 'OutdoorSeating': True, 'borough': '"Manhattan"', 'address': '{"buildin

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split every Document into nodes using SentenceSplitter's default settings.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)

# There are 25k documents, so we need to do batching. Fortunately LlamaIndex provides good batching
# for embedding models, and we are going to rely on the __call__ method for the model to handle this
# NOTE(review): `nodes` (not `node_embeddings`) is what gets written to the vector store later on,
# so this call presumably attaches the embeddings to the nodes in place - confirm against the
# LlamaIndex embedding `__call__` implementation.
node_embeddings = embed_model(nodes)

Ensure your database, collection and vector search index are set up on MongoDB Atlas; otherwise the following steps will not work as expected.


 - For assistance with database cluster setup and obtaining the URI, refer to this [guide](https://www.mongodb.com/docs/guides/atlas/cluster/) for setting up a MongoDB cluster, and this [guide](https://www.mongodb.com/docs/guides/atlas/connection-string/) to get your connection string. 

 - Once you have successfully created a cluster, create the database and collection within the MongoDB Atlas cluster by clicking “+ Create Database”. The database will be named `movies`, and the collection will be named `movies_records`; these must match the `DB_NAME` and `COLLECTION_NAME` values used in the code below.

 - Creating a vector search index within the `movies_records` collection is essential for efficient document retrieval from MongoDB into our development environment. To achieve this, refer to the official [guide](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/) on vector search index creation. Name the index `vector_index`, since that is the `index_name` the code below passes to `MongoDBAtlasVectorSearch`.



In [None]:
import pymongo
from google.colab import userdata


def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the deployment is reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A `pymongo.MongoClient` that has answered a `ping`, or None if the
        connection failed.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily: constructing it does not contact the
        # server, so an unreachable deployment would otherwise only surface on
        # the first real operation. A `ping` forces a round trip now. On
        # failure it raises only after the server selection timeout elapses.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        # Release the background resources of the unusable client.
        if client is not None:
            client.close()
        return None

    print("Connection to MongoDB successful")
    return client


# The connection string is read from the Colab secrets store (`userdata`).
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Stop here: continuing without a URI would only fail later, and less clearly.
    raise ValueError("MONGO_URI is not set in the Colab secrets (userdata)")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client returns None on failure; indexing None below would raise
    # an unhelpful TypeError, so raise a descriptive error instead.
    raise ConnectionError(
        "Could not connect to MongoDB; check MONGO_URI and network access"
    )

# Database and collection that back the vector store.
DB_NAME = "movies"
COLLECTION_NAME = "movies_records"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

Connection to MongoDB successful


In [None]:
# To ensure we are working with a fresh collection
# delete any existing records in the collection
# WARNING: the empty filter `{}` matches every document, so this is destructive;
# only run it against a collection dedicated to this notebook.
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000000a'), 'opTime': {'ts': Timestamp(1708000722, 1), 't': 10}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1708000722, 1), 'signature': {'hash': b'\xd8\x1a\xaci\xf5EN+\xe2\xd1\xb3y8.${u5P\xf3', 'keyId': 7320226449804230661}}, 'operationTime': Timestamp(1708000722, 1)}, acknowledged=True)

In [None]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Name of the Atlas vector search index the store is configured to use.
VECTOR_INDEX_NAME = "vector_index"

# Bind the vector store to the target database and collection.
vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name=VECTOR_INDEX_NAME,
)

# Write the nodes to the collection.
vector_store.add(nodes)

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext

# Build the index directly from the existing vector store.
index = VectorStoreIndex.from_vector_store(vector_store)

In [None]:
import pprint
from llama_index.core.response.notebook_utils import display_response

# Natural-language question put to the query engine.
query = "Recommend a restaurants suitable for the christmas season and justify your selecton"

# Query engine configured with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(query)

# Render the answer, then dump the source nodes attached to the response.
display_response(response)
pprint.pprint(response.source_nodes)

**`Final Response:`** The movie "Romancing the Stone" would be a suitable romantic movie for the Christmas season. It is a romantic adventure film that follows a romance writer who sets off on a dangerous adventure to rescue her kidnapped sister. The movie has elements of romance, adventure, and comedy, making it an entertaining choice for the holiday season. Additionally, the movie has received positive reviews and has been nominated for awards, indicating its quality.

[NodeWithScore(node=TextNode(id_='c6bbc236-e21d-49ab-b43d-db920b4946e6', embedding=None, metadata={'awards': '{"nominations": 2, "text": "Nominated for 1 Oscar. Another 6 wins & 2 nominations.", "wins": 7}', 'metacritic': None, 'rated': 'PG', 'fullplot': "Joan Wilder, a mousy romance novelist, receives a treasure map in the mail from her recently murdered brother-in-law. Meanwhile, her sister Elaine is kidnapped in Colombia and the two criminals responsible demand that she travel to Colombia to exchange the map for her sister. Joan does, and quickly becomes lost in the jungle after being waylayed by Zolo, a vicious and corrupt Colombian cop who will stop at nothing to obtain the map. There, she meets an irreverent soldier-of-fortune named Jack Colton who agrees to bring her back to civilization. Together, they embark upon an adventure that could be straight out of Joan's novels.", 'title': 'Romancing the Stone', 'writers': '["Diane Thomas"]', 'languages': '["English", "Spanish", "Frenc