# NavigatorGPT - MongoDB Processing

In [None]:
pip install openpyxl

In [661]:
import json
import pymongo
import pandas as pd
import pprint
import os
from pymongo import MongoClient, UpdateOne
from bson.objectid import ObjectId
from datetime import datetime

# Load the MongoDB connection string from the JSON file
PATH_TO_SECRET_JSON = '/home/jovyan/keys/mongodb_key.json'
with open(PATH_TO_SECRET_JSON, encoding='utf-8') as f:
    MONGODB_URI = json.load(f)['connection_string']

# Fail fast when the secret file holds no usable connection string. Merely
# printing a message here would leave `client` undefined (or, worse, bound to a
# stale client from an earlier run of this cell) and the lines below would then
# fail with an unrelated error.
if not MONGODB_URI:
    raise ValueError(
        f"Failed to initialize the MongoDB client: no connection string in {PATH_TO_SECRET_JSON}"
    )

# Extract the database name from the connection string: the last path segment,
# with any '?option=value' query string stripped off.
db_name = MONGODB_URI.split('/')[-1].split('?')[0]
if not db_name:
    raise ValueError(
        "Failed to initialize the MongoDB client: the connection string does not name a database"
    )

# Create a MongoDB client using the connection string.
# NOTE(review): per the PyMongo docs the constructor does not wait for a server
# connection, so this message confirms construction only, not reachability.
client = MongoClient(MONGODB_URI)
print("The MongoDB client has been initialized.")

# Print the version of the pymongo package
pymongo_version = pymongo.__version__
print(f"The version of the pymongo package is {pymongo_version}")

# Connect to the specified MongoDB database and collection
db = client[db_name]
collection = db.attractions

The MongoDB client has been initialized.
The version of the pymongo package is 4.4.1


In [None]:
# Look up a single attraction in the 'attractions' collection by its primary key
lookup_filter = {"_id": ObjectId("6450c55c3879cf70e53ff044")}
document = collection.find_one(lookup_filter)

# Pretty-print whatever the query returned
pprint.pprint(document)


**Description:**
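We have a dataset of attractions with specific fields to be updated in the database. Each record carries the attraction's `id` (the document's `_id` as a string) plus the fields to set, keyed by their dotted field path. Below is the dataset: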

In [284]:
# Landmark updates, one row per attraction:
# (attraction id, LP number, designation date as an ISO-8601 string).
landmark_rows = [
    ("6432ea18a90bce6b6c83bacd", "LP-00597", "1970-08-18T00:00:00"),
    ("6451f7e53879cf70e53ff088", "LP-00599", "1970-03-31T00:00:00"),
]

# Expand each row into an update record: the 'id' key plus the fields to set,
# keyed by dotted field path. The date is wrapped in a {"$date": ...} mapping.
data = [
    {
        "id": attraction_id,
        "landmark.lpNumber": lp_number,
        "landmark.landmarkType": "Individual Landmark",
        "landmark.designationDate": {"$date": designated_on},
    }
    for attraction_id, lp_number, designated_on in landmark_rows
]



In [None]:
# Preview the first five records of the dataset
data[:5]

In [663]:
# Read the JSON array of update records from the import file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which differs between operating systems).
IMPORT_FILE = "../data/attractions/Location-Import_2023_10_02_03_05_00.json"
with open(IMPORT_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Preview the first five records
data[:5]

[{'id': '64698bdd9e866c1d68eb577b', 'loc.bbl': 1008710010},
 {'id': '64a4b36ffc95c50bc98dcf99', 'loc.bbl': 3070710130},
 {'id': '645bf5b4e6fcaf3f17faafe6', 'loc.bbl': 1012580040},
 {'id': '64794de4426b0f007933dd03', 'loc.bbl': 1003920010},
 {'id': '6477aee6426b0f007933dce8', 'loc.bbl': 5000630050}]

**Description:**
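Now we'll loop through the dataset, build one update operation per record, and apply them all to the MongoDB collection in a single bulk write. Each operation is an upsert, so a record whose `id` is not yet in the collection is inserted rather than skipped.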

In [665]:
def build_update_fields(item):
    """Flatten one import record into a ``$set`` payload keyed by dotted field path.

    The top-level ``id`` key is skipped: it selects the document and is not a
    field to write. A mapping of exactly ``{"$date": "<ISO-8601 string>"}`` is
    converted to a ``datetime``. Any other mapping is expanded so that every
    leaf is written under ``"<key>.<sub_key>"``; previously all sub-values were
    written to the same key, so only the last one survived. Every other value
    is passed through unchanged.

    Args:
        item: One record from the dataset (a dict with an ``id`` key plus the
            fields to set).

    Returns:
        A dict mapping field path to value; empty when the record carries no
        fields besides ``id``.

    Raises:
        ValueError: If a ``$date`` string is not a valid ISO-8601 timestamp.
    """
    fields = {}

    def add(path, value):
        if not isinstance(value, dict):
            fields[path] = value
        elif set(value) == {"$date"}:
            iso_text = value["$date"]
            # datetime.fromisoformat() only accepts a trailing 'Z' from
            # Python 3.11 on; spell it as an explicit UTC offset instead.
            if isinstance(iso_text, str) and iso_text.endswith("Z"):
                iso_text = iso_text[:-1] + "+00:00"
            fields[path] = datetime.fromisoformat(iso_text)
        else:
            for sub_key, sub_value in value.items():
                add(f"{path}.{sub_key}", sub_value)

    for key, value in item.items():
        if key != 'id':
            add(key, value)
    return fields


# Initialize counters and operations list
totalRecords = len(data)
recordsUpdated = 0
operations = []

# Prepare bulk update operations for each record in data
for item in data:
    update_fields = build_update_fields(item)

    # A record with nothing but an 'id' has nothing to write; an empty $set is
    # a no-op at best (and older MongoDB servers reject it), so skip it.
    if not update_fields:
        continue

    # upsert=True: an id that is not yet in the collection is inserted as a new
    # document holding only these fields.
    # NOTE(review): confirm inserting such partial documents is intended.
    operations.append(
        UpdateOne(
            {"_id": ObjectId(item['id'])},
            {"$set": update_fields},
            upsert=True
        )
    )

# Execute the update operations in bulk. bulk_write() raises InvalidOperation
# when handed an empty list, so only call it when there is work to do.
if operations:
    result = collection.bulk_write(operations)

    # modified_count only counts documents whose contents actually changed, so
    # re-running an import that was already applied reports 0 here.
    recordsUpdated = result.modified_count + len(result.upserted_ids)

# Output the results
print(f"Total records: {totalRecords}")
print(f"Records updated: {recordsUpdated}")


Total records: 67
Records updated: 0


**Description:**
Print out the results, showing the total number of records processed and the number of records updated.

In [652]:
# Report the total and updated record counts, one per line.
summary_lines = (
    f"Total records: {totalRecords}",
    f"Records updated: {recordsUpdated}",
)
print("\n".join(summary_lines))

Total records: 37
Records updated: 17
