In [0]:
# Declare the notebook's text widgets: (widget name, label shown in the UI).
# Every widget starts out with an empty default value.
for widget_name, widget_label in (
    ("collection_name", "Collection Name"),
    ("output_path", "Output Path"),
    ("database_name", "Database Name"),
):
    dbutils.widgets.text(widget_name, "", widget_label)

In [0]:
import json
import logging
from pathlib import Path

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, OperationFailure



In [0]:
# Read the current value of each widget into a notebook-level variable.
database_name, collection_name, output_path = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("database_name", "collection_name", "output_path")
)


In [0]:
# Fetch the MongoDB connection string from the "crossy" secret scope.
uri = dbutils.secrets.get(
    scope="crossy",
    key="mongodb_uri",
)

In [0]:
# Validate the output path before doing any work: the widget defaults to an
# empty string, which Path() would silently turn into the current directory.
if not output_path.strip():
    raise ValueError("Widget 'output_path' must be set to the target file path")

output_file = Path(output_path)
output_dir = str(output_file.parent)
print(f"Output file: {output_file}")
print(f"Output directory: {output_dir}")

# Create the output directory if it doesn't exist. The export writes with the
# local file API (open()), so the directory is created through that same API;
# dbutils.fs defaults to the dbfs:/ scheme and could create a different
# directory than the one open() later writes to.
output_file.parent.mkdir(parents=True, exist_ok=True)


In [0]:
# Open the MongoDB connection. The connection string is read from the secret
# store, so it is intentionally not echoed to the notebook output.
print("Connecting to MongoDB")
client = MongoClient(uri)

# Test connection: a ping forces a round trip to the server so connectivity
# or authentication problems surface here rather than at the first query.
try:
    client.admin.command('ping')
except (ConnectionFailure, OperationFailure):
    # Release the client before propagating the error.
    client.close()
    raise
print("Successfully connected to MongoDB")
        

In [0]:
# Resolve the database and collection handles from the widget values.
db = client[database_name]
collection = db[collection_name]

# Report whether the requested collection is listed in the database.
if collection_name in db.list_collection_names():
    print(f"Collection '{collection_name}' does exist in database '{database_name}'")
else:
    print(f"Collection '{collection_name}' does not exist in database '{database_name}'")
        

        

In [0]:
# Get collection count for progress tracking
total_docs = collection.count_documents({})
print(f"Found {total_docs} documents in collection '{collection_name}'")

if total_docs == 0:
    print(f"Collection '{collection_name}' is empty")
    # Create empty file
    output_file.touch()
    # The client is deliberately left open here: the following cells still
    # query `collection`, and PyMongo 4+ raises InvalidOperation when a
    # closed MongoClient is used.
        

In [0]:
# Start the JSONL export with a zeroed document counter and announce the target file.
exported_count = 0
print(f"Exporting to {output_file}")

In [0]:
# Reset the counter in this cell so that re-running it does not add to a
# count left over from a previous run.
exported_count = 0

# Stream the collection to the output file, one JSON document per line.
# Using the cursor as a context manager closes it even if serialization or
# writing fails part-way through.
with open(output_file, 'w', encoding='utf-8') as f, collection.find({}) as cursor:
    for doc in cursor:
        # default=str stringifies values json cannot encode natively (e.g. ObjectId)
        doc_json = json.dumps(doc, default=str, ensure_ascii=False)
        f.write(doc_json + '\n')
        exported_count += 1

        # Log progress every 10 documents
        if exported_count % 10 == 0:
            print(f"Exported {exported_count}/{total_docs} documents...")

print(f"Successfully exported {exported_count} documents")


In [0]:
# Report the size of the exported file in megabytes and in raw bytes.
file_stats = output_file.stat()
file_size = file_stats.st_size
file_size_mb = file_size / 1024 ** 2
print(f"Output file size: {file_size_mb:.2f} MB ({file_size:,} bytes)")