<a href="https://colab.research.google.com/github/stephenfrein/csc8491/blob/main/MongoClusterExamples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install a Python library that interacts with MongoDB
!pip install pymongo


In [None]:
# set up your credentials - same as your Oracle credentials
# these two values are spliced into the MongoDB connection string, and the
# username is also used as the name of your database in later cells
# NOTE: fill them in locally only - clear them again before sharing or
# committing this notebook so your password does not end up in version control
username = ''
password = ''

In [None]:
# test your connection - if it fails, check username/password
from urllib.parse import quote_plus

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# credentials embedded in a MongoDB URI must be percent-escaped; without this a
# username/password containing characters such as '@', ':', '/' or '%' yields
# an invalid (or silently wrong) connection string
uri = (
    "mongodb+srv://"
    + quote_plus(username)
    + ":"
    + quote_plus(password)
    + "@cluster0.yrg2exm.mongodb.net"
)
# create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # deliberately broad: print whatever went wrong (bad credentials, network
    # problems, ...) instead of a traceback so the cell still finishes
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [None]:
# OBSERVE THE PROPAGATION DELAY IN A CLUSTER

import datetime
import pandas as pd
import threading
import time
from pymongo import ReadPreference
from pymongo import WriteConcern

# CSV of transactions, hosted in an AWS S3 bucket
transactions_url = 'https://csc8491.s3.amazonaws.com/mongo_transactions_24.csv'
# collection that the transactions get loaded into
trans_collection = 'transactions'

# your database carries the same name as your username
db = client.get_database(username)
collection = db.get_collection(trans_collection)

# get rid of collection if it already exists
try:
  collection.drop()
except Exception as e:
  # report the failure and carry on so the rest of the cell still runs
  print(e)

# load our collection from the S3 file in one bulk write - we'll run this in a thread
def load_collection(collection, filename):
    """Read the CSV at `filename` and insert all of its rows into `collection`.

    collection: object exposing `insert_many` (a pymongo collection in this notebook)
    filename:   anything `pandas.read_csv` accepts (path, URL or file-like object)
    """
    frame = pd.read_csv(filename, header=0)
    # one dict per CSV row, keyed by column name
    records = frame.to_dict('records')
    collection.insert_many(records)

# poll the document count of a collection - used once against the primary and once against a secondary
def get_count(collection, type):
  """Print the document count of `collection` 20 times in a row.

  collection: object exposing `count_documents` (a pymongo collection in this notebook)
  type:       label printed in front of every reading (e.g. 'Primary' or 'Secondary')

  Each line shows the label, the 1-based reading number, the count and the
  current timestamp, so readings from different nodes can be compared while
  a load is in progress.
  """
  for attempt in range(1, 21):
    doc_total = collection.count_documents({})
    stamp = datetime.datetime.now()
    print(f"\n{type} {attempt}: {doc_total} {stamp}")

# handle used for loading the data
trans_load = db.get_collection(trans_collection)
# handle that reads the collection as it exists on the primary node
trans_primary = db.get_collection(
    trans_collection,
    read_preference=ReadPreference.PRIMARY,
)
# handle that reads the collection as it exists on a secondary node
trans_secondary = db.get_collection(
    trans_collection,
    read_preference=ReadPreference.SECONDARY,
)

# thread for loading data
t_load_data = threading.Thread(
    target=load_collection,
    args=(trans_load, transactions_url),
)
# thread for reading from primary
t_count_primary = threading.Thread(
    target=get_count,
    args=(trans_primary, 'Primary'),
)
# thread for reading from secondary
t_count_secondary = threading.Thread(
    target=get_count,
    args=(trans_secondary, 'Secondary'),
)

# start the clock
start = time.time()

# kick off the load first ...
t_load_data.start()
# ... and give it a head start before any counting begins
time.sleep(10)
# begin polling the primary
t_count_primary.start()
time.sleep(4)
# begin polling the secondary
t_count_secondary.start()


# block until all three threads have finished
t_load_data.join()
t_count_primary.join()
t_count_secondary.join()

# stop the clock
end = time.time()
print(f"Run took: {end - start} seconds")

# what are our final numbers once load is complete?
print(f"Final Primary: {trans_primary.count_documents({})}")
print(f"Final Secondary: {trans_secondary.count_documents({})}")



Primary 1: 0 2024-06-17 20:24:33.269300

Secondary 1: 16320 2024-06-17 20:24:35.295981

Secondary 2: 17728 2024-06-17 20:24:35.488988

Secondary 3: 19392 2024-06-17 20:24:35.684171

Secondary 4: 21440 2024-06-17 20:24:35.892662

Secondary 5: 24128 2024-06-17 20:24:36.112597

Secondary 6: 26624 2024-06-17 20:24:36.326563

Secondary 7: 28928 2024-06-17 20:24:36.544416

Secondary 8: 31168 2024-06-17 20:24:36.750335

Secondary 9: 33728 2024-06-17 20:24:36.995811

Secondary 10: 36224 2024-06-17 20:24:37.232339

Secondary 11: 39616 2024-06-17 20:24:37.554136

Primary 2: 39680 2024-06-17 20:24:37.644099

Secondary 12: 43200 2024-06-17 20:24:37.799665

Secondary 13: 47936 2024-06-17 20:24:38.104698

Primary 3: 48384 2024-06-17 20:24:38.192442

Secondary 14: 49984 2024-06-17 20:24:38.358458

Secondary 15: 51968 2024-06-17 20:24:38.598712

Secondary 16: 55040 2024-06-17 20:24:38.853086

Primary 4: 53312 2024-06-17 20:24:38.864975

Secondary 17: 58112 2024-06-17 20:24:39.144047

Primary 5: 61440

In [None]:
# OBSERVE THE PERFORMANCE IMPACTS OF DIFFERENT WRITE CONCERNS

import pandas as pd
import time
from pymongo import WriteConcern

# CSV of transactions, hosted in an AWS S3 bucket
transactions_url = 'https://csc8491.s3.amazonaws.com/mongo_transactions_24.csv'
# collection that the transactions get loaded into
trans_collection = 'transactions'

# your database carries the same name as your username
db = client.get_database(username)
collection = db.get_collection(trans_collection)

# get rid of collection if it already exists
try:
  collection.drop()
except Exception as e:
  # report the failure and carry on so the rest of the cell still runs
  print(e)

# function to load our collection from the S3 file
# do it in a loop to better see performance impacts
def load_collection(collection, filename, limit=1000):
    """Insert the first `limit` rows of a CSV into `collection`, one write per row.

    collection: object exposing `insert_one` (a pymongo collection in this notebook)
    filename:   anything `pandas.read_csv` accepts (path, URL or file-like object)
    limit:      number of leading rows to insert (default 1000); None inserts every row

    Rows are written individually on purpose: issuing one insert per document
    makes the per-write acknowledgement cost visible in the timing.
    """
    df = pd.read_csv(filename, header=0)
    # convert column-wise so every value keeps its own column's type; walking the
    # frame with iterrows() would coerce each row to a single dtype (ints -> floats)
    records = df.iloc[:limit].to_dict('records')
    for record in records:
        collection.insert_one(record)

# handle to the collection that the load writes through
# switch which of the assignments below is active to change the write concern
# and compare the resulting run times

# default gives majority - 2 nodes in this 3-node cluster (primary + 1 secondary)
trans_load = db.get_collection(trans_collection)
# setting write concern to 0 means no write acknowledgement needed - super fast
#trans_load = db.get_collection(trans_collection, write_concern=WriteConcern(w=0))
# setting write concern to 1 means just the primary has to acknowledge
#trans_load = db.get_collection(trans_collection, write_concern=WriteConcern(w=1))

# time the load from start to finish
start = time.time()
load_collection(trans_load, transactions_url)
end = time.time()
print(f"Run took: {end - start} seconds")

Run took: 0.8675277233123779 seconds
How many docs there: 1000
Ask the secondary: 1000
