# Load required modules

In [None]:
from pymongo import MongoClient
from pymongo.database import Database
import pandas as pd

# Connect to the NoSQL database
You'll need to build the connection string based on the information in your MongoDB Atlas cluster.

In [None]:
# Placeholder: replace it with the connection string of your own Atlas cluster.
connection_string = "PUT HERE THE CONNECTION STRING FROM ATLAS"

# Create the client first, then obtain a handle to the database 'my_second_db'.
client = MongoClient(connection_string)
my_second_db = client['my_second_db']


# Insert data into a new collection (table) 'customer'

In [None]:
# MongoDB stores documents in the JSON format. Each dict below is one document
# and corresponds to one row in a relational DB.
docs = [
    {'customer_id': 1, 'region': 1},
    {'customer_id': 2, 'region': 2},
    {'customer_id': 3, 'region': 2},
    {'customer_id': 4, 'region': 2},
    {'customer_id': 5, 'region': 1},
]

# Insert all documents into the collection 'customer'
# (the collection is created automatically).
my_second_db.customer.insert_many(docs)

# Read the inserted data

In [None]:
# An unfiltered find() yields a cursor over every document in 'customer'.
cursor = my_second_db['customer'].find()

# Materialise the cursor into a DataFrame; the bare name on the last line
# makes the notebook render it as a table.
df = pd.DataFrame([document for document in cursor])
df

# Check which data is available on the single nodes
## Node 0
MongoDB Atlas labels the nodes of the cluster as "shards" in the connection string (shards 00, 01 and 02).

In [None]:
# Connection string for node 0 only: paste the Atlas connection string and
# remove the shards 01 and 02 from it.
connection_string_shard0 = "PUT HERE THE CONNECTION STRING FROM ATLAS"

# Separate client for this node, opening the database 'my_second_db' on it.
client_shard0 = MongoClient(connection_string_shard0)
my_second_db_shard0 = client_shard0['my_second_db']

# Read every document of 'customer' through this connection.
cursor = my_second_db_shard0['customer'].find()

# Show the result as a DataFrame for better visualization.
df = pd.DataFrame([document for document in cursor])
df

## Node 1

In [None]:
# Connection string for node 1 only: paste the Atlas connection string and
# remove the shards 00 and 02 from it.
connection_string_shard1 = "PUT HERE THE CONNECTION STRING FROM ATLAS"

# Separate client for this node, opening the database 'my_second_db' on it.
client_shard1 = MongoClient(connection_string_shard1)
my_second_db_shard1 = client_shard1['my_second_db']

# Read every document of 'customer' through this connection.
cursor = my_second_db_shard1['customer'].find()

# Show the result as a DataFrame for better visualization.
df = pd.DataFrame([document for document in cursor])
df

## Node 2

In [None]:
# Connection string for node 2 only: paste the Atlas connection string and
# remove the shards 00 and 01 from it.
connection_string_shard2 = "PUT HERE THE CONNECTION STRING FROM ATLAS"

# Separate client for this node, opening the database 'my_second_db' on it.
client_shard2 = MongoClient(connection_string_shard2)
my_second_db_shard2 = client_shard2['my_second_db']

# Read every document of 'customer' through this connection.
cursor = my_second_db_shard2['customer'].find()

# Show the result as a DataFrame for better visualization.
df = pd.DataFrame([document for document in cursor])
df

# Result: Replication is Active
Our MongoDB cluster replicates the data to all three nodes. This is called replication, and our MongoDB is configured such that the three nodes form a replica set for our database. Replication keeps the database available in case one (or two) nodes are down. Furthermore, it allows read accesses to be spread among the three nodes, hence improving (i.e. lowering) latency.

# How about Horizontal Scaling?
The free tier of MongoDB Atlas that we are using does not provide the functionality of "sharding" data. Sharding means distributing data across different nodes (e.g. based on the region - put region 1 data on node 1 and region 2 data on node 2). Thus, horizontal scaling is not possible with this free cluster, since we will always see all data on all three nodes (due to replication). <br> <br>
By the way: Sharding with three nodes and a three-node replica set does not make sense at all.
In a large-scale commercial setup we would, for example, use six nodes forming two replica sets. Three of the nodes hold the data for region 1, and the other three nodes hold the data for region 2.
<br><br>
<b>The Atlas MongoDB free tier is, hence, not made for scaling but just for getting to know MongoDB.</b>

## Let us clean up

In [None]:
# Clean up: delete every document from the 'customer' collection, i.e. the
# collection this notebook inserted into. The empty filter {} matches all
# documents, so re-running the notebook does not insert duplicates.
my_second_db.customer.delete_many({})