In [1]:
#!pip install pymongo

In [2]:
import pymongo

In [3]:
client = pymongo.MongoClient("mongodb://localhost:27017")

In [4]:
# This is the database (document) name
db = client["test"]

In [5]:
# A document isn't created until at least one collection is.
client.list_database_names()

['admin', 'config', 'local']

In [6]:
# This is the table (collection) name
coll = db["testCollection"]

In [7]:
# The collection isn't created until data is inserted
db.list_collection_names()

[]

In [8]:
# MongoDB data takes the form of nested dictionarie or JSON objects
testDict = {"name": "Sam", "Student Number": 42}

In [9]:
res = coll.insert_one(testDict)

In [10]:
client.list_database_names()

['admin', 'config', 'local', 'test']

In [11]:
db.list_collection_names()

['testCollection']

In [12]:
# If the id (_id) is not specified, it is automatically created.
# These ids have to be unique.
res.inserted_id

ObjectId('6384b286a027f26e0334ff13')

In [13]:
# Will give an error, since we're trying to add the same object.
# It was automatically given the same id number

#testDict = {"name": "Sam", "Student Number": 42}

In [14]:
testDict = {"name": "Sam", "Student Number": 42, "_id": 9}

In [15]:
res = coll.insert_one(testDict)

In [16]:
res.inserted_id

9

In [17]:
testDict2 = [
    {"name": "Sean", "Student Number" : 12},
    {"name": "Jessica", "Student Number" : 23},
    {"name": "Mary", "Student Number" : 24},
    {"name": "Naima", "Student Number" : 25},
    {"name": "Caoilfhionn", "Student Number" : 26},
    {"name": "Muhammad", "Student Number" : 27},
    {"name": "David", "Student Number" : 28},
    
]

In [18]:
# Will give an error: we are adding more than one!
#res = coll.insert_one(testDict2)

In [19]:
res = coll.insert_many(testDict2)

In [20]:
res

<pymongo.results.InsertManyResult at 0x1ef3f8df6a0>

In [21]:
res.inserted_ids

[ObjectId('6384b287a027f26e0334ff14'),
 ObjectId('6384b287a027f26e0334ff15'),
 ObjectId('6384b287a027f26e0334ff16'),
 ObjectId('6384b287a027f26e0334ff17'),
 ObjectId('6384b287a027f26e0334ff18'),
 ObjectId('6384b287a027f26e0334ff19'),
 ObjectId('6384b287a027f26e0334ff1a')]

In [22]:
# Leave empty to get all rows
res = coll.find()

# Equivilant calls:
#res = coll.find({})
#res = coll.find({},{})

In [23]:
# Use a dictionary to select only some rows
res = coll.find({"name": "Sam"})

In [24]:
for i in res:
    print(i)

{'_id': ObjectId('6384b286a027f26e0334ff13'), 'name': 'Sam', 'Student Number': 42}
{'_id': 9, 'name': 'Sam', 'Student Number': 42}


In [25]:
# The second dictionary specifies the fields (columns) to return
res = coll.find({"name": "Sam"}, {"name": 0})
for i in res:
    print(i)

{'_id': ObjectId('6384b286a027f26e0334ff13'), 'Student Number': 42}
{'_id': 9, 'Student Number': 42}


1 means True: mongo will return only those columns

0 means false: mongo will return everything **except** those columns

Can only have **either** 1s or 0s

Exception: \_id can be different, as it will always be returned unless set to 0

In [26]:
res = coll.find({"name": "Sam"}, {"name": 1, "_id": 0})
for i in res:
    print(i)

{'name': 'Sam'}
{'name': 'Sam'}


In [27]:
res = coll.find({
    "name": {"$gt": "A", "$lt": "M"} 
})
for i in res:
    print(i)

{'_id': ObjectId('6384b287a027f26e0334ff15'), 'name': 'Jessica', 'Student Number': 23}
{'_id': ObjectId('6384b287a027f26e0334ff18'), 'name': 'Caoilfhionn', 'Student Number': 26}
{'_id': ObjectId('6384b287a027f26e0334ff1a'), 'name': 'David', 'Student Number': 28}


In [28]:
res = coll.find({
    "name": {"$regex": "a$"} 
})
for i in res:
    print(i)

{'_id': ObjectId('6384b287a027f26e0334ff15'), 'name': 'Jessica', 'Student Number': 23}
{'_id': ObjectId('6384b287a027f26e0334ff17'), 'name': 'Naima', 'Student Number': 25}


# Using Actual Data

Tweets about the recent US midterm elections

Source: I left a Tweet Stream running for a few hours (will show how later, hopefully!)

In [29]:
import json

In [30]:
# Change to where you have the file stored
with open(r"C:\Users\sweis\Downloads\Midterms Tweets\Midterms Tweets.json", encoding="utf8") as f:
    tweets = json.load(f)

In [31]:
tweets['0']

{'data': {'edit_history_tweet_ids': ['1590047656314691584'],
  'id': '1590047656314691584',
  'text': 'RT @CologeroGetz: Arizona! Here are some important names to remember and pictures to share on your social media today!\n\nAbe Hamadeh for Att…'},
 'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]}

In [32]:
db = client["MidtermTweets"]

In [33]:
coll = db["MidtermTweets1"]

In [34]:
# Will give an error, since the document size would be too big
#coll.insert_many([tweets])

In [35]:
tweets

{'0': {'data': {'edit_history_tweet_ids': ['1590047656314691584'],
   'id': '1590047656314691584',
   'text': 'RT @CologeroGetz: Arizona! Here are some important names to remember and pictures to share on your social media today!\n\nAbe Hamadeh for Att…'},
  'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]},
 '1': {'data': {'edit_history_tweet_ids': ['1590047663289823234'],
   'id': '1590047663289823234',
   'text': 'RT @ardenthistorian: Wahltag, das Signal für diverses "was kümmerst du dich um die USA kümmer dich lieber um Deutschland" Gepöbele. Sweet p…'},
  'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]},
 '2': {'data': {'edit_history_tweet_ids': ['1590047663453519872'],
   'id': '1590047663453519872',
   'text': '🇺🇸 #Midterms : les Américains aux urnes, entre «peur pour la démocratie» et envie de «sauver» le pays \n\nIls sont des dizaines de millions à avoir attendu le jour J pour exprimer leur choix. Reportage @fredericautran &amp; @julien

In [36]:
len(tweets)

75567

In [37]:
tweets1 = {}
tweets2 = {}
for num in range(len(tweets)-1):
    if num < len(tweets)//2:
        tweets1[str(num)] = tweets[str(num)]
    else:
        tweets2[str(num)] = tweets[str(num)]

In [38]:
len(tweets2)

37783

In [39]:
coll.insert_many([tweets1])

<pymongo.results.InsertManyResult at 0x1ef3f8df9d0>

In [40]:
tweets1['0']

{'data': {'edit_history_tweet_ids': ['1590047656314691584'],
  'id': '1590047656314691584',
  'text': 'RT @CologeroGetz: Arizona! Here are some important names to remember and pictures to share on your social media today!\n\nAbe Hamadeh for Att…'},
 'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]}

In [41]:
coll = db["MidtermTweets2"]

In [42]:
res = coll.insert_many([tweets2])

In [43]:
res

<pymongo.results.InsertManyResult at 0x1ef3f90c1f0>

In [44]:
res = coll.find()
print(res[0])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [45]:
res

<pymongo.cursor.Cursor at 0x1ef4260d8e0>

In [46]:
res[0]

{'_id': ObjectId('6384b288a027f26e0334ff1c'),
 '37783': {'data': {'edit_history_tweet_ids': ['1590088655481188352'],
   'id': '1590088655481188352',
   'text': "RT @AmandaMarcotte: The Elon Musk Twitter takeover is, in theory, a sideshow to the real story: American democracy in peril. But it's so pr…"},
  'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]},
 '37784': {'data': {'edit_history_tweet_ids': ['1590088655628283906'],
   'id': '1590088655628283906',
   'text': '“With regards to these latest accusations of election interference, #Moscow is understandably losing its patience. It requires either a certain lack of self-awareness, or an astonishing excess of arrogance, for the #UnitedStates to lecture any country on the question of meddling”'},
  'matching_rules': [{'id': '1590047576933687297', 'tag': 'election'}]},
 '37785': {'data': {'edit_history_tweet_ids': ['1590088657381523461'],
   'id': '1590088657381523461',
   'text': 'RT @AmoneyResists: 🚨 BREAKING: Chie