In [1]:
#pip install requests

In [2]:
# Import all necessary package
import requests
from pymongo import MongoClient

In [3]:
# Initialize mongo client
client = MongoClient("mongodb://user:password@localhost:27017/?authSource=admin")

In [4]:
# Create local "nobel" database on the fly
db = client["nobel"]

In [5]:
for collection_name in ["nobelPrizes", "laureates"]:
    response = requests.get("https://api.nobelprize.org/2.1/{}".format(collection_name))

    documents = response.json()[collection_name]

    #print(documents)

    db[collection_name].insert_many(documents)
    

In [6]:
# Save a list of names of the databases managed by client
db_names = client.list_database_names()
print(db_names)

# Save a list of names of the collections managed by the "nobel" database
nobel_coll_names = client.nobel.list_collection_names()
print(nobel_coll_names)

['admin', 'config', 'local', 'nobel']
['nobelPrizes', 'laureates']


In [7]:
# Using collections created before
prizes_collections = db["nobelPrizes"]

laureates_collections = db["laureates"]

In [8]:
# Count documents

filter = {}

n_prizes = prizes_collections.count_documents(filter)

n_lauteares = laureates_collections.count_documents(filter)

In [9]:
print(f"count prizes: {n_prizes}")
print(f"count lauteares: {n_lauteares}")

count prizes: 50
count lauteares: 50


In [10]:
# fetch one documents to inspect
doc_prizes = prizes_collections.find_one(filter)

doc_prizes

{'_id': ObjectId('6660ccfd011df660366373b9'),
 'awardYear': '1901',
 'category': {'en': 'Chemistry', 'no': 'Kjemi', 'se': 'Kemi'},
 'categoryFullName': {'en': 'The Nobel Prize in Chemistry',
  'no': 'Nobelprisen i kjemi',
  'se': 'Nobelpriset i kemi'},
 'dateAwarded': '1901-11-12',
 'prizeAmount': 150782,
 'prizeAmountAdjusted': 10531894,
 'links': [{'rel': 'nobelPrize',
   'href': 'https://api.nobelprize.org/2/nobelPrize/che/1901',
   'action': 'GET',
   'types': 'application/json'}],
 'laureates': [{'id': '160',
   'knownName': {'en': "Jacobus H. van 't Hoff"},
   'fullName': {'en': "Jacobus Henricus van 't Hoff"},
   'portion': '1',
   'sortOrder': '1',
   'motivation': {'en': 'in recognition of the extraordinary services he has rendered by the discovery of the laws of chemical dynamics and osmotic pressure in solutions',
    'se': 'såsom ett erkännande av den utomordentliga förtjänst han inlagt genom upptäckten av lagarna för den kemiska dynamiken och för det osmotiska trycket i lö

In [11]:
doc_laureates = laureates_collections.find_one(filter)

doc_laureates

{'_id': ObjectId('6660ccff011df660366373d2'),
 'id': '745',
 'knownName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
 'givenName': {'en': 'A. Michael', 'se': 'A. Michael'},
 'familyName': {'en': 'Spence', 'se': 'Spence'},
 'fullName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
 'fileName': 'spence',
 'gender': 'male',
 'birth': {'date': '1943-00-00',
  'place': {'city': {'en': 'Montclair, NJ',
    'no': 'Montclair, NJ',
    'se': 'Montclair, NJ'},
   'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'},
   'cityNow': {'en': 'Montclair, NJ',
    'no': 'Montclair, NJ',
    'se': 'Montclair, NJ',
    'sameAs': ['https://www.wikidata.org/wiki/Q678437',
     'https://www.wikipedia.org/wiki/Montclair,_New_Jersey'],
    'latitude': '40.825930',
    'longitude': '-74.209030'},
   'countryNow': {'en': 'USA',
    'no': 'USA',
    'se': 'USA',
    'sameAs': ['https://www.wikidata.org/wiki/Q30'],
    'latitude': '39.828175',
    'longitude': '-98.579500'},
   'continent': {

In [12]:
# Get the fields present in each type of document
prize_fields = list(prizes_collections.find_one(filter).keys())
laureate_fields = list(laureates_collections.find_one({}).keys())

print(prize_fields)
print(laureate_fields)

['_id', 'awardYear', 'category', 'categoryFullName', 'dateAwarded', 'prizeAmount', 'prizeAmountAdjusted', 'links', 'laureates']
['_id', 'id', 'knownName', 'givenName', 'familyName', 'fullName', 'fileName', 'gender', 'birth', 'wikipedia', 'wikidata', 'sameAs', 'links', 'nobelPrizes']


In [13]:
# Finding documents
laureates_collections.count_documents({"gender": "male"})

48

In [14]:
# Create a filter for laureates who died in the USA
criteria = {'id': "745"}

# Save the count of these laureates
count = laureates_collections.count_documents(criteria)
print(count)

2


In [15]:
criteria_country = {"birth.place.country.en": ("USA")}

laureates_collections.count_documents(criteria_country)

8

In [16]:
# Save a filter for laureates born in the USA, Canada, Mexico or British Protectorate of Palestine
criteria = { "birth.place.country.en": 
                { "$in": ["USA", "Canada", "Mexico", "British Protectorate of Palestine"]}
             }

# Count them and save the count
count = db.laureates.count_documents(criteria)
print(count)

10


In [17]:
# Save a filter for laureates who died in the USA and were not born there
criteria_exists = { "birth.place.country.en": "USA",
               "bornCountry": { "$ne": "USA"}, 
             }

# Count them
count = db.laureates.count_documents(criteria_exists)
print(count)

8


In [18]:
# Save a filter for laureates who exists date.place false
criteria_exists_false = {"date.place": {"$exists": False}}

# Count them
db.laureates.count_documents(criteria_exists_false)


50

In [19]:
# Save a filter for laureates who exists date.place true
criteria_exists_true = {"date.keys": {"$exists": True}}

# Count them
db.laureates.find_one(criteria_exists_true)


In [20]:
# Save a filter for laureates who exists date.place true
criteria_exists_true = {"death": {"$exists": False}}

# Count them
db.laureates.find_one(criteria_exists_true)


{'_id': ObjectId('6660ccff011df660366373d2'),
 'id': '745',
 'knownName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
 'givenName': {'en': 'A. Michael', 'se': 'A. Michael'},
 'familyName': {'en': 'Spence', 'se': 'Spence'},
 'fullName': {'en': 'A. Michael Spence', 'se': 'A. Michael Spence'},
 'fileName': 'spence',
 'gender': 'male',
 'birth': {'date': '1943-00-00',
  'place': {'city': {'en': 'Montclair, NJ',
    'no': 'Montclair, NJ',
    'se': 'Montclair, NJ'},
   'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'},
   'cityNow': {'en': 'Montclair, NJ',
    'no': 'Montclair, NJ',
    'se': 'Montclair, NJ',
    'sameAs': ['https://www.wikidata.org/wiki/Q678437',
     'https://www.wikipedia.org/wiki/Montclair,_New_Jersey'],
    'latitude': '40.825930',
    'longitude': '-74.209030'},
   'countryNow': {'en': 'USA',
    'no': 'USA',
    'se': 'USA',
    'sameAs': ['https://www.wikidata.org/wiki/Q30'],
    'latitude': '39.828175',
    'longitude': '-98.579500'},
   'continent': {

In [21]:
# Save a filter for laureates who exists date.place true
criteria_exists_true = {"nobelPrizes.links.2": {"$exists": True}}

# Count them
db.laureates.count_documents(criteria_exists_true)


50

In [22]:
# Save a filter for laureates using distinct
criteria_distinct = "gender"

# Count them
db.laureates.distinct(criteria_distinct)


['female', 'male']

In [23]:
# distinct with dot notation
# Count them
db.laureates.distinct("nobelPrizes.prizeStatus")


['received']

In [24]:
# Countries recorded as countries of death but not as countries of birth
countries = set(db.laureates.distinct("diedCountry")) - set(db.laureates.distinct("bornCountry"))
print(countries)

set()


In [25]:
# The number of distinct countries of laureate affiliation for prizes
count = len(db.laureates.distinct("prizes.affiliations.country"))
print(count)

0


In [26]:
# The number of distinct countries of laureate affiliation for prizes
count = len(db.laureates.distinct("nobelPrizes.sortOrder"))
print(count)

3


In [27]:
# find documents and disitnct documents returning in a list
print(list(db.laureates.find({"nobelPrizes.sortOrder": "3"})))
print("--------")
print(db.laureates.distinct("nobelPrizes", {"nobelPrizes.sortOrder": "3"}))
print("--------")
print(db.laureates.distinct("nobelPrizes.sortOrder"))

[{'_id': ObjectId('6660ccff011df660366373da'), 'id': '843', 'knownName': {'en': 'Ada E. Yonath', 'se': 'Ada E. Yonath'}, 'givenName': {'en': 'Ada E.', 'se': 'Ada E.'}, 'familyName': {'en': 'Yonath', 'se': 'Yonath'}, 'fullName': {'en': 'Ada E. Yonath', 'se': 'Ada E. Yonath'}, 'fileName': 'yonath', 'gender': 'female', 'birth': {'date': '1939-06-22', 'place': {'city': {'en': 'Jerusalem', 'no': 'Jerusalem', 'se': 'Jerusalem'}, 'country': {'en': 'British Mandate of Palestine', 'no': 'Palestinamandatet', 'se': 'Brittiska Palestinamandatet'}, 'cityNow': {'en': 'Jerusalem', 'no': 'Jerusalem', 'se': 'Jerusalem', 'sameAs': ['https://www.wikidata.org/wiki/Q1218', 'https://www.wikipedia.org/wiki/Jerusalem'], 'latitude': '31.767983', 'longitude': '35.213809'}, 'countryNow': {'en': 'Israel', 'no': 'Israel', 'se': 'Israel', 'sameAs': ['https://www.wikidata.org/wiki/Q801'], 'latitude': '31.000000', 'longitude': '35.000000'}, 'continent': {'en': 'Asia', 'no': 'Asia', 'se': 'Asien'}, 'locationString': {

In [28]:
# In which countries have USA-born laureates had affiliations for their prizes?

# First solution
print(db.laureates.distinct('nobelPrizes.affiliations.country.en', {'date.place.country.en': 'USA'}))
print("-----------------------")
print("-----------------------")

# Second solution
# Aggregation Pipeline
pipeline = [
    {
        # Filter to select only laureates born in the USA
        "$match": {
            "bornCountry": "USA"
        }
    },
    {
        # Unwind the prizes array to process each prize affiliation individually
        "$unwind": "$prizes"
    },
    {
        # Unwind the affiliations array within each prize
        "$unwind": "$prizes.affiliations"
    },
    {
        # Project the desired fields (country of affiliations)
        "$project": {
            "affiliationCountry": "$prizes.affiliations.country"
        }
    },
    {
        # Group by affiliation country to get unique countries
        "$group": {
            "_id": "$affiliationCountry",
            "count": {"$sum": 1}  # Count of laureates with affiliations in each country
        }
    },
    {
        # Optionally, sort by count if needed
        "$sort": {
            "count": -1
        }
    }
]

# Execute the aggregation pipeline
list(db.laureates.aggregate(pipeline))

[]
-----------------------
-----------------------


[]

In [29]:
# Triple plays (mostly) all around
# Prizes can be shared, even by more than two laureates. 
#In fact, all prize categories but one – literature – have had prizes shared by three or more laureates.

# Save a filter for prize documents with three or more laureates
criteria = {"laureates.2": {"$exists": True}}

# Save the set of distinct prize categories in documents satisfying the criteria
triple_play_categories = set(db.prizes.distinct("category", criteria))
assert set(db.prizes.distinct("category")) - triple_play_categories == {"literature"}

AssertionError: 

In [36]:
# Matching arrays fields and opeartors

print(len(list(db.laureates.find({"nobelPrizes.category.en": "Physics"}))))

print(len(list(db.laureates.find({"nobelPrizes.category.en":{"$ne": "Physics"}}))))

print(db.laureates.count_documents({"nobelPrizes.category.en": {"$in":["Physics","Chemistry","Medicine"]}}))

print(db.laureates.count_documents({"nobelPrizes.category.en": {"$nin":["Physics","Chemistry","Medicine"]}}))

10
40
32
18


In [47]:
# Enter $elementMatch

print(db.laureates.count_documents({"nobelPrizes":{"$elemMatch":{"category.en": "Physics"}}}))
print("======================================================")
print(list(db.laureates.find({"nobelPrizes":{"$elemMatch":{"category.en": "Physics", "sortOrder":"3"}}})))
print("======================================================")
print(len(list(db.laureates.find({"nobelPrizes":{"$elemMatch":{"category.en": "Physics", "sortOrder":"3"}}}))))
print("======================================================")

10
[{'_id': ObjectId('6660ccff011df660366373db'), 'id': '866', 'knownName': {'en': 'Adam G. Riess', 'se': 'Adam G. Riess'}, 'givenName': {'en': 'Adam G.', 'se': 'Adam G.'}, 'familyName': {'en': 'Riess', 'se': 'Riess'}, 'fullName': {'en': 'Adam G. Riess', 'se': 'Adam G. Riess'}, 'fileName': 'riess', 'gender': 'male', 'birth': {'date': '1969-12-16', 'place': {'city': {'en': 'Washington, D.C.', 'no': 'Washington, D.C.', 'se': 'Washington, D.C.'}, 'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'}, 'cityNow': {'en': 'Washington, D.C.', 'no': 'Washington, D.C.', 'se': 'Washington, D.C.', 'sameAs': ['https://www.wikidata.org/wiki/Q61', 'https://www.wikipedia.org/wiki/Washington,_D.C.'], 'latitude': '38.899065', 'longitude': '-77.036523'}, 'countryNow': {'en': 'USA', 'no': 'USA', 'se': 'USA', 'sameAs': ['https://www.wikidata.org/wiki/Q30'], 'latitude': '39.828175', 'longitude': '-98.579500'}, 'continent': {'en': 'North America', 'no': 'Nord-Amerika', 'se': 'Nordamerika'}, 'locationString': {'

In [None]:
# Another query

db.laureates.count_documents({
    "prizes": {"$elemMatch": {
        "category": "physics",
        "share": {"$ne": "1"},
        "year": {"$lt": "1945"}}}})

In [None]:
# Query to calculte the ratio

# Save a filter for laureates with unshared prizes
unshared = {
    "prizes": {"$elemMatch": {
        "category": {"$nin": ["physics", "chemistry", "medicine"]},
        "share": "1",
        "year": {"$gte": "1945"},
    }}}

# Save a filter for laureates with shared prizes
shared = {
    "prizes": {"$elemMatch": {
        "category": {"$nin": ["physics", "chemistry", "medicine"]},
        "share": {"$ne": "1"},
        "year": {"$gte": "1945"},
    }}}

ratio = db.laureates.count_documents(unshared) / db.laureates.count_documents(shared)
print(ratio)

In [None]:
# Save a filter for organization laureates with prizes won before 1945
before = {
    "gender": "org",
    "prizes.year": {"$lt": "1945"},
    }

# Save a filter for organization laureates with prizes won in or after 1945
in_or_after = {
    "gender": "org",
    "prizes.year": {"$gte": "1945"},
    }

n_before = db.laureates.count_documents(before)
n_in_or_after = db.laureates.count_documents(in_or_after)
ratio = n_in_or_after / (n_in_or_after + n_before)
print(ratio)

In [60]:
# Distinct  filtering with regular expressions

print(db.laureates.find_one({"givenName.en":"Adam G."}))
print("========================================================")
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en":{"$regex":"A"}}))
print("========================================================")

{'_id': ObjectId('6660ccff011df660366373db'), 'id': '866', 'knownName': {'en': 'Adam G. Riess', 'se': 'Adam G. Riess'}, 'givenName': {'en': 'Adam G.', 'se': 'Adam G.'}, 'familyName': {'en': 'Riess', 'se': 'Riess'}, 'fullName': {'en': 'Adam G. Riess', 'se': 'Adam G. Riess'}, 'fileName': 'riess', 'gender': 'male', 'birth': {'date': '1969-12-16', 'place': {'city': {'en': 'Washington, D.C.', 'no': 'Washington, D.C.', 'se': 'Washington, D.C.'}, 'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'}, 'cityNow': {'en': 'Washington, D.C.', 'no': 'Washington, D.C.', 'se': 'Washington, D.C.', 'sameAs': ['https://www.wikidata.org/wiki/Q61', 'https://www.wikipedia.org/wiki/Washington,_D.C.'], 'latitude': '38.899065', 'longitude': '-77.036523'}, 'countryNow': {'en': 'USA', 'no': 'USA', 'se': 'USA', 'sameAs': ['https://www.wikidata.org/wiki/Q30'], 'latitude': '39.828175', 'longitude': '-98.579500'}, 'continent': {'en': 'North America', 'no': 'Nord-Amerika', 'se': 'Nordamerika'}, 'locationString': {'en':

In [77]:
# Begining and ending (and scaping)
from bson.regex import Regex

print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en": Regex("^Ar")}))
print("========================================================")
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en": Regex("^Br")}))
print("========================================================")
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en": Regex("^N")}))
print("========================================================")
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en":{"$regex":"A"}}))
print("========================================================")

#Example scaping 
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en":{"$regex":"Argentina \(some"}}))
print("========================================================")
print(db.laureates.distinct("birth.place.country.en", {"birth.place.country.en":{"$regex":"some Argentina\)$"}})) # the dollar sign match the end

['Argentina']
['British Mandate of Palestine', 'British Protectorate of Palestine']
['New Zealand']
['Argentina', 'French Algeria', 'USA']


In [None]:
# Finding names and surname init with G and S
db.laureates.count_documents({"firstname": Regex("^G"), "surname": Regex("^S")})

# Finding Germany in born country
from bson.regex import Regex

# Filter for laureates with "Germany" in their "bornCountry" value
criteria = {"bornCountry": Regex("Germany")}
print(set(db.laureates.distinct("bornCountry", criteria)))

# Filter for laureates with a "bornCountry" value starting with "Germany"
criteria = {"bornCountry": Regex("^Germany")}
print(set(db.laureates.distinct("bornCountry", criteria)))

# Fill in a string value to be sandwiched between the strings "^Germany " and "now"
criteria = {"bornCountry": Regex("^Germany " + "\(" + "now")}
print(set(db.laureates.distinct("bornCountry", criteria)))

#Filter for currently-Germany countries of birth. Fill in a string value to be sandwiched between the strings "now" and "$"
criteria = {"bornCountry": Regex("now" + " Germany\\)" + "$")}
print(set(db.laureates.distinct("bornCountry", criteria)))

# Save a filter for laureates with prize motivation values containing "transistor" as a substring
criteria = {"prizes.motivation": Regex("transistor")}

# Save the field names corresponding to a laureate's first name and last name
first, last = "firstname", "surname"
print([(laureate[first], laureate[last]) for laureate in db.laureates.find(criteria)])

# Projection Getting only what you need

In [86]:
# include only nobelPrizes.affiliations
# Exclude _id
# When projection is a json omit the second field in the result, only show the first field key

docs = db.laureates.find(filter={}, projection={"nobelPrizes.affiliations": 1, "_id": 0})
type(docs)

# convert to a list and slice
list(docs)[:3]

[{'nobelPrizes': [{'affiliations': [{'name': {'en': 'Stanford University',
       'no': 'Stanford University',
       'se': 'Stanford University'},
      'nameNow': {'en': 'Stanford University'},
      'city': {'en': 'Stanford, CA',
       'no': 'Stanford, CA',
       'se': 'Stanford, CA'},
      'country': {'en': 'USA', 'no': 'USA', 'se': 'USA'},
      'cityNow': {'en': 'Stanford, CA',
       'no': 'Stanford, CA',
       'se': 'Stanford, CA',
       'sameAs': ['https://www.wikidata.org/wiki/Q173813',
        'https://www.wikipedia.org/wiki/Stanford,_California'],
       'latitude': '37.424734',
       'longitude': '-122.163858'},
      'countryNow': {'en': 'USA',
       'no': 'USA',
       'se': 'USA',
       'sameAs': ['https://www.wikidata.org/wiki/Q30'],
       'latitude': '39.828175',
       'longitude': '-98.579500'},
      'continent': {'en': 'North America'},
      'locationString': {'en': 'Stanford, CA, USA',
       'no': 'Stanford, CA, USA',
       'se': 'Stanford, CA, USA'}}

In [88]:
# use gender:org to select organizations
# organizations have no bornCountry
# When projection is a list not omit fields in the result, only projected fields that exist are returned

docs_gender = db.laureates.find(filter={"gender":"female"}, projection=["birth.place.country.en","fullName.en"])

list(docs_gender)

[{'_id': ObjectId('6660ccff011df660366373da'),
  'fullName': {'en': 'Ada E. Yonath'},
  'birth': {'place': {'country': {'en': 'British Mandate of Palestine'}}}},
 {'_id': ObjectId('66635a81c2bb6907cee46cc6'),
  'fullName': {'en': 'Ada E. Yonath'},
  'birth': {'place': {'country': {'en': 'British Mandate of Palestine'}}}}]

In [89]:
# Simple aggregation

n_pnobal_prizes = 0
for doc in db.laureates.find({},["nobelPrizes"]):
    n_pnobal_prizes += len(doc["nobelPrizes"])

print(n_pnobal_prizes)

50


In [90]:
sum([len(doc["nobelPrizes"]) for doc in db.laureates.find({},["nobelPrizes"])])

50

In [None]:
# Querys using diofferent projections


db.laureates.find_one({"prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}},["firstname","surname", "prizes"])

db.laureates.find_one({"prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}},["firstname","surname", "prizes.share"])

# This option is the best to select the fullname  and prizes share info excluding the _id
db.laureates.find_one({"prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}},{"firstname":1,"surname":1,"prizes.share":1,"_id":0})

In [None]:
# Find laureates whose first name starts with "G" and last name starts with "S"
docs = db.laureates.find(
       filter= {"firstname" : {"$regex" : "^G"},
                  "surname" : {"$regex" : "^S"}  })
# Print the first document 
print(docs[0])

# Use projection to select only firstname and surname
docs = db.laureates.find(
        filter= {"firstname" : {"$regex" : "^G"},
                 "surname" : {"$regex" : "^S"}  },
	projection= ["firstname", "surname"]  )

# Iterate over docs and concatenate first name and surname
full_names = [doc["firstname"] + " " + doc["surname"]  for doc in docs]

# Print the full names
print(full_names)

# Save documents, projecting out laureates share
prizes = db.prizes.find({}, ["laureates.share"])

# Iterate over prizes
for prize in prizes:
    # Initialize total share
    total_share = 0
    
    # Iterate over laureates for the prize
    for laureate in prize["laureates"]:
        # add the share of the laureate to total_share
        total_share += 1 / float(laureate["share"])
        
    # Print the total share    
    print(total_share)   

# Sorting post-query with python

In [94]:
docs = list(db.nobelPrizes.find({"category.en":"Physics"},["awardYear"]))

print([doc["awardYear"] for doc in docs][:5])

['1901', '1902', '1903', '1904', '1905']


In [97]:
# USing itemgetter to get the keys
from operator import itemgetter

# Sort list by avwardYears
docs = sorted(docs, key=itemgetter("awardYear"))
print([doc["awardYear"] for doc in docs][:5])

print("=====================================================")

# Sort list reverse by avwardYears
docs = sorted(docs, key=itemgetter("awardYear"), reverse=True)
print([doc["awardYear"] for doc in docs][:5])


['1901', '1901', '1902', '1902', '1903']
['1905', '1905', '1904', '1904', '1903']


In [101]:

cursor = db.nobelPrizes.find({"category.en":"Physics"},["awardYear"], sort=[("awardYear", 1)])
print([doc["awardYear"] for doc in cursor][:5])

print("=====================================================")

cursor_reverse = db.nobelPrizes.find({"category.en":"Physics"},["awardYear"], sort=[("awardYear", -1)])
print([doc["awardYear"] for doc in cursor_reverse][:5])

['1901', '1901', '1902', '1902', '1903']
['1905', '1905', '1904', '1904', '1903']


In [105]:
# find max awardYear
list(db.nobelPrizes.find({},{"awardYear":1,"_id":0}))

[{'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1904'},
 {'awardYear': '1904'},
 {'awardYear': '1904'},
 {'awardYear': '1904'},
 {'awardYear': '1904'},
 {'awardYear': '1905'},
 {'awardYear': '1905'},
 {'awardYear': '1905'},
 {'awardYear': '1905'},
 {'awardYear': '1905'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1901'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1902'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1903'},
 {'awardYear': '1904'},
 {'awardYear': '

In [124]:
# Primary and secondary sorting, ordering ("awardYear", 1),("category.en", -1) 1asc, -1 desc

for doc in db.nobelPrizes.find(
    {"awardYear":{"$gt":"1903","$lt":"1905"}},
    ["category.en","awardYear"],
    sort=[("awardYear", 1),("category.en", -1)]):
    print(f"{doc.get('awardYear')} {doc.get('category').get('en')}")
    #print("{awardYear} {category}".format(doc.get('awardYear'),doc.get('category').get('en')))

1904 Physiology or Medicine
1904 Physiology or Medicine
1904 Physics
1904 Physics
1904 Peace
1904 Peace
1904 Literature
1904 Literature
1904 Chemistry
1904 Chemistry


In [129]:
# This block prints out the first five projections of a sorted query, using nobelPrizes.awardYear and birth.date

docs = list(db.laureates.find(
    {"birth.date": {"$gte": "1900"}, "nobelPrizes.awardYear": {"$gte": "1954"}},
    {"birth.date": 1, "nobelPrizes.awardYear": 1, "_id": 0},
    sort=[("nobelPrizes.awardYear", 1), ("birth.date", -1)]))
for doc in docs[:5]:
    print(doc)

{'birth': {'date': '1913-11-07'}, 'nobelPrizes': [{'awardYear': '1957'}]}
{'birth': {'date': '1913-11-07'}, 'nobelPrizes': [{'awardYear': '1957'}]}
{'birth': {'date': '1914-02-05'}, 'nobelPrizes': [{'awardYear': '1963'}]}
{'birth': {'date': '1914-02-05'}, 'nobelPrizes': [{'awardYear': '1963'}]}
{'birth': {'date': '1922-06-19'}, 'nobelPrizes': [{'awardYear': '1975'}]}


In [150]:
for doc in list(db.nobelPrizes.find(
           filter= {"category.en": "Physics"}, 
           projection= ["awardYear", "laureates.fullName.en"],
           sort= [("awardYear", 1)])):
    print(doc)

{'_id': ObjectId('6660ccfd011df660366373bc'), 'awardYear': '1901', 'laureates': [{'fullName': {'en': 'Wilhelm Conrad Röntgen'}}]}
{'_id': ObjectId('66635a80c2bb6907cee46ca8'), 'awardYear': '1901', 'laureates': [{'fullName': {'en': 'Wilhelm Conrad Röntgen'}}]}
{'_id': ObjectId('6660ccfd011df660366373c1'), 'awardYear': '1902', 'laureates': [{'fullName': {'en': 'Hendrik Antoon Lorentz'}}, {'fullName': {'en': 'Pieter Zeeman'}}]}
{'_id': ObjectId('66635a80c2bb6907cee46cad'), 'awardYear': '1902', 'laureates': [{'fullName': {'en': 'Hendrik Antoon Lorentz'}}, {'fullName': {'en': 'Pieter Zeeman'}}]}
{'_id': ObjectId('6660ccfd011df660366373c6'), 'awardYear': '1903', 'laureates': [{'fullName': {'en': 'Antoine Henri Becquerel'}}, {'fullName': {'en': 'Pierre Curie'}}, {'fullName': {'en': 'Marie Curie, née Skłodowska'}}]}
{'_id': ObjectId('66635a80c2bb6907cee46cb2'), 'awardYear': '1903', 'laureates': [{'fullName': {'en': 'Antoine Henri Becquerel'}}, {'fullName': {'en': 'Pierre Curie'}}, {'fullName':

In [169]:
docs = db.nobelPrizes.find(
           filter= {"category.en": "Physics"}, 
           projection= ["awardYear", "laureates.fullName.en"],
           sort= [("awardYear", 1)])

#list(docs)
for doc in docs:
    sorted_l = sorted(doc["laureates"], key=lambda laureate: laureate["fullName"]["en"])

    print(sorted_l)

[{'fullName': {'en': 'Wilhelm Conrad Röntgen'}}]
[{'fullName': {'en': 'Wilhelm Conrad Röntgen'}}]
[{'fullName': {'en': 'Hendrik Antoon Lorentz'}}, {'fullName': {'en': 'Pieter Zeeman'}}]
[{'fullName': {'en': 'Hendrik Antoon Lorentz'}}, {'fullName': {'en': 'Pieter Zeeman'}}]
[{'fullName': {'en': 'Antoine Henri Becquerel'}}, {'fullName': {'en': 'Marie Curie, née Skłodowska'}}, {'fullName': {'en': 'Pierre Curie'}}]
[{'fullName': {'en': 'Antoine Henri Becquerel'}}, {'fullName': {'en': 'Marie Curie, née Skłodowska'}}, {'fullName': {'en': 'Pierre Curie'}}]
[{'fullName': {'en': 'Lord Rayleigh (John William Strutt)'}}]
[{'fullName': {'en': 'Lord Rayleigh (John William Strutt)'}}]
[{'fullName': {'en': 'Philipp Eduard Anton von Lenard'}}]
[{'fullName': {'en': 'Philipp Eduard Anton von Lenard'}}]


In [170]:
# explore the prizes in the physics category.
#You will use Python to sort laureates for one prize by last name, and then MongoDB to sort prizes by year

def all_laureates(prize):  
  # sort the laureates by surname
  # sorted_laureates = sorted(prize["laureates"], key=itemgetter("fullName"))
  sorted_laureates = sorted(prize["laureates"], key=lambda laureate: laureate["fullName"]["en"])
  
  # extract surnames
  surnames = [laureate["fullName"]["en"] for laureate in sorted_laureates]
  
  # concatenate surnames separated with " and " 
  all_names = " and ".join(surnames)
  
  return all_names

# find physics prizes, project year and first and last name, and sort by year
docs = db.nobelPrizes.find(
           filter= {"category.en": "Physics"}, 
           projection= ["awardYear", "laureates.fullName.en"],
           sort= [("awardYear", 1)])

# print the year and laureate names (from all_laureates)
for doc in docs:
  print("{year}: {names}".format(year=doc["awardYear"], names=all_laureates(doc)))

1901: Wilhelm Conrad Röntgen
1901: Wilhelm Conrad Röntgen
1902: Hendrik Antoon Lorentz and Pieter Zeeman
1902: Hendrik Antoon Lorentz and Pieter Zeeman
1903: Antoine Henri Becquerel and Marie Curie, née Skłodowska and Pierre Curie
1903: Antoine Henri Becquerel and Marie Curie, née Skłodowska and Pierre Curie
1904: Lord Rayleigh (John William Strutt)
1904: Lord Rayleigh (John William Strutt)
1905: Philipp Eduard Anton von Lenard
1905: Philipp Eduard Anton von Lenard


In [132]:
# Finding categories by awardYear

# original categories from 1901
original_categories = db.nobelPrizes.distinct("category.en", {"awardYear": "1901"})
print(original_categories)

# project year and category, and sort
docs = db.nobelPrizes.find(
        filter={},
        projection = {"awardYear":1, "category.en":1, "_id":0},
        sort=[("awardYear", -1),("category", 1)]
)

#print the documents
for doc in docs:
  print(doc)

['Chemistry', 'Literature', 'Peace', 'Physics', 'Physiology or Medicine']
{'awardYear': '1905', 'category': {'en': 'Chemistry'}}
{'awardYear': '1905', 'category': {'en': 'Chemistry'}}
{'awardYear': '1905', 'category': {'en': 'Literature'}}
{'awardYear': '1905', 'category': {'en': 'Literature'}}
{'awardYear': '1905', 'category': {'en': 'Peace'}}
{'awardYear': '1905', 'category': {'en': 'Peace'}}
{'awardYear': '1905', 'category': {'en': 'Physics'}}
{'awardYear': '1905', 'category': {'en': 'Physics'}}
{'awardYear': '1905', 'category': {'en': 'Physiology or Medicine'}}
{'awardYear': '1905', 'category': {'en': 'Physiology or Medicine'}}
{'awardYear': '1904', 'category': {'en': 'Chemistry'}}
{'awardYear': '1904', 'category': {'en': 'Chemistry'}}
{'awardYear': '1904', 'category': {'en': 'Literature'}}
{'awardYear': '1904', 'category': {'en': 'Literature'}}
{'awardYear': '1904', 'category': {'en': 'Peace'}}
{'awardYear': '1904', 'category': {'en': 'Peace'}}
{'awardYear': '1904', 'category': {'

# Working with index in mongodb


In [180]:
# Gettiong idex information
db.laureates.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]}}

In [179]:
print(db.laureates.count_documents({"fullName.en": {"$regex":"Michael"}}))
print("=====================================================")
db.laureates.find({"fullName.en": {"$regex":"Michael"}}).explain()

2


{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'nobel.laureates',
  'indexFilterSet': False,
  'parsedQuery': {'fullName.en': {'$regex': 'Michael'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'fullName.en': {'$regex': 'Michael'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 2,
  'executionTimeMillis': 14,
  'totalKeysExamined': 0,
  'totalDocsExamined': 50,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'fullName.en': {'$regex': 'Michael'}},
   'nReturned': 2,
   'executionTimeMillisEstimate': 0,
   'works': 52,
   'advanced': 2,
   'needTime': 49,
   'needYield': 0,
   'saveState': 0,
   'restoreState': 0,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 50},
  'allPlansExecution': []},
 'serverInfo': {'host': '08c4c57812d5',
  'port': 27017,
  'version': '4.4.29',
  'gitVersion': 'f4dda329a99811c707eb06d05ad023599f9be263'},
 'ok': 1.0}

In this case if we need create an index for a query mongodb using distinct, we need to take acocunt the first criterial parameter is sencond possition when create an index, and the second parameter criterial is the first index, for example:

```
db.nobelPrizes.distinct("category", {"laureates.share": {"$gte":"1"}})
```

Here to create an index it will be

```
db.nobelPrizes.create_index([("laureates.share", 1), ("category", 1)])
```

In [184]:
# Specify an index model for compound sorting
index_model = [("category.en", 1), ("awardYear", -1)]
db.nobelPrizes.create_index(index_model)

# Collect the last single-laureate year for each category
report = ""
for category in sorted(db.nobelPrizes.distinct("category.en")):
    doc = db.nobelPrizes.find_one(
        {"category.en": category, "laureates.sortOrder": "1"},
        sort=[("awardYear", -1)]
    )
    report += "{category}: {awardYear}\n".format(**doc)

print(report)

{'en': 'Chemistry', 'no': 'Kjemi', 'se': 'Kemi'}: 1905
{'en': 'Literature', 'no': 'Litteratur', 'se': 'Litteratur'}: 1905
{'en': 'Peace', 'no': 'Fred', 'se': 'Fred'}: 1905
{'en': 'Physics', 'no': 'Fysikk', 'se': 'Fysik'}: 1905
{'en': 'Physiology or Medicine', 'no': 'Fysiologi eller medisin', 'se': 'Fysiologi eller medicin'}: 1905



In [186]:
# Checkoing index created
db.nobelPrizes.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'category.en_1_awardYear_-1': {'v': 2,
  'key': [('category.en', 1), ('awardYear', -1)]}}

In [187]:
from collections import Counter

# Ensure an index on country of birth
db.laureates.create_index([("birth.place.country.en", 1)])

# Collect a count of laureates for each country of birth
n_born_and_affiliated = {
    country: db.laureates.count_documents({
        "birth.place.country.en": country,
        "nobelPrizes.affiliations.country.en": country
    })
    for country in db.laureates.distinct("birth.place.country.en")
}

five_most_common = Counter(n_born_and_affiliated).most_common(5)
print(five_most_common)

[('USA', 6), ('Germany', 4), ('Japan', 4), (None, 2), ('Belgium', 2)]


# Limits and Skips with Sorts

In [232]:
# Limiting our exploration and skip for pagination

for doc in db.nobelPrizes.find({}, ["laureates.sortOrder"]):
    share_is_three = [laureate["sortOrder"] == "3" for laureate in doc["laureates"]]
    #print(share_is_three, type(share_is_three))

    assert all(share_is_three) is False or not any(share_is_three)

for doc in db.nobelPrizes.find({"laureates.sortOrder": "1"}, skip=5, limit=5):
    #print(doc)
    print("{awardYear} {category}".format(**doc))

1902 {'en': 'Chemistry', 'no': 'Kjemi', 'se': 'Kemi'}
1902 {'en': 'Literature', 'no': 'Litteratur', 'se': 'Litteratur'}
1902 {'en': 'Peace', 'no': 'Fred', 'se': 'Fred'}
1902 {'en': 'Physics', 'no': 'Fysikk', 'se': 'Fysik'}
1902 {'en': 'Physiology or Medicine', 'no': 'Fysiologi eller medisin', 'se': 'Fysiologi eller medicin'}


It's possible to use in the cursor

```
for doc in db.nobelPrizes.find({"laureates.sortOrder": "1"}).sort([("awardYear", 1)]).skip(5).limit(5):
    #print(doc)
    print("{awardYear} {category}".format(**doc))
```

In [235]:
# Using sort cluasule in different forms to order the values when has multiple condition
cursor1 = db.nobelPrizes.find({"laureates.sortOrder": "1"}).skip(5).limit(5).sort([("awardYear", 1)])

cursor2 = db.nobelPrizes.find({"laureates.sortOrder": "1"}).skip(5).limit(5).sort("awardYear", 1)

cursor3 = db.nobelPrizes.find({"laureates.sortOrder": "1"}).skip(5).limit(5).sort("awardYear")

docs_cursor = list(cursor1)

assert docs_cursor == list(cursor2) == list(cursor3)

for doc in docs_cursor:
    print("{awardYear} {category}".format(**doc))

1901 {'en': 'Literature', 'no': 'Litteratur', 'se': 'Litteratur'}
1901 {'en': 'Physiology or Medicine', 'no': 'Fysiologi eller medisin', 'se': 'Fysiologi eller medicin'}
1901 {'en': 'Physics', 'no': 'Fysikk', 'se': 'Fysik'}
1901 {'en': 'Literature', 'no': 'Litteratur', 'se': 'Litteratur'}
1901 {'en': 'Peace', 'no': 'Fred', 'se': 'Fred'}


In [236]:
from pprint import pprint

# Fetch prizes with quarter-share laureate(s)
filter_ = {"laureates.sortOrder": "1"}

# Save the list of field names
projection = ["category.en", "awardYear", "laureates.motivation.en"]

# Save a cursor to yield the first five prizes
cursor = db.nobelPrizes.find(filter_, projection).sort("awardYear").limit(5)
pprint(list(cursor))

[{'_id': ObjectId('6660ccfd011df660366373bc'),
  'awardYear': '1901',
  'category': {'en': 'Physics'},
  'laureates': [{'motivation': {'en': 'in recognition of the extraordinary '
                                      'services he has rendered by the '
                                      'discovery of the remarkable rays '
                                      'subsequently named after him'}}]},
 {'_id': ObjectId('6660ccfd011df660366373bd'),
  'awardYear': '1901',
  'category': {'en': 'Physiology or Medicine'},
  'laureates': [{'motivation': {'en': 'for his work on serum therapy, '
                                      'especially its application against '
                                      'diphtheria, by which he has opened a '
                                      'new road in the domain of medical '
                                      'science and thereby placed in the hands '
                                      'of the physician a victorious weapon '
                     

In [239]:
#Pages of particle-prized people

# Write a function to retrieve a page of data
def get_particle_laureates(page_number=1, page_size=3):
    if page_number < 1 or not isinstance(page_number, int):
        raise ValueError("Pages are natural numbers (starting from 1).")
    particle_laureates = list(
        db.laureates.find(
            {"nobelPrizes.motivation.en": {"$regex": "particle"}},
            ["fullName.en", "nobelPrizes"])
        .sort([("nobelPrizes.awardYear", 1), ("fullName", 1)])
        .skip(page_size * (page_number - 1))
        .limit(page_size))
    return particle_laureates

# Collect and save the first nine pages
pages = [get_particle_laureates(page_number=page) for page in range(1,9)]
pprint(pages[0])

[{'_id': ObjectId('66635a81c2bb6907cee46cbf'),
  'fullName': {'en': 'Aage Niels Bohr'},
  'nobelPrizes': [{'affiliations': [{'city': {'en': 'Copenhagen',
                                              'no': 'København',
                                              'se': 'Köpenhamn'},
                                     'cityNow': {'en': 'Copenhagen',
                                                 'latitude': '55.678127',
                                                 'longitude': '12.572532',
                                                 'no': 'København',
                                                 'sameAs': ['https://www.wikidata.org/wiki/Q1748',
                                                            'https://www.wikipedia.org/wiki/Copenhagen'],
                                                 'se': 'Köpenhamn'},
                                     'continent': {'en': 'Europe'},
                                     'country': {'en': 'Denmark',
                     

# Aggregation From Query Components to Aggregation Stages

In [242]:
# Querys have implicity stages without aggregation

cursor4 = db.laureates.find(filter={"birth.place.country.en": "USA"}, projection={"nobelPrizes.awardYear": 1}, limit=3)

for doc in cursor4:
    print(doc["nobelPrizes"])

[{'awardYear': '2001'}]
[{'awardYear': '2011'}]
[{'awardYear': '2007'}]


In [245]:
# Querys have aggregation

cursor4_agg = db.laureates.aggregate([{"$match":{"birth.place.country.en": "USA"}}, {"$project":{"nobelPrizes.awardYear": 1}}, {"$limit":3}])

for doc in cursor4_agg:
    print(doc["nobelPrizes"])

[{'awardYear': '2001'}]
[{'awardYear': '2011'}]
[{'awardYear': '2007'}]


In [248]:
# Adding sort and skip stages
from collections import OrderedDict

list(db.laureates.aggregate([{"$match":{"birth.place.country.en": "USA"}}, {"$project":{"nobelPrizes.awardYear": 1, "_id":0}},{"$sort": OrderedDict([("nobelPrizes.awardYear",1)])},{"$skip": 1}, {"$limit":3}]))

[{'nobelPrizes': [{'awardYear': '2000'}]},
 {'nobelPrizes': [{'awardYear': '2001'}]},
 {'nobelPrizes': [{'awardYear': '2001'}]}]

In [253]:
# Count using aggregate in the query

print(list(db.laureates.aggregate([{"$match":{"birth.place.country.en": "USA"}}, {"$count": "n_USA-born-laureates"}])))
print("=============================================")
print(db.laureates.count_documents({"birth.place.country.en": "USA"}))

[{'n_USA-born-laureates': 8}]
8


In [255]:
# Translate cursor to aggregation pipeline
pipeline = [
    {"$match": {"gender": {"$ne": "male"}}},
    {"$project": {"birth.place.country.en": 1, "nobelPrizes.affiliations.country.en": 1}},
    {"$limit": 3}
]

for doc in db.laureates.aggregate(pipeline):
    print("{birth}: {nobelPrizes}".format(**doc))

{'place': {'country': {'en': 'British Mandate of Palestine'}}}: [{'affiliations': [{'country': {'en': 'Israel'}}]}]
{'place': {'country': {'en': 'British Mandate of Palestine'}}}: [{'affiliations': [{'country': {'en': 'Israel'}}]}]


In [280]:
"""
Construct an aggregation pipeline to collect, in reverse chronological order (i.e., descending year), 
prize documents for all original categories (that is, $in categories awarded in 1901). 
Project only the prize year and category (including document _id is fine).

The aggregation cursor will be fed to Python's itertools.groupby function to group prizes by year. 
For each year that at least one of the original prize categories was missing, 
a line with all missing categories for that year will be printed
"""

from collections import OrderedDict
from itertools import groupby
from operator import itemgetter

original_categories = set(db.nobelPrizes.distinct("category.en", {"awardYear": "1901"}))

# Save an pipeline to collect original-category prizes
pipeline = [
    {"$match": {"category.en": {"$in": list(original_categories)}}},
    {"$project": {"awardYear": 1, "category.en": 1}},
    {"$sort": OrderedDict([("awardYear", -1)])}
]
cursor = db.nobelPrizes.aggregate(pipeline)
for key, group in groupby(cursor, key=itemgetter("awardYear")):
    # I need to check what happens here, because if I coment the following print I don't see it the final result
    print(f"\n{key} \n{list(group)} \n{original_categories}")
    missing = original_categories - {doc["category"]["en"] for doc in group}
    print(f"missing: {missing}")
    if missing:
        print("==================================")
        print("{awardYear}: {missing}".format(awardYear=key, missing=", ".join(sorted(missing))))


1905 
[{'_id': ObjectId('6660ccfd011df660366373cd'), 'awardYear': '1905', 'category': {'en': 'Chemistry'}}, {'_id': ObjectId('66635a80c2bb6907cee46cb9'), 'awardYear': '1905', 'category': {'en': 'Chemistry'}}, {'_id': ObjectId('6660ccfd011df660366373ce'), 'awardYear': '1905', 'category': {'en': 'Literature'}}, {'_id': ObjectId('66635a80c2bb6907cee46cba'), 'awardYear': '1905', 'category': {'en': 'Literature'}}, {'_id': ObjectId('6660ccfd011df660366373cf'), 'awardYear': '1905', 'category': {'en': 'Peace'}}, {'_id': ObjectId('66635a80c2bb6907cee46cbb'), 'awardYear': '1905', 'category': {'en': 'Peace'}}, {'_id': ObjectId('6660ccfd011df660366373d0'), 'awardYear': '1905', 'category': {'en': 'Physics'}}, {'_id': ObjectId('66635a80c2bb6907cee46cbc'), 'awardYear': '1905', 'category': {'en': 'Physics'}}, {'_id': ObjectId('6660ccfd011df660366373d1'), 'awardYear': '1905', 'category': {'en': 'Physiology or Medicine'}}, {'_id': ObjectId('66635a80c2bb6907cee46cbd'), 'awardYear': '1905', 'category': {

In [307]:
# Working with field path, using the field with prefix $

print(db.laureates.aggregate([{"$project":{"nobelPrizes": 1}}]).next())
print("====================================================")
print(db.laureates.aggregate([{"$project": {"n_prizes": {"$size": "$nobelPrizes"}}}]).next())
print("====================================================")
print(db.laureates.aggregate([{"$project": {"n_prizes": {"$size": ["$nobelPrizes"]}}}]).next())
print("====================================================")
print(db.laureates.aggregate([{"$project": {"solo_winner": {"$in": ["2", "$nobelPrizes.sortOrder"]}}}]).next())

{'_id': ObjectId('6660ccff011df660366373d2'), 'nobelPrizes': [{'awardYear': '2001', 'category': {'en': 'Economic Sciences', 'no': 'Økonomi', 'se': 'Ekonomi'}, 'categoryFullName': {'en': 'The Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel', 'no': 'Sveriges Riksbanks pris i økonomisk vitenskap til minne om Alfred Nobel', 'se': 'Sveriges Riksbanks pris i ekonomisk vetenskap till Alfred Nobels minne'}, 'sortOrder': '2', 'portion': '1/3', 'dateAwarded': '2001-10-10', 'prizeStatus': 'received', 'motivation': {'en': 'for their analyses of markets with asymmetric information', 'se': 'för deras analys av marknader med assymetrisk informations'}, 'prizeAmount': 10000000, 'prizeAmountAdjusted': 15114754, 'affiliations': [{'name': {'en': 'Stanford University', 'no': 'Stanford University', 'se': 'Stanford University'}, 'nameNow': {'en': 'Stanford University'}, 'city': {'en': 'Stanford, CA', 'no': 'Stanford, CA', 'se': 'Stanford, CA'}, 'country': {'en': 'USA', 'no': 'USA', 's

In [312]:
list1 = db.laureates.distinct("birth.place.country.en")
print(list(list1))
print("====================================================")
list2 = [doc["_id"] for doc in db.laureates.aggregate([{"$group":{"_id":"$birth.place.country.en"}}])]
print(list(list2))
print("====================================================")
set(list2) - {None} == set(list1) - {None}

[None, 'Argentina', 'Belgium', 'British Mandate of Palestine', 'British Protectorate of Palestine', 'Denmark', 'Egypt', 'Ethiopia', 'France', 'French Algeria', 'Germany', 'India', 'Japan', 'Lithuania', 'New Zealand', 'Prussia', 'USA', 'United Kingdom']
[None, 'Argentina', 'Belgium', 'British Mandate of Palestine', 'British Protectorate of Palestine', 'Denmark', 'Egypt', 'Ethiopia', 'France', 'French Algeria', 'Germany', 'India', 'Japan', 'Lithuania', 'New Zealand', 'Prussia', 'USA', 'United Kingdom']


True

In [314]:
list(db.laureates.aggregate([
    {"$project":{"n_prizes": {"$size":"$nobelPrizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum":"$n_prizes"}}}
]))

[{'_id': None, 'n_prizes_total': 50}]

In [324]:
list(db.nobelPrizes.aggregate([
    {"$project": {
        "allThree": {"$setEquals": ["$laureates.sortOrder", ["2"]]},
        "noneThree": {"$not": {"$setIsSubset": [["2"], "$laureates.sortOrder"]}}
    }},
    {"$match": {"$nor": [{"allThree": True}, {"noneThree": True}]}}]))

[{'_id': ObjectId('6660ccfd011df660366373bb'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('6660ccfd011df660366373c0'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('6660ccfd011df660366373c1'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('6660ccfd011df660366373c6'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('6660ccfd011df660366373c9'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('66635a80c2bb6907cee46ca7'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('66635a80c2bb6907cee46cac'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('66635a80c2bb6907cee46cad'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('66635a80c2bb6907cee46cb2'),
  'allThree': False,
  'noneThree': False},
 {'_id': ObjectId('66635a80c2bb6907cee46cb5'),
  'allThree': False,
  'noneThree': False}]

In [328]:
# Count prizes awarded (at least partly) to organizations as a sum over sizes of "prizes" arrays.
pipeline = [
    {"$match": {"gender": "male"}},
    {"$project": {"n_prizes": {"$size": "$nobelPrizes"}}},
    {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
]

print(list(db.laureates.aggregate(pipeline)))

[{'_id': None, 'n_prizes_total': 48}]


In [338]:
"""
Gap years, aggregated

In a previous exercise, you collected instances of prize categories not being awarded in particular years. 
You implemented this using a for loop in Python. You will now implement this as an aggregation pipeline that:

Filters for original prize categories (i.e. sans economics),
Projects category and year,
Groups distinct prize categories awarded by year,
Projects prize categories not awarded by year,
Filters for years with missing prize categories, and
Returns a cursor of documents in reverse chronological order, one per year, each with a list of missing prize categories for that year.

"""

from collections import OrderedDict

original_categories = sorted(set(db.nobelPrizes.distinct("category.en", {"awardYear": "1901"})))
pipeline = [
    {"$match": {"category.en": {"$in": original_categories}}},
    {"$project": {"category.en": 1, "awardYear": 1}},
    
    # Collect the set of category values for each prize year.
    {"$group": {"_id": "$awardYear", "categories": {"$addToSet": "$category"}}},
    
    # Project categories *not* awarded (i.e., that are missing this year).
    {"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},
    
    # Only include years with at least one missing category
    {"$match": {"missing.0": {"$exists": True}}},
    
    # Sort in reverse chronological order. Note that "_id" is a distinct year at this stage.
    {"$sort": OrderedDict([("_id", -1)])},
]
for doc in db.nobelPrizes.aggregate(pipeline):
    print("{awardYear}: {missing}".format(awardYear=doc["_id"],missing=", ".join(sorted(doc["missing"]))))

1905: Chemistry, Literature, Peace, Physics, Physiology or Medicine
1904: Chemistry, Literature, Peace, Physics, Physiology or Medicine
1903: Chemistry, Literature, Peace, Physics, Physiology or Medicine
1902: Chemistry, Literature, Peace, Physics, Physiology or Medicine
1901: Chemistry, Literature, Peace, Physics, Physiology or Medicine


# Zoom into array fields

In [340]:
"""
The $unwind stage in MongoDB's aggregation framework is used to deconstruct an array field from the input documents to output a document 
for each element. Each output document is the input document with the value of the array field replaced by the element. 
This stage is particularly useful for flattening an array and then performing operations like sorting, grouping, or applying 
further filters on each element of the array.
"""

list(db.nobelPrizes.aggregate([
    {"$unwind":"$laureates"},
    {"$project":{"_id":0,"awardYear": 1, "category":1, "laureates.fullName.en": 1, "laureates.sortOrder": 1}},
    {"$limit":3}
]))

[{'awardYear': '1901',
  'category': {'en': 'Chemistry', 'no': 'Kjemi', 'se': 'Kemi'},
  'laureates': {'fullName': {'en': "Jacobus Henricus van 't Hoff"},
   'sortOrder': '1'}},
 {'awardYear': '1901',
  'category': {'en': 'Literature', 'no': 'Litteratur', 'se': 'Litteratur'},
  'laureates': {'fullName': {'en': 'Sully Prudhomme'}, 'sortOrder': '1'}},
 {'awardYear': '1901',
  'category': {'en': 'Peace', 'no': 'Fred', 'se': 'Fred'},
  'laureates': {'fullName': {'en': 'Jean Henry Dunant'}, 'sortOrder': '1'}}]

In [344]:
# Renormalization

list(db.nobelPrizes.aggregate([
    {"$unwind":"$laureates"},
    {"$project":{"awardYear": 1, "category":1, "laureates.id": 1}},
    {"$group": {"_id": {"$concat": ["$category.en", ":" ,"$awardYear"]}, 
               "laureate_ids":{"$addToSet": "$laureates.id"}}},
    {"$limit":5}
]))

[{'_id': 'Literature:1901', 'laureate_ids': ['569']},
 {'_id': 'Physiology or Medicine:1901', 'laureate_ids': ['293']},
 {'_id': 'Chemistry:1904', 'laureate_ids': ['163']},
 {'_id': 'Chemistry:1903', 'laureate_ids': ['162']},
 {'_id': 'Peace:1904', 'laureate_ids': ['467']}]

In [353]:
# Unwind and count 'em, one by one

print(list(db.nobelPrizes.aggregate([
    {"$project":{"n_laureates":{"$size": "$laureates"}, "category.en": 1}},
    {"$group": {"_id": "$category", "n_laureates": {"$sum": "$n_laureates"}}},
    {"$sort": {"n_laureates": -1}}
])))

print("==================================================")

print(list(db.nobelPrizes.aggregate([
    {"$unwind":"$laureates"},
    {"$group": {"_id": "$category.en", "n_laureates": {"$sum": 1}}},
    {"$sort": {"n_laureates": -1}}
])))

[{'_id': {'en': 'Physics'}, 'n_laureates': 16}, {'_id': {'en': 'Peace'}, 'n_laureates': 14}, {'_id': {'en': 'Literature'}, 'n_laureates': 12}, {'_id': {'en': 'Chemistry'}, 'n_laureates': 10}, {'_id': {'en': 'Physiology or Medicine'}, 'n_laureates': 10}]
[{'_id': 'Physics', 'n_laureates': 16}, {'_id': 'Peace', 'n_laureates': 14}, {'_id': 'Literature', 'n_laureates': 12}, {'_id': 'Chemistry', 'n_laureates': 10}, {'_id': 'Physiology or Medicine', 'n_laureates': 10}]


In [365]:
# Using lookp

# I need to check this query, because the nobelPrize doesn't have brith.places.contry
# The problem is the collection nobelPrizes and laureates doesn't have the same Ids in the field id and laureates.id.
# I need to download again the data to solve this problem
list(db.nobelPrizes.aggregate([
    {"$match": {"category.en": "Physics"}},
    {"$unwind": "$laureates"},
    {"$lookup": {
        "from": "laureates", 
        "foreignField": "id", 
        "localField": "laureates.id",
        "as": "laureate_bios"
        }},
    
    #{"$unwind": "$laureate_bios"},
    #{"$group":{"_id": None,
    #          "bornCountries": {
    #              "$addToSet": "$laureate_bios.birth.place.country.en"
    #          }
    #}}
]))

[{'_id': ObjectId('6660ccfd011df660366373d0'),
  'awardYear': '1905',
  'category': {'en': 'Physics', 'no': 'Fysikk', 'se': 'Fysik'},
  'categoryFullName': {'en': 'The Nobel Prize in Physics',
   'no': 'Nobelprisen i fysikk',
   'se': 'Nobelpriset i fysik'},
  'prizeAmount': 138089,
  'prizeAmountAdjusted': 9361622,
  'links': [{'rel': 'nobelPrize',
    'href': 'https://api.nobelprize.org/2/nobelPrize/phy/1905',
    'action': 'GET',
    'types': 'application/json'}],
  'laureates': {'id': '9',
   'knownName': {'en': 'Philipp Lenard'},
   'fullName': {'en': 'Philipp Eduard Anton von Lenard'},
   'portion': '1',
   'sortOrder': '1',
   'motivation': {'en': 'for his work on cathode rays',
    'se': 'för hans arbeten över katodstrålarna'},
   'links': [{'rel': 'laureate',
     'href': 'https://api.nobelprize.org/2/laureate/9',
     'action': 'GET',
     'types': 'application/json'}]},
  'laureate_bios': []},
 {'_id': ObjectId('66635a80c2bb6907cee46cbc'),
  'awardYear': '1905',
  'category'

In [366]:
bornCountries = db.laureates.distinct(
    "birth.place.country.en", {"nobelPrizes.category.en": "Physics"}
)

#assert set(bornCountries) == set(agg[0]["birth.place.country.en"])
list(bornCountries)

['Denmark', 'France', 'India', 'Prussia', 'USA']

In [367]:
"""
Here and elsewhere
What proportion of laureates won a prize while affiliated with an institution in their country of birth? 
Build an aggregation pipeline to get the count of laureates who either did or did not win a prize with an 
affiliation country that is a substring of their country of birth -- for example, the prize affiliation country "Germany" 
should match the country of birth "Prussia (now Germany)".
"""

key_ac = "nobelPrizes.affiliations.country.en"
key_bc = "birth.place.country.en"
pipeline = [
    {"$project": {key_bc: 1, key_ac: 1}},

    # Ensure a single prize affiliation country per pipeline document
    {"$unwind": "$nobelPrizes"},
    {"$unwind": "$noeblPrizes.affiliations"},

    # Ensure values in the list of distinct values (so not empty)
    {"$match": {key_ac: {"$in": db.laureates.distinct(key_ac)}}},
    {"$project": {"affilCountrySameAsBorn": {
        "$gte": [{"$indexOfBytes": ["$"+key_ac, "$"+key_bc]}, 0]}}},

    # Count by "$affilCountrySameAsBorn" value (True or False)
    {"$group": {"_id": "$affilCountrySameAsBorn",
                "count": {"$sum": 1}}},
]
for doc in db.laureates.aggregate(pipeline): print(doc)

In [None]:
pipeline = [
    # Unwind the laureates array
    {"$unwind": "$laureates"},
    {"$lookup": {
        "from": "laureates", "foreignField": "id",
        "localField": "laureates.id", "as": "laureate_bios"}},

    # Unwind the new laureate_bios array
    {"$unwind": "$laureate_bios"},
    {"$project": {"category": 1,
                  "bornCountry": "$laureate_bios.bornCountry"}},

    # Collect bornCountry values associated with each prize category
    {"$group": {"_id": "$category",
                "bornCountries": {"$addToSet": "$bornCountry"}}},

    # Project out the size of each category's (set of) bornCountries
    {"$project": {"category": 1,
                  "nBornCountries": {"$size": "$bornCountries"}}},
    {"$sort": {"nBornCountries": -1}},
]
for doc in db.prizes.aggregate(pipeline): print(doc)


# Something Extra: $addFields to Aid Analysys

In [None]:
# split and cond-itionally correct (with $concat)

# I need to find data with date values to test this query
docs_add_fields = db.laureates.aggregate([
    {"$match": {"died": {"$gt": "1700"}}, {"born": {"$gt":"1700"}}},
    {"$addFields": {"bornArray":{"$plit":["$born", "-"]},
                   "diedArray":{"$plit":["$died", "-"]}}},
    {"$addfields": {"born": {"$cond": [
        {"$in": ["00", "$bornArray"]},
        {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
        "$born",]}}},
    {"$project": {"died": {"$dateFieldString":{"dateString":"$died"}},
                  "born": {"$dateFieldString":{"dateString":"$born"}},
                 "_id": 0}}
    ]}}
])

print(docs_add_fields[0])

In [None]:
# sUsing buckets

# I need to find data with date values to test this query
docs_add_fields_with_bucket = db.laureates.aggregate([
    {"$match": {"died": {"$gt": "1700"}}, {"born": {"$gt":"1700"}}},
    {"$addFields": {"bornArray":{"$plit":["$born", "-"]},
                   "diedArray":{"$plit":["$died", "-"]}}},
    {"$addfields": {"born": {"$cond": [
        {"$in": ["00", "$bornArray"]},
        {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
        "$born",]}}},
    {"$project": {"died": {"$dateFieldString":{"dateString":"$died"}},
                  "born": {"$dateFieldString":{"dateString":"$born"}},
                 "_id": 0}},
    {"$project": {"awardYear": {"$floor":{"$divide": [
        {"$substract": ["$died","$born"]},
        31557600000 # 1000 * 60 * 60 * 24 *  365.25
      ]}}}},
    {"$bucket": {"groupBy":"$awardYear", "boundaries": lit(range(30, 120, 10))}}
    ]}}
])


for doc in docs_add_fields_with_bucket: print(doc)

In [None]:
"""
How many prizes were awarded to immigrants?
How many prizes were awarded to people who had no affiliation in their country of birth at the time of the award?
"""

pipeline = [
    # Limit results to people; project needed fields; unwind prizes
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
  
    # Count prizes with no country-of-birth affiliation
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]


print(list(db.laureates.aggregate(pipeline)))

In [None]:
"""
Refinement: filter out "unaffiliated" people
In the previous exercise, we counted prizes awarded to people without an affiliation in their "bornCountry". 
However, hundreds of prizes were awarded to people without recorded affiliations; sure, 
their "bornCountry" is technically not the "country" of any of their affiliations, 
but there are no "country" values to compare against!
"""

pipeline = [
    {"$match": {"gender": {"$ne": "org"}}},
    {"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
    {"$unwind": "$prizes"},
    {"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
    {"$match": {"bornCountryInAffiliations": False}},
    {"$count": "awardedElsewhere"},
]

# Construct the additional filter stage
added_stage = {"$match": {"prizes.affiliations.country": {"$in": db.laureates.distinct("prizes.affiliations.country")}}}

# Insert this stage into the pipeline
pipeline.insert(3, added_stage)
print(list(db.laureates.aggregate(pipeline)))