# Data Crawling

In [1]:
from loguru import logger
import requests
from pymongo import MongoClient

In [2]:
def get_json_from_url(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its decoded JSON body.

    Args:
        url: Address of the JSON resource to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            server would block the caller indefinitely.

    Returns:
        The parsed JSON payload, exactly as produced by ``response.json()``.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: The server did not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

def insert_data_into_collection(db, collection_name, data):
    """Insert ``data`` into ``db[collection_name]`` and report the collection size.

    Args:
        db: Object that yields a collection when indexed by name.
        collection_name: Name of the target collection.
        data: Documents to insert. An empty list or tuple is accepted and
            results in no write at all.

    Returns:
        The total number of documents in the collection after the insert, as
        reported by ``count_documents({})``. This includes documents that were
        already present, so it is not the number of documents just inserted.
    """
    collection = db[collection_name]
    # insert_many refuses an empty document list, so an empty batch is skipped
    # rather than allowed to raise. Any other input is passed through untouched
    # so invalid arguments still surface the driver's own error.
    is_empty_batch = isinstance(data, (list, tuple)) and len(data) == 0
    if not is_empty_batch:
        collection.insert_many(data)
    return collection.count_documents({})

def nobel_data_crawler():
    """Download the Nobel laureates and prizes datasets and store them in MongoDB.

    Each JSON file is fetched from the DataCamp assets host and inserted into
    the collection of the same name (``laureates``, ``prizes``) in the
    ``nobel`` database of a default-configured ``MongoClient``. After each
    insert the resulting collection size is logged at INFO level.

    Any ``requests.exceptions.RequestException`` raised while downloading is
    logged at ERROR level and not re-raised; a failure on the first dataset
    also skips the second. Database errors are not caught here.

    NOTE(review): documents are inserted on every call without clearing the
    collections first, so repeated runs presumably accumulate duplicates --
    confirm whether that is intended.
    """
    # URLs
    data_url = "https://assets.datacamp.com/production/repositories/1838/datasets/"
    sources = (
        ("laureates", data_url + "f402fa7be837b9cd4890f4e1c59a7377693ba36c/laureates.json"),
        ("prizes", data_url + "3fde64719bc3226b593a1c261f715566ea6284b2/prizes.json"),
    )

    # The context manager closes the client (and its connections) on exit,
    # whether the crawl succeeds, logs a download error, or raises.
    with MongoClient() as client:
        db = client["nobel"]

        try:
            for collection_name, url in sources:
                documents = get_json_from_url(url)
                stored_count = insert_data_into_collection(db, collection_name, documents)
                logger.info(f"{stored_count} documents stored in {collection_name} collection")

        except requests.exceptions.RequestException as e:
            logger.error(f"Error during data retrieval: {e}")


In [3]:
# Run the crawl: fetches both datasets and inserts them into the "nobel" database.
nobel_data_crawler()

2023-11-16 22:35:31.903 | INFO     | __main__:nobel_data_crawler:25 - 13076 documents stored in laureates collection
2023-11-16 22:35:32.877 | INFO     | __main__:nobel_data_crawler:30 - 7670 documents stored in prizes collection
