# Load required modules

In [None]:
# MongoDB
from pymongo import MongoClient
from pymongo.database import Database

from pprint import pprint

import pandas as pd

from bson.son import SON

### Connect to the Atlas Cluster

In [None]:
# Paste your own connection string below. It contains credentials, so it has
# to be deleted again before the notebook is pushed!
connection_string = "PUT YOUR CONNECTION STRING HERE"

# Client for the cluster plus a handle to its mw243 database; the handle is
# what the cells further down use to reach the collections.
client = MongoClient(connection_string)
mw243 = Database(client, 'mw243')  # connect to the mw243 database

In [None]:
# this is a helper function that improves our query calls

def execute_query_return_df(query=None, projection=None, limit=10, collection=mw243.restaurants):
    """Run a ``find`` query and return the matching documents as a DataFrame.

    Parameters
    ----------
    query : dict, optional
        Filter document passed to ``find``. ``None`` (the default) is treated
        as an empty filter, i.e. no restriction on the documents.
    projection : dict or list, optional
        Projection passed to ``find``. A falsy value (``None``, ``{}``, ``[]``)
        means the query is issued without a projection.
    limit : int
        Value passed to the cursor's ``limit`` (default 10).
    collection
        Collection to query; defaults to ``mw243.restaurants``.

    Returns
    -------
    pandas.DataFrame
        DataFrame built from the list of returned documents.
    """
    # A dict literal as default would be shared between all calls; build a
    # fresh, empty filter per call instead.
    if query is None:
        query = {}
    if projection:
        cursor = collection.find(query, projection)
    else:
        cursor = collection.find(query)
    return pd.DataFrame(list(cursor.limit(limit)))

<img src="exercise.png">

## Exercise 5.1 <img src="mongodb.png" width=120 align="right">

# Aggregation Framework Example
## Setting
Set up a collection "things" that contains several objects with the nested array "tags".

In [None]:
# Start from an empty "things" collection. Without this, every re-run of the
# cell (or of the whole notebook) would insert the same five documents again
# and the tag counts computed below would grow with each run.
mw243.things.drop()

result = mw243.things.insert_many([
    {"x": 1, "tags": ["dog", "cat"]},
    {"x": 2, "tags": ["cat"]},
    {"x": 2, "tags": ["mouse", "cat", "dog"]},
    {"x": 3, "tags": []},
    {"x": 4, "tags": ["mouse", "cat"]},
])

# Show the ids of the inserted documents as the cell output.
result.inserted_ids

This is the collection's content:

In [None]:
execute_query_return_df(collection=mw243.things)

## Defining the first pipeline
Task: count the occurrences of each tag for those documents which have a value less than or equal to 3 for the key "x".

In [None]:
# Three stages: keep documents with x <= 3, emit one document per array
# element of "tags", then count the documents per tag.
pipeline = [
    {"$match": {"x": {"$lte": 3}}},
    {"$unwind": "$tags"},
    {"$group": {"_id": "$tags", "count": {"$sum": 1}}},
]

pprint([doc for doc in mw243.things.aggregate(pipeline)])

## Defining the pipeline in NoSQLBooster
<code>
var pipeline = [
    { $match : {x : {$lte : 3} } },
    { $unwind : "$tags"},
    { $group : {_id: "$tags", count: {$sum : 1}}}
]

db.things.aggregate(pipeline)
</code>

<img src="exercise.png">

# Exercise 5.2 <img src="mongodb.png" width=120 align="right">


# Preparing collection "orders"
We want to store "ISODates"; in Python these have to be `datetime` objects. We can use the dateutil parser to create them from ISO 8601 strings.

In [None]:
from dateutil.parser import parse

parse("2012-11-02T17:04:11.102Z")

In [None]:
# The documents below carry explicit _id values, so inserting them a second
# time fails with a duplicate-key error. Dropping the collection first makes
# the cell safe to re-run.
mw243.orders.drop()

result = mw243.orders.insert_many([
    {"_id": 1, "cust_id": "abc1", "ord_date": parse("2012-11-02T17:04:11.102Z"), "status": "A", "amount": 50},
    {"_id": 2, "cust_id": "xyz1", "ord_date": parse("2013-10-01T17:04:11.102Z"), "status": "A", "amount": 100},
    {"_id": 3, "cust_id": "xyz1", "ord_date": parse("2013-10-12T17:04:11.102Z"), "status": "D", "amount": 25},
    {"_id": 4, "cust_id": "xyz1", "ord_date": parse("2013-10-11T17:04:11.102Z"), "status": "D", "amount": 125},
    {"_id": 5, "cust_id": "abc1", "ord_date": parse("2013-11-12T17:04:11.102Z"), "status": "A", "amount": 25},
])

# Show the ids of the inserted documents as the cell output.
result.inserted_ids

In [None]:
execute_query_return_df(collection=mw243.orders)

## Transferred pipeline

In [None]:
pipeline = [ 
    # PLACE YOUR PIPELINE STAGES HERE
]

pprint(list(mw243.orders.aggregate(pipeline)))

# Helper method


In [None]:
def execute_pipeline(pipeline, collection=mw243.orders):
    """Run an aggregation pipeline and return the resulting documents as a list.

    pipeline   -- the list of stage documents handed to ``aggregate``
    collection -- collection to aggregate over (defaults to ``mw243.orders``)
    """
    cursor = collection.aggregate(pipeline)
    return [document for document in cursor]

# Matching documents
See https://docs.mongodb.com/manual/reference/operator/aggregation/match/<br>
Create a pipeline that returns all documents that have the status "A".

In [None]:
# Single $match stage: only documents whose status equals "A" pass.
pipeline = [
    {"$match": {"status": "A"}},
]
pprint(execute_pipeline(pipeline))

Create a pipeline that returns all documents that have an amount of 50 or more.

In [None]:
# Single $match stage: only documents with an amount of 50 or more pass.
pipeline = [
    {"$match": {"amount": {"$gte": 50}}},
]
execute_pipeline(pipeline)

<img src="exercise.png">

# Exercise 5.3 <img src="mongodb.png" width=120 align="right">


### Connect to a cluster provided by MongoDB 

In [None]:
# Connection string for the cluster provided by MongoDB. Each replica-set
# member is listed with its fully qualified host name; the third host needs
# the ".mongodb.net" suffix just like the first two.
connection_string = (
    "mongodb://m001-student:m001-mongodb-basics@"
    "cluster0-shard-00-00-jxeqq.mongodb.net:27017,"
    "cluster0-shard-00-01-jxeqq.mongodb.net:27017,"
    "cluster0-shard-00-02-jxeqq.mongodb.net:27017"
    "/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin&retryWrites=true"
)

client_mongodb = MongoClient(connection_string)
agg_db = Database(client_mongodb, 'aggregations')  # connect to the aggregations database

In [None]:
agg_db.list_collection_names()

In [None]:
pipeline = [ 
    # PLACE YOUR PIPELINE STAGES HERE
    { "$limit" : 30}  # please do not remove this pipeline stage
]
result = execute_pipeline(pipeline, collection=agg_db.movies)

In [None]:
len(result)  # should be 23

In [None]:
pprint(result)

# Projecting Documents
See https://docs.mongodb.com/manual/reference/operator/aggregation/project/
## Selecting fields

In [None]:
pipeline = [
    # keep documents whose imdb.rating is at least 7
    {"$match": {"imdb.rating": {"$gte": 7}}},
    # output title and genres only; _id is switched off explicitly
    {"$project": {"_id": 0, "title": 1, "genres": 1}},
    {"$limit": 10},  # please do not remove this pipeline stage
]
pd.DataFrame(
    execute_pipeline(pipeline, collection=agg_db.movies)
)

## Adding computed fields
Calculating the size of an array and splitting a string using the delimiter " ".<br>
See
- https://docs.mongodb.com/manual/reference/operator/aggregation/split/
- https://docs.mongodb.com/manual/reference/operator/aggregation/size/

In [None]:
pipeline = [
    {"$match": {"imdb.rating": {"$gte": 7}}},
    {"$project": {
        "_id": 0,
        "title": 1,
        "genres": 1,
        # computed fields: length of the genres array, title split at " "
        "no_genres": {"$size": "$genres"},
        "title_words": {"$split": ["$title", " "]},
    }},
    {"$limit": 10},  # please do not remove this pipeline stage
]
pd.DataFrame(
    execute_pipeline(pipeline, collection=agg_db.movies)
)

## Using a match stage after a project stage

In [None]:
pipeline = [
    {"$match": {"imdb.rating": {"$gte": 7}}},
    {"$project": {
        "_id": 0,
        "title": 1,
        "genres": 1,
        "no_genres": {"$size": "$genres"},
        "title_words": {"$split": ["$title", " "]},
    }},
    # second $match: filters on the field computed by the $project stage above
    {"$match": {"no_genres": {"$gte": 2}}},
    {"$limit": 10},  # please do not remove this pipeline stage
]
pd.DataFrame(
    execute_pipeline(pipeline, collection=agg_db.movies)
)

<img src="exercise.png">

# Exercise 5.4 <img src="mongodb.png" width=120 align="right">


In [None]:
pipeline = [ 
    # PLACE YOUR PIPELINE STAGES HERE
    { "$limit" : 30}  # please do not remove this pipeline stage
]
result = execute_pipeline(pipeline, collection=agg_db.movies)

In [None]:
len(result)

In [None]:
pd.DataFrame(result)

<img src="exercise.png">

# Exercise 5.5 <img src="mongodb.png" width=120 align="right">


In [None]:
pipeline = [ 
    # PLACE YOUR PIPELINE STAGES HERE
    { "$limit" : 10000}  # please do not remove this pipeline stage
]
result = execute_pipeline(pipeline, collection=agg_db.movies)

In [None]:
# please only call pd.DataFrame(result) if the limit is <100
len(result)

# Cursor-like Stages
## Counting

In [None]:
pipeline = [
    {"$match": {
        "imdb.rating": {"$gte": 7},
        "genres": {"$nin": ["Crime", "Horror"]},
        "rated": {"$in": ["PG", "G"]},
        "languages": {"$all": ["English", "Japanese"]},
    }},
    # NEW STAGE: the count is returned in a field named "numberofdocuments"
    {"$count": "numberofdocuments"},
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pprint(result)

## Sorting

In [None]:
pipeline = [
    {"$match": {
        "imdb.rating": {"$gte": 7},
        "genres": {"$nin": ["Crime", "Horror"]},
        "rated": {"$in": ["PG", "G"]},
        "languages": {"$all": ["English", "Japanese"]},
    }},
    # imdb.rating is projected under the shorter name "rating"
    {"$project": {"_id": 0, "title": 1, "rating": "$imdb.rating"}},
    # NEW STAGE - SON keeps the sort keys in the given order:
    # rating descending first, then title ascending
    {"$sort": SON([("rating", -1), ("title", 1)])},
    {"$limit": 50},  # please do not remove this pipeline stage
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)

## Skipping documents

In [None]:
pipeline = [
    {"$match": {
        "imdb.rating": {"$gte": 7},
        "genres": {"$nin": ["Crime", "Horror"]},
        "rated": {"$in": ["PG", "G"]},
        "languages": {"$all": ["English", "Japanese"]},
    }},
    # imdb.rating is projected under the shorter name "rating"
    {"$project": {"_id": 0, "title": 1, "rating": "$imdb.rating"}},
    # SON keeps the sort keys in the given order
    {"$sort": SON([("rating", -1), ("title", 1)])},
    {"$skip": 10},  # THIS IS THE NEW STAGE
    {"$limit": 50},  # please do not remove this pipeline stage
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)

## Limiting the number of output documents
We used this all the time already.

In [None]:
# Same match / project / sort / skip stages as in the previous cell; the stage
# highlighted here is the final $limit.
pipeline = [ 
    { "$match": {
        "imdb.rating" : {"$gte" : 7},
        "genres" : { "$nin" : ["Crime", "Horror"] },
        "rated" : { "$in" : ["PG", "G"] },
        "languages" : { "$all" : [ "English", "Japanese"] }
    } },
    { "$project" : { "_id":0, "title":1, "rating":"$imdb.rating"} }, # here, we also project rating
    { "$sort" : SON([ ("rating", -1),  ("title", 1) ] ) },  # the SON yields an ordered mapping!
    { "$skip" : 10 },
    { "$limit" : 50}  # THIS IS THE NEW STAGE
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)

# Accumulator Expressions - Grouping

In [None]:
# Average rating per year, newest year first.
pipeline = [ 
    { "$match": {
        "year" : {"$type" : 16}  # only integer values (BSON type 16 = 32-bit integer)
    } },
    { "$project" : { "_id":0, "year":1, "title":1, "rating":"$imdb.rating"} }, # here, we also project rating
    # NEW STAGE:
    { "$group" : {  
        "_id" : "$year",  # the _id value (year) is comparable to a GROUP BY "year" in SQL
        "averageRating" : { "$avg" : "$rating"}  # we access "rating" in a calculating manner - $ required
    } },
    { "$sort" : {"_id":-1} }  # sort using the year descending (SON not necessary, only one field)
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)

In [None]:
# Average rating per year for the filtered movies, best-rated year first.
pipeline = [ 
    { "$match": {
        "imdb.rating" : {"$gte" : 7},
        "genres" : { "$nin" : ["Crime", "Horror"] },
        "rated" : { "$in" : ["PG", "G"] },
        "languages" : { "$all" : [ "English", "Japanese"] }
    } },
    { "$project" : { "_id":0, "year":1, "rating":"$imdb.rating"} }, # here, we also project rating
    # NEW STAGE
    { "$group" : {  
        "_id" : "$year",  # the _id value (year) is comparable to a GROUP BY "year" in SQL
        "averageRating" : { "$avg" : "$rating"}  # we access "rating" in a calculating manner - $ required
    } },
    { "$sort" : {"averageRating":-1} }  # sort by the average rating descending (SON not necessary, only one field)
]
result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)

<img src="exercise.png">

# Exercise 5.6 <img src="mongodb.png" width=120 align="right">


In [None]:
pipeline = [
    # PLACE YOUR PIPELINE STAGES HERE
]

result = execute_pipeline(pipeline, collection=agg_db.movies)
pd.DataFrame(result)