In [1]:
# !pip install pymongo

In [2]:
from pymongo import MongoClient

In [3]:
matplotlib inline

In [4]:
# Connect to the MongoDB server and select its `twitter` database.
mongo_client = MongoClient(host='18.236.138.158', port=27016)
database_reference = mongo_client['twitter']

In [6]:
# `collection_names()` was deprecated in PyMongo 3.7 and removed in 4.0;
# `list_collection_names()` returns the same list of collection names.
database_reference.list_collection_names()

['users', 'instructor_test_group', 'tweets']

In [7]:
# Handle to the `instructor_test_group` collection of the selected database.
collection_reference = database_reference['instructor_test_group']

In [8]:
# `Collection.count()` was deprecated in PyMongo 3.7 and removed in 4.0.
# `count_documents` with an empty filter counts every document in the collection.
collection_reference.count_documents({})

20000

In [9]:
# Run a one-stage pipeline: `$sample` with size 20.
cursor_sampl = collection_reference.aggregate(
    [
        {'$sample': {'size': 20}},
    ]
)

In [10]:
# Walk the cursor to the end and count how many documents it yields.
sum(1 for _ in cursor_sampl)

20

| | | | | |
|-|-|-|-|-|
| `_id`        | `truncated`                 |`user`            |`extended_tweet` | `favorited`                   |
| `created_at` | `in_reply_to_status_id`     |`geo`             |`quote_count`    | `retweeted`                   |
| `id`         | `in_reply_to_status_id_str` |`coordinates`     |`reply_count`    | `filter_level`                |
| `id_str`     | `in_reply_to_user_id`       |`place`           |`retweet_count`  | `lang`                        |
| `text`       | `in_reply_to_user_id_str`   |`contributors`    |`favorite_count` | `timestamp_ms`                |
| `source`     | `in_reply_to_screen_name`   |`is_quote_status` |`entities`       |                               |
























![](https://www.evernote.com/l/AAEO7gpKcKdI5YJeGrni4GhdlBiBWdTa3YgB/image.png)

![](https://i.imgflip.com/245tp9.jpg)

## The Aggregation Pipeline

A call to the aggregation framework defines a pipeline, the **aggregation pipeline** (shown in the figure below), where the output from each step in the pipeline provides the input to the next step. Each step executes a single operation on its input documents to transform them and generate output documents.

![](https://www.evernote.com/l/AAGxerRxKLZNFrjqxlYK2HPz1R11tr95FFkB/image.png)

### Useful Aggregation Pipeline Operations

- `$project` // Specify fields to be placed in the output document. Column filtering.
- `$match` // Select documents to be processed, similar to find(). Row filtering. 
- `$limit` // Limit the number of documents to be passed to the next step.
- `$skip` // Skip a specified number of documents.
- `$unwind` // Expand an array, generating one output document for each array entry.
- `$group` // Group documents by a specified key.
- `$sort` // Sort documents.
- `$geoNear` // Select documents near a geospatial location.
- `$out` // Write the results of the pipeline to a collection (new in v2.6).
- `$redact` // Control access to certain data (new in v2.6).

In [11]:
# Aggregation stage operator names, bound to constants so that pipelines below
# can be written without repeating the "$..." string literals.
PROJECT = '$project'
MATCH   = '$match'
LIMIT   = '$limit'
UNWIND  = '$unwind'
GROUP   = '$group'
SORT    = '$sort'
COUNT   = '$count'

In [12]:
# Collection that every aggregation example below runs against.
test_group = database_reference['instructor_test_group']

In [13]:
# Condition that filters out None: a field passes only when its value is not None.
not_empty = {"$ne": None}

# Two-stage pipeline: keep the tweets whose "geo" passes `not_empty`, then emit
# a single document whose "geo" field holds how many tweets were kept.
cursor = test_group.aggregate(
    [
        {MATCH: {"geo": not_empty}},
        {COUNT: "geo"},
    ]
)

In [14]:
# Pull the first document from the cursor; the pipeline ends in a count stage,
# so this is the document carrying the count under the "geo" key.
next(cursor)

{'geo': 2952}

In [18]:
# Alternative: give each stage a name, then assemble the pipeline from the names.
match_non_null_geo = {MATCH: {"geo": not_empty}}  # keep tweets with a geo value
count_geo = {COUNT: "geo"}                        # report the total under "geo"

dag_count_non_null_geo = [match_non_null_geo, count_geo]

dag_count_non_null_geo

[{'$match': {'geo': {'$ne': None}}}, {'$count': 'geo'}]

In [19]:
# Run the named pipeline and take the first document of its result, which
# carries the count under the "geo" key.
next(test_group.aggregate(dag_count_non_null_geo))

{'geo': 2952}

### Group Template

    { $group: { _id: <expression>, <field1>: { <accumulator1> : <expression1> }, ... } }
    
#### Accumulators

- `$sum`
- `$avg`
- `$first`
- `$last`
- `$max`
- `$min`
- `$stdDevPop`
- `$stdDevSamp`

In [21]:
# Comparison condition: strictly greater than 10.
greater_than_10 = dict([("$gt", 10)])
# Accumulator that adds 1 for each occurrence, i.e. a per-group count.
sum_1 = dict([("$sum", 1)])

def group_and_count(key):
    """Build a ``$group`` stage that counts documents per distinct ``key``.

    ``key`` is the grouping expression, used as the group's ``_id``
    (for example ``'$lang'``). Each output group carries a ``count``
    field accumulated with ``sum_1``.
    """
    group_spec = {"_id": key, "count": sum_1}
    return {GROUP: group_spec}

# Post-group filter: keep only the groups whose "count" is greater than 10.
match_count_gt_10 = {MATCH: dict(count=greater_than_10)}

# Order the groups from the largest "count" to the smallest (-1 = descending).
sort_by_count_descending = {SORT: dict(count=-1)}

def limit(val):
    """Build a ``$limit`` stage whose argument is ``val``."""
    stage = {}
    stage[LIMIT] = val
    return stage

In [23]:
# Count tweets per language, keep the languages counted more than 10 times,
# and list them from most to least frequent.
list(
    test_group.aggregate([
        group_and_count('$lang'),
        match_count_gt_10,
        sort_by_count_descending,
        # limit(10)
    ])
)


[{'_id': 'en', 'count': 16996},
 {'_id': 'und', 'count': 1815},
 {'_id': 'es', 'count': 295},
 {'_id': 'tl', 'count': 126},
 {'_id': 'fr', 'count': 121},
 {'_id': 'pt', 'count': 76},
 {'_id': 'ht', 'count': 66},
 {'_id': 'ja', 'count': 61},
 {'_id': 'ar', 'count': 49},
 {'_id': 'it', 'count': 48},
 {'_id': 'in', 'count': 41},
 {'_id': 'et', 'count': 35},
 {'_id': 'tr', 'count': 24},
 {'_id': 'nl', 'count': 21},
 {'_id': 'ko', 'count': 20},
 {'_id': 'fa', 'count': 20},
 {'_id': 'lv', 'count': 20},
 {'_id': 'hi', 'count': 16},
 {'_id': 'de', 'count': 15},
 {'_id': 'fi', 'count': 15},
 {'_id': 'cy', 'count': 13},
 {'_id': 'no', 'count': 12},
 {'_id': 'da', 'count': 11}]

In [24]:
# Condition satisfied when a field is anything other than an empty array.
not_an_empty_array = {"$ne": []}

# Keep only the tweets whose hashtag array is not empty.
match_non_empty_hashtag_arrays = {
    MATCH: {"entities.hashtags": not_an_empty_array},
}

# Reduce each tweet to one "text" field taken from its hashtags' text values
# (dot notation reaches into documents nested in a document); "_id": 0 drops the id.
project_to_text_only = {
    PROJECT: {
        "text": "$entities.hashtags.text",
        "_id": 0,
    },
}

# Emit one output document per entry of the "text" array.
unwind_text = {UNWIND: "$text"}

# Preview the first ten individual hashtags.
list(
    test_group.aggregate([
        match_non_empty_hashtag_arrays,  # tweets whose hashtag array is not empty
        project_to_text_only,            # keep only the hashtag text
        unwind_text,                     # one document per hashtag
        limit(10),
    ])
)


[{'text': 'photos'},
 {'text': 'Artist'},
 {'text': 'LosAngeles'},
 {'text': 'Accounting'},
 {'text': 'Job'},
 {'text': 'Jobs'},
 {'text': 'Hiring'},
 {'text': 'CareerArc'},
 {'text': 'sanrio'},
 {'text': 'turquoise'}]

In [25]:
# Ten most frequent hashtags (case-sensitive) among those counted more than 10 times.
list(
    test_group.aggregate([
        match_non_empty_hashtag_arrays,
        project_to_text_only,
        unwind_text,
        group_and_count('$text'),  # one group per distinct hashtag text
        match_count_gt_10,
        sort_by_count_descending,
        limit(10),
    ])
)

[{'_id': 'job', 'count': 395},
 {'_id': 'Hiring', 'count': 308},
 {'_id': 'LosAngeles', 'count': 286},
 {'_id': 'CareerArc', 'count': 240},
 {'_id': 'hiring', 'count': 149},
 {'_id': 'Job', 'count': 107},
 {'_id': 'Jobs', 'count': 107},
 {'_id': 'earthquake', 'count': 67},
 {'_id': 'LA', 'count': 56},
 {'_id': 'losangeles', 'count': 49}]

In [26]:
# Hashtags to leave out of the ranking, written in lower case.
job_hashtags = ['job', 'jobs', 'hiring', 'careerarc']  # job ads
location_hashtags = [  # place names
    'california', 'losangeles', 'la', 'santamonica', 'glendale', 'paloalto',
]

# Lower-case every hashtag so that spellings differing only in case
# (e.g. 'Job' and 'job') fall into the same group.
project_to_lower = {PROJECT: {"text": {"$toLower": "$text"}}}

# Applied after grouping, where the hashtag is the group's "_id": drop the
# groups whose hashtag appears in either exclusion list.
match_not_in_bad = {MATCH: {"_id": {"$nin": [*job_hashtags, *location_hashtags]}}}

# Rank hashtags case-insensitively, leaving out the job and location tags.
list(test_group.aggregate(
    [
        match_non_empty_hashtag_arrays,
        project_to_text_only,
        unwind_text,
        project_to_lower,        # lower-case before grouping so case variants merge
        group_and_count('$text'),
        match_not_in_bad,        # the hashtag is the group's "_id" at this point
        match_count_gt_10,       # fixed: `match_count_gt` is not defined anywhere (NameError)
        sort_by_count_descending,
        limit(50)
    ]
))

#top 50 hashtags after all the filtering. 

[{'_id': 'earthquake', 'count': 67},
 {'_id': 'goldenglobes', 'count': 56},
 {'_id': 'quake', 'count': 46},
 {'_id': 'art', 'count': 40},
 {'_id': 'healthcare', 'count': 38},
 {'_id': 'superbowl', 'count': 28},
 {'_id': 'retail', 'count': 26},
 {'_id': 'sales', 'count': 25},
 {'_id': 'rn', 'count': 25},
 {'_id': 'marketing', 'count': 25},
 {'_id': 'gonancygo', 'count': 24},
 {'_id': 'hospitality', 'count': 23},
 {'_id': 'grammys', 'count': 22},
 {'_id': 'repost', 'count': 22},
 {'_id': 'it', 'count': 21},
 {'_id': 'releasethememo', 'count': 21},
 {'_id': 'timesup', 'count': 19},
 {'_id': 'nsng', 'count': 19},
 {'_id': 'love', 'count': 19},
 {'_id': 'clerical', 'count': 18},
 {'_id': 'businessmgmt', 'count': 17},
 {'_id': 'tv', 'count': 17},
 {'_id': 'hollywood', 'count': 16},
 {'_id': 'trumpshutdown', 'count': 16},
 {'_id': 'script', 'count': 16},
 {'_id': 'beverlyhills', 'count': 15},
 {'_id': 'dtla', 'count': 15},
 {'_id': 'comedy', 'count': 15},
 {'_id': 'actorslife', 'count': 14},
