### Who Tweeted Most?

1. Group tweets by user
2. Count each user's tweets
3. Sort into descentding order
4. Select user at top

In [6]:
import json 
import pprint

def insert_data(data, db):
    for a in data:
        db.twitter.insert(a)

if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples
    
    # Available here: http://content.udacity-data.com/ud032/twitter/twitter.json.zip
    with open('twitter.json', 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/loading-several-text-files-into-mongodb-using-pymongo
        for tweet in f.readlines():
            db.twitter.insert(json.loads(tweet))
    pprint.pprint(db.twitter.find_one())



{'_id': ObjectId('5b101612a24d683bec191b47'),
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:23 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': 22819396900,
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'eu preciso de terminar de fazer a minha tabela, está muito foda **',
 'truncated': False,
 'user': {'contributors_enabled': False,
          'created_at': 'Fri Jul 03 21:44:05 +0000 2009',
          'description': 'só os loucos sabem (:',
          'favourites_count': 1,
          'follow_request_sent': None,
          'followers_count': 102,
          'following': None,
          'friends_count': 73,
          'geo_enabled': False,
          'id': 53507833,
          'lang': 'en',
          'listed_count': 0,
          'location': '',
          'nam

## $$$group,  $sort  $$

In [1]:
from pymongo import MongoClient
import pprint

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

In [2]:
def make_pipeline():
    pipeline = [{"$group": {"_id": "$source",
                            "count": {"$sum": 1}}},
                 {"$sort": {"count": -1}},
                 {"$limit" : 5 }]
    return pipeline

In [3]:
def tweet_sources(db, pipeline):
    result = db.tweets.aggregate(pipeline)
    return result

In [4]:
if __name__ == '__main__':
    db = get_db('example_tweets')
    pipeline = make_pipeline()
    result = tweet_sources(db, pipeline)
    import pprint
    pprint.pprint(result)

<pymongo.command_cursor.CommandCursor object at 0x1034c55f8>


## $$$project,  $match,  $skip,  $limit,  $unwind  $$

### Who has the highest follower/friends ratio?

In [7]:
def highest_ratio():
    result = db.tweets.aggregate([
        {"$match": {"user.friends_count": {"$gt": 0},
                   "user.follower_count": {"$gt": 0}}},
        {"$project": {"ratio":{"$divide": ["$user.followers_count",
                                          "$user.friends_count"]},
                     "screen_name": "$user.screen_name"}},
        {"$sort": {"ratio": -1}},
        {"$limit": 1}
    ])
    
    return result

### Use $project to:
- Include fields from the original document
- Insert computed fields
- Rename fields
- Create fields that hold sub-documents

In [8]:
def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

In [9]:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [ 
        {"$match": {"user.time_zone": "Brasilia",
                   "user.statuses_count": {"$gte": 100}}},
        {"$project": {"followers": "$user.followers_count",
                      "screen_name": "$user.screen_name",
                      "tweets": "$user.statuses_count"}},
        {"$sort": {"followers": -1}},
        {"$limit": 1}         
        ]
    return pipeline

In [10]:
def aggregate(db, pipeline):
    return [doc for doc in db.tweets.aggregate(pipeline)]

In [12]:
if __name__ == '__main__':
    db = get_db('twitter')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result)

[]


### Why use $project?

#### For example, to answer the question "Who included the most user mentions?"

In [13]:
def user_mentions():
    result = db.tweets.aggregate([
        {"$unwind": "$entities.user_mentions"},
        {"$group": {"_id": "$user.screen_name",
                   "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 1}
    ])
    
    return result

In [14]:
if __name__ == '__main__':
    result = user_mentions()
    pprint.pprint(result)

<pymongo.command_cursor.CommandCursor object at 0x10356d940>


In [15]:
## Another example
#For this exercise, let's return to our cities infobox dataset. The question we would like you to answer
#is as follows:  Which region or district in India contains the most cities? (Make sure that the count of
#cities is stored in a field named 'count'; see the assertions at the end of the script.)

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$unwind": "$isPartOf"},
                {"$match": {"country": "India"}},
                {"$group": {"_id": "$isPartOf",
                            "count": {"$sum": 1}}},
                {"$sort": {"count": -1}}
                ]
    return pipeline

### $group operation:

- $sum

- $first

- $last

- $max

- $min

- $avg

In [16]:
def hashtag_retweet_avg():
    result = db.tweets.aggregate([
        {"$unwind": "$entities.hastags"},
        {"$group": {"_id": "$entities.hastags.text",
                   "retweet_avg": {"$avg": "$retweet_count"}}},
        {"$sort": {"retweet_avg": -1}}
    ])
    
    return result

## Arrays: 

- $push 

- $addToSet

In [17]:
def hashtag_retweet_avg():
    result = db.tweets.aggregate([
        {"$unwind": "$entities.hastags"},
        {"$group": {"_id": "$entities.hastags.text",
                   "unique_hashtags": {
                       "$addToSet": "$$entities.hashtags.text"}}},
        {"$sort": {"_id": -1}}
    ])
    
    return result

In [18]:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [
        {"$group": {"_id": "$user.screen_name",
                    "tweet_texts": {"$push": "$text"},
                    "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 5}
        ]
    return pipeline

### Who has mentioned the most unique users?

In [19]:
def unique_user_mentions():
    result = db.tweets.aggregate([
        {"$unwind": "$entities.user_mentions"},
        {"$group": {
            "_id": "$user.screen_name",
            "mset": {
                "$addToSet": "$entities.user_mentions.screen_name"
            }
        }},
        {"$unwind": "$mset"},
        {"$group": {"_id": "$_id",
                   "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10}
    ])
    
    return result

In [20]:
## Another example:
def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{"$match": {"country": "India"}},
                # First, match India as the country of interest; data contains world data.
                {"$unwind": "$isPartOf"},
                # Unwind regions; some cities belong to multiple regions.
                {"$group": {"_id": "$isPartOf",
                            # Now group on each region.
                            "totPop": {"$sum": "$population"},
                            # Sum up the population of all of the cities for each region.
                            "count": {"$sum": 1},
                            # Count the number of times each region shows up.
                 "average": {"$avg": "$population"}}},
                # Create an average for each region.
                {"$group": {"_id": "India Regional City Population Average",
                # Now group by a constant to group everything together.
                 "avg": {"$avg": "$average"}}}]
                # And finally, get an average of the average region populations.
    return pipeline

## Indexes