# Census 2016 Twitter Insights

In [13]:
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from pymongo import MongoClient 
import json
import re
from aylienapiclient import textapi
from unidecode import unidecode
import googlemaps


#text analytics imports
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer


# Shared text-processing helpers, built once and reused by tweet_cleaner().
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()
LEMMER = WordNetLemmatizer()
tweet_tokenizer = TweetTokenizer()


#Twitter API credentials
# NOTE(review): this module-level `auth` is not referenced again in the visible code;
# streamer() builds its own OAuthHandler from the four variables below.
auth = tweepy.OAuthHandler('YOUR_INFO', 'YOUR_INFO')
auth.set_access_token('YOUR_INFO', 'YOUR_INFO')

# placeholders - replace with real credentials before running (passed to streamer())
consumer_key = 'YOUR_INFO'
consumer_secret = 'YOUR_INFO'
access_token = 'YOUR_INFOYOUR_INFO'
access_token_secret = 'YOUR_INFO'

#Google Maps API key
gmaps = googlemaps.Client(key='YOUR_INFO')

#AYLIEN credentials
#rjshanahan
my_aylien = textapi.Client("YOUR_INFO", "YOUR_INFO")



#MongoDB connection
# database "twitter01", collection "tweets_census" - every cell below reads/writes this collection
client = MongoClient('YOUR_INFO')
db = client.twitter01
collection = db.tweets_census 


#regex patterns
# single characters stripped from the cleaned text (punctuation, digits, tab/CR/LF)
# NOTE(review): the non-ASCII characters in this class look mis-encoded - confirm against the original notebook
problemchars = re.compile(r'[\[=\+/&<>;:!\\|*^\'"\?%$.@)Â°#(_\,\t\r\n0-9-â€”\]]')
# http/https URLs
url_finder = re.compile(r'http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# NOTE(review): each "\\-" below is an escaped (literal) hyphen, so this class matches the
# eight listed code points and '-' rather than four code point ranges - confirm this is intended
emojis = re.compile("["
        u"\U0001F600\\-\U0001F64F"  # emoticons
        u"\U0001F300\\-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680\\-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0\\-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
# whole-word English stopwords together with any whitespace that follows them
stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# username = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')
# "@name " mentions - only matches when the handle is directly followed by a space
username = re.compile(r'(@)\w+( )')
# leading "rt " marker (applied after the text has been lower-cased)
retweeted = re.compile(r'^rt ')
# hashtag = re.compile(r'#(\w+)')
# NOTE(review): reempty is not referenced elsewhere in the visible code
reempty = re.compile(r'^$|\s+')


#function to flatten nested dictionaries in tweet JSON object
def flatten(indict, current_key=None, outerdict=None):
    """Collapse a nested dict into a single-level dict.

    Nested keys are joined with a double underscore, e.g.
    ``{'user': {'name': 'x'}}`` becomes ``{'user__name': 'x'}``.

    indict      -- the (possibly nested) dict to flatten
    current_key -- key prefix accumulated so far (used by the recursion)
    outerdict   -- dict that receives the flattened items; created when omitted

    Returns the dict holding the flattened items (``outerdict`` when given).
    """
    flat = {} if outerdict is None else outerdict

    for name, item in indict.items():
        # only prefix when there is a non-empty parent key
        if current_key:
            path = current_key + '__' + name
        else:
            path = name

        # exact-type check: only plain dicts are descended into
        if type(item) is dict:
            flatten(item, current_key=path, outerdict=flat)
        else:
            flat[path] = item

    return flat



#function to tokenise a Tweet body and return its cleaned, stemmed and lemmatised word tokens
def tweet_cleaner(bodyText):
    """Tokenise a tweet and return its normalised word tokens.

    Each token is lower-cased, stripped of punctuation characters and digits,
    dropped if it is an English stopword, stemmed (Porter) and then lemmatised
    (WordNet). Tokens that contain 'http' or are shorter than three characters
    after processing are discarded.

    bodyText -- the raw tweet text

    Returns a list of processed tokens in their original order.
    """
    #append nltk libraries - only once, otherwise nltk.data.path grows on every call
    nltk_data_dir = "/Users/rjshanahan/nltk_data"
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

    cleaned = []
    for token in tweet_tokenizer.tokenize(bodyText):
        word = ''.join(ch for ch in token.lower()
                       if ch not in PUNCTUATION and not ch.isdigit())

        # stopwords are checked before stemming, as the stemmer would alter them
        if word in STOPWORDS:
            continue

        word = LEMMER.lemmatize(STEMMER.stem(word))

        # drop link fragments and very short leftovers (including empty strings)
        if 'http' not in word and len(word) > 2:
            cleaned.append(word)

    return cleaned


def transform_tweet(line):
    """Strip the retweet prefix and space-terminated hashtags from a tweet.

    Removes the first "RT @user: " marker, then every "#tag " (a hashtag that
    is followed by a space), and trims surrounding whitespace.

    line -- the raw tweet text

    Returns the cleaned string.
    """
    # raw strings: '\w' in a normal string literal is an invalid escape sequence
    without_rt = re.sub(r'RT @\w+: ', '', line, count=1)
    return re.sub(r'#\w+ ', '', without_rt).strip()



#define class for streaming Twitter data
class StdOutListener(StreamListener):
    """Stream listener that stores selected fields of each tweet in MongoDB.

    Every status received is flattened, reduced to the fields of interest,
    inserted into the module-level ``collection`` and printed.
    """

    def on_data(self, tweet_data):
        """Handle one raw message from the stream.

        tweet_data -- JSON string as delivered by the streaming API

        Returns True so that the stream keeps running.
        """
        #define objects: https://dev.twitter.com/overview/api/tweets
        raw_tweet = json.loads(tweet_data)

        #messages without a "text" field are not tweets - skip them instead of
        #failing with a KeyError on the lookups below
        if "text" not in raw_tweet:
            return True

        tweet = flatten(raw_tweet)

        #elements of interest
        text = tweet["text"]
#         text_sentiment = my_aylien.Sentiment(text)
        text_token = tweet_cleaner(text)

        #cleaning chain, innermost first: ascii-only + lower-case, then remove
        #@mentions, URLs, emojis, problem characters, stopwords and a leading "rt "
        #NOTE(review): relies on encode() returning a str (Python 2) - confirm before porting
        text_clean = text.encode('ascii', 'ignore').lower().strip()
        for pattern in (username, url_finder, emojis, problemchars, stop, retweeted):
            text_clean = pattern.sub('', text_clean)

        #"place__*" keys are absent from the flattened tweet when it has no place
        #information; values stay wrapped in a one-element list as before
        place_country = [tweet.get("place__country", "no_geo")]
        place_countrycode = [tweet.get("place__country_code", "no_geo")]
        place_name = [tweet.get("place__name", "no_geo")]
        place_type = [tweet.get("place__place_type", "no_geo")]

        #create dict to insert into MongoDB
        obj = {
            "id_str": tweet["id_str"],
            "created_at": tweet["created_at"],
            "text": text,
#             "text_sentiment":text_sentiment,
            "text_token": text_token,
            "text_clean": text_clean,
            #read from the un-flattened message: when "coordinates" is an object,
            #flatten() replaces it with "coordinates__*" keys
            "coordinates": raw_tweet.get("coordinates"),
            "favorites": tweet["favorite_count"],
            "retweets": tweet["retweet_count"],
            "user": tweet["user__name"],
            "user_follower": tweet["user__followers_count"],
            "user_friend": tweet["user__friends_count"],
            #NOTE(review): "user_tweets" is filled from listed_count - confirm this is intended
            "user_tweets": tweet["user__listed_count"],
            "user_location": tweet["user__location"],
            "user_statuses": tweet["user__statuses_count"],
            "user_screen_name": tweet["user__screen_name"],
            "ent_hashtag": tweet["entities__hashtags"],
            "ent_user_mention": tweet["entities__user_mentions"],
            "place_country": place_country,
            "place_countrycode": place_countrycode,
            "place_name": place_name,
            "place_type": place_type
              }

        #insert into MongoDB
        collection.insert_one(obj)

        print(obj)

        return True

    def on_error(self, status):
        """Print the status code reported by the stream; returns None."""

        #error 420 = API throttling - too many connections usually
        print(status)




#stream tweets that match the 'census' keyword
def streamer(consumer_key, consumer_secret, access_token, access_token_secret):
    """Open a Twitter stream filtered on 'census' and feed it to StdOutListener.

    consumer_key, consumer_secret, access_token, access_token_secret
        -- Twitter API credentials used to build the OAuth handler

    Blocks while the stream is running. When the stream is interrupted or
    fails, it is disconnected and the function returns normally.
    """
    #the listener is always created; it used to be created only under
    #`if __name__ == '__main__'`, which left it undefined otherwise
    listener = StdOutListener()

    #Twitter API access with streaming class
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    stream = Stream(auth, listener)

    try:
        stream.filter(track=['census'])
    except KeyboardInterrupt:
        #interrupting is how a running stream gets stopped by hand
        stream.disconnect()
    except Exception as err:
        #still best-effort, but say why the stream ended instead of hiding it
        print('Stream stopped after error: ' + repr(err))
        stream.disconnect()
        
      

        
#call streamer function
# report the collection size, stream until stopped, then report the size again
collection_size = db.tweets_census.count()
print('Existing Census Tweets Collection Size: ' + str(collection_size) + '\n')

streamer(consumer_key, consumer_secret, access_token, access_token_secret)

collection_size = db.tweets_census.count()
print('\nNew Census Tweets Collection Size: ' + str(collection_size) + '\n')




# SENTIMENT ANALYSIS
#add new empty field
#add new field - records without a sentiment get the placeholder value 1
db.tweets_census.update_many({"text_sentiment":{"$exists" : 0}},
                        {"$set" : {"text_sentiment":1}},
                        False)


#initialise bulk UPDATE process for "text_sentiment" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0

for record in db.tweets_census.find(
                      {"text":{"$exists" : 1},
                      "text_sentiment":{"$eq" : 1}},                #'where' clause - placeholder records only
                      {"text":1,
                       "_id":1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_text = record["text"]

    my_sentiment = my_aylien.Sentiment(my_text.encode('utf-8'))     #call the AYLIEN API

    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'text_sentiment': my_sentiment } })
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()

    
    
    
    
#GEOCODE PLACE and USER_LOCATION NAME VIA GOOGLE MAPS API
#function to geocode PLACE_NAME
def lat_lng_finder(gmaps, place_name):
    """Geocode a place name and return its latitude and longitude.

    gmaps      -- client object exposing ``geocode(place_name)``
    place_name -- free-text location to look up

    Returns a dict with 'latitude' and 'longitude' taken from the first
    geocoding result. Both values are empty strings when the lookup fails or
    returns no usable result.
    """
    try:
        location = gmaps.geocode(place_name)[0]['geometry']['location']
        return {
            'latitude': location['lat'],
            'longitude': location['lng']
        }
    except Exception:
        # best-effort lookup; unlike a bare "except" this still lets
        # KeyboardInterrupt stop a long geocoding run
        return {
            'latitude': "",
            'longitude': ""
        }
    
    
    
    
#add new empty field - set to '1' to make it identifiable
db.tweets_census.update_many({
#         "lat_lon" : {"$exists" : 0},
                              "lat_lon_loc" : {"$exists" : 0}
                             },
                        {
#         "$set" : {"lat_lon":1},
                         "$set" : {"lat_lon_loc":1}
                        },
                        False)


#initialise bulk UPDATE process for "lat_lon_loc" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0


for record in db.tweets_census.find(
                    # all user_location conditions sit in ONE operator document: repeating
                    # the "user_location" key in a dict literal keeps only the last entry
                    { "user_location" : {"$exists" : 1,
                                         "$nin" : ["None", 1]},
#                       "place_name" : {"$exists" : 1},
#                       "place_name": {"$ne" : "no_geo"},
#                       "lat_lon":{"$eq" : 1},
                      "lat_lon_loc":{"$eq" : 1}
                    },                #'where' clause
                        {"user_location" : 1,
#                          "place_name" : 1,
                         "_id" : 1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_loc = record["user_location"]
#     my_place = record["place_name"]

    my_lat_lon_loc = lat_lng_finder(gmaps, my_loc)
#     my_lat_lon = lat_lng_finder(gmaps, my_place)

    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'lat_lon_loc': my_lat_lon_loc }})
#                                                '$set': { 'lat_lon': my_lat_lon }})
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()

#set '1' fields to the string "null" for those that weren't coded
db.tweets_census.update_many({"lat_lon_loc":{"$eq":1}},
#                               "lat_lon":{"$eq":1}},
#                         {"$set" : {"lat_lon":"null"},
                        {"$set" : {"lat_lon_loc":"null"}},
                        False)


Existing Census Tweets Collection Size: 1370

{'text': u"RT @veritygorman: .@ABSCensus officials will visit Robinvale today ahead of next month's Census. There have been concerns about under repor\u2026", 'place_countrycode': ['no_geo'], 'place_name': ['no_geo'], 'user_screen_name': u'ABCMilduraSwanH', 'ent_user_mention': [{u'id': 288610842, u'indices': [3, 16], u'id_str': u'288610842', u'screen_name': u'veritygorman', u'name': u'Verity Gorman'}, {u'id': 172532664, u'indices': [19, 29], u'id_str': u'172532664', u'screen_name': u'ABSCensus', u'name': u'Census Australia'}], 'coordinates': None, 'id_str': u'755170555892948993', 'retweets': 0, 'user_friend': 2435, 'text_clean': 'veritygorman officials visit robinvale today ahead next months census concerns repor', 'user_follower': 3298, 'user': u'ABC Mildura SwanHill', 'favorites': 0, 'user_tweets': 101, 'place_country': ['no_geo'], 'ent_hashtag': [], 'text_token': [u'veritygorman', u'abscensu', u'offici', u'visit', u'robinval', u'today', 

<pymongo.results.UpdateResult at 0x10453aeb0>

# Add Additional Fields against Streamed Twitter data
- adds 'clean' text field
- then does sentiment analysis
- then geocodes tweets based on user's location

In [None]:
# TEXT CLEANER
#add new empty field
#add new field - records without cleaned text get the placeholder value 1
db.tweets_census.update_many({"text_clean":{"$exists" : 0}},
                        {"$set" : {"text_clean":1}},
                        False)


#initialise bulk UPDATE process for "text_clean" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0

for record in db.tweets_census.find(
                      {"text":{"$exists" : 1},
                      "text_clean":{"$eq" : 1}},                #'where' clause - placeholder records only
                      {"text":1,
                       "_id":1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_text = record["text"]

    # cleaning chain, innermost first: ascii-only + lower-case, then remove
    # @mentions, URLs, emojis, problem characters, stopwords and a leading "rt "
    my_text_clean = retweeted.sub('', stop.sub('', problemchars.sub('', emojis.sub('', url_finder.sub('', username.sub('', my_text.encode('ascii', 'ignore').lower().strip()))))))


    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'text_clean': my_text_clean } })
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()

    
    




# SENTIMENT ANALYSIS
#add new empty field
#add new field - records without a sentiment get the placeholder value 1
db.tweets_census.update_many({"text_sentiment":{"$exists" : 0}},
                        {"$set" : {"text_sentiment":1}},
                        False)


#initialise bulk UPDATE process for "text_sentiment" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0

for record in db.tweets_census.find(
                      {"text":{"$exists" : 1},
                      "text_sentiment":{"$eq" : 1}},                #'where' clause - placeholder records only
                      {"text":1,
                       "_id":1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_text = record["text"]

    my_sentiment = my_aylien.Sentiment(my_text.encode('utf-8'))     #call the AYLIEN API

    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'text_sentiment': my_sentiment } })
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()

    
    
    
    
    
    
#GEOCODE PLACE and USER_LOCATION NAME VIA GOOGLE MAPS API
#function to geocode PLACE_NAME
def lat_lng_finder(gmaps, place_name):
    """Geocode a place name and return its latitude and longitude.

    gmaps      -- client object exposing ``geocode(place_name)``
    place_name -- free-text location to look up

    Returns a dict with 'latitude' and 'longitude' taken from the first
    geocoding result. Both values are empty strings when the lookup fails or
    returns no usable result.
    """
    try:
        location = gmaps.geocode(place_name)[0]['geometry']['location']
        return {
            'latitude': location['lat'],
            'longitude': location['lng']
        }
    except Exception:
        # best-effort lookup; unlike a bare "except" this still lets
        # KeyboardInterrupt stop a long geocoding run
        return {
            'latitude': "",
            'longitude': ""
        }
    
    
    
    
#add new empty field - set to '1' to make it identifiable
db.tweets_census.update_many({
#         "lat_lon" : {"$exists" : 0},
                              "lat_lon_loc" : {"$exists" : 0}
                             },
                        {
#         "$set" : {"lat_lon":1},
                         "$set" : {"lat_lon_loc":1}
                        },
                        False)


#initialise bulk UPDATE process for "lat_lon_loc" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0


for record in db.tweets_census.find(
                    # all user_location conditions sit in ONE operator document: repeating
                    # the "user_location" key in a dict literal keeps only the last entry
                    { "user_location" : {"$exists" : 1,
                                         "$nin" : ["None", 1]},
#                       "place_name" : {"$exists" : 1},
#                       "place_name": {"$ne" : "no_geo"},
#                       "lat_lon":{"$eq" : 1},
                      "lat_lon_loc":{"$eq" : 1}
                    },                #'where' clause
                        {"user_location" : 1,
#                          "place_name" : 1,
                         "_id" : 1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_loc = record["user_location"]
#     my_place = record["place_name"]

    my_lat_lon_loc = lat_lng_finder(gmaps, my_loc)
#     my_lat_lon = lat_lng_finder(gmaps, my_place)

    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'lat_lon_loc': my_lat_lon_loc }})
#                                                '$set': { 'lat_lon': my_lat_lon }})
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()

#set '1' fields to the string "null" for those that weren't coded
db.tweets_census.update_many({"lat_lon_loc":{"$eq":1}},
#                               "lat_lon":{"$eq":1}},
#                         {"$set" : {"lat_lon":"null"},
                        {"$set" : {"lat_lon_loc":"null"}},
                        False)


In [69]:
# number of documents currently stored in the tweets_census collection
db.tweets_census.count()

3092

# Emoji Analysis
- http://stats.seandolinar.com/emoji-utf-8-and-python/
- https://github.com/seandolinar/socialmediaparse

In [60]:
import pymongo
import re
import os
import pandas as pd
import io


#loads emoji key
##os.chdir('##directory##')##
# emoji lookup table; its index supplies the emoji strings that are searched for
emoji_key = pd.read_csv('emoji_table.txt', encoding='utf-8', index_col=0)

# emoji_key.head(20)

# start every emoji at zero and keep the running totals in a plain dict
emoji_key['count'] = 0
emoji_dict = emoji_key['count'].to_dict()

emoji_list = []
tweet_list = collection.find({'text' : { '$exists' : 1 }} )

for tweet in tweet_list:
    for emoji in emoji_dict:
        # substring test: counts tweets containing the emoji, not individual occurrences
        if emoji not in tweet['text']:
            continue
        emoji_list.append(tweet['text'])
        emoji_dict[emoji] += 1

#prints number of unique number of tweets with emoji in them
# print(len(set(emoji_list)))

#print emoji_dict
# print(tweet_list.count())

# build a data frame of the counts and display the 20 most frequent emoji
emoji_count = pd.DataFrame(emoji_dict.items(), columns=['emoji', 'count'])
emoji_count.sort_values(by="count", ascending=False).head(20)

#writes output file
# with open('emoji_out.csv', 'w') as f: 
#     emoji_count.to_csv(f, sep=',', index = False, encoding='utf-8') 

Unnamed: 0,emoji,count
535,😂,20
50,👇,9
537,😌,6
489,😼,4
536,ðŸ˜,4
917,🎉,4
675,❤️,3
214,💯,3
543,😊,2
525,😘,2


# Bulk Update specific field

In [226]:
#add new empty field
# db.tweets_census.update_many({"text_clean":{"$exists" : 0}},
#reset "text_clean" to the placeholder value 1 on EVERY record so all of them are rebuilt
db.tweets_census.update_many({},
                        {"$set" : {"text_clean":1}},
                        False)


# db.tweets_census.update_many({"lat_lon":{"$eq":1}},
#                         {"$set" : {"lat_lon":"null"}},
#                         False)


#initialise bulk UPDATE process for "text_clean" field
bulk = db.tweets_census.initialize_ordered_bulk_op()
counter = 0

for record in db.tweets_census.find(
                      {"text":{"$exists" : 1},
                      "text_clean":{"$eq" : 1}},                #'where' clause - placeholder records only
                      {"text":1,
                       "_id":1},
                modifiers={"$snapshot": True}):                     #snapshot: do not return a document twice while it is being updated

    my_text = record["text"]

    # same cleaning chain as the streaming listener and the TEXT CLEANER cell (the emoji and
    # leading "rt " steps were missing here), innermost first: ascii-only + lower-case, then
    # remove @mentions, URLs, emojis, problem characters, stopwords and a leading "rt "
    my_text_clean = retweeted.sub('', stop.sub('', problemchars.sub('', emojis.sub('', url_finder.sub('', username.sub('', my_text.encode('ascii', 'ignore').lower().strip()))))))

    # queue the update; it is sent to MongoDB in batches
    bulk.find({ '_id': record['_id'] }).update({ '$set': { 'text_clean': my_text_clean } })
    counter += 1                                                    #was "=+ 1", which reset the counter to 1 every time

    # flush every 1000 queued updates and start a new batch
    if counter % 1000 == 0:
        bulk.execute()
        bulk = db.tweets_census.initialize_ordered_bulk_op()

# flush the final partial batch (skipped when nothing is queued)
if counter % 1000 != 0:
    bulk.execute()


# RUN Queries and Aggregations

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pymongo
import pprint as pp
    
    
#mongodb connection function
def get_db(db_name, coll_name):
    """Connect to MongoDB and return the database called *db_name*.

    db_name   -- name of the database to open
    coll_name -- kept for backward compatibility; the database (not the
                 collection) is returned, so this argument is not used

    Returns the pymongo database object.
    """
    from pymongo import MongoClient

    #MongoDB connection
    client = MongoClient('YOUR_INFO')

    return client[db_name]



#function to capture aggregation framework query
def make_pipeline():
    """Return the aggregation pipeline for the 20 most frequent screen names.

    Stages: group by user_screen_name with a count, sort by count descending,
    keep the first 20.
    """
    group_stage = {"$group": {"_id" : '$user_screen_name',
                              "count" : {"$sum" : 1}}}
    sort_stage = {"$sort" : {"count" : -1}}
    limit_stage = {"$limit" : 20}

    return [group_stage, sort_stage, limit_stage]


#function to call aggregation query
def aggregator(db, pipeline):
    """Run *pipeline* on the tweets_census collection of *db* and return the result."""
    return db.tweets_census.aggregate(pipeline)



#CALL AGGREGATOR
# if __name__ == '__main__':
    
#     db = get_db('twitter01', 'tweets_census')
#     pipeline = make_pipeline()
#     result = aggregator(db, pipeline)
    
#     pp.pprint(list(result))
    
    
    

#CALL QUERY
if __name__ == '__main__':

    db = get_db('twitter01', 'tweets_census')

    # both user_location conditions sit in ONE operator document: repeating the
    # "user_location" key in a dict literal keeps only the last entry, which
    # silently dropped the "$exists" test
    result = db.tweets_census.find(
                    { "user_location" : {"$exists" : 1,
                                         "$ne" : "None"},
                      "lat_lon_loc" : {"$exists" : 1},
                    },                #'where' clause
                        {"user_location" : 1,
                         "lat_lon_loc" : 1,
                         "_id" : 0},
                                     )

#                                   {"text_sentiment":{"$exists" : 1}} ,                #'where' clause
#                                   {"text_sentiment.polarity":1,
#                                    "text_sentiment.polarity_confidence":1,  #'select' clause - "_id":0 remove id field
#                                    "text_sentiment.subjectivity":1,
#                                    "text_sentiment.subjectivity_confidence":1,
#                                    "_id":0}


    # print every matching record (user_location and lat_lon_loc only)
    for t in result:
#         pp.pprint(t['text_sentiment']['polarity'])
#         pp.pprint(t['user_location'])
        pp.pprint(t)


# Query Library

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

    
#query only
if __name__ == '__main__':

    db = get_db('twitter01', 'tweets_census')

    # active query: text of every record whose text is not the string "null"
    result = db.tweets_census.find(
                                  {"text":{"$ne" : "null"}} ,                #'where' clause
                                  {"text":1,                                  #'select' clause - "_id":0 remove id field
                                   "_id":0})

    # --- alternative queries: uncomment ONE to replace the active query above ---

#check to see if text sentiment field is null
#     result = db.tweets_census.find(
#                                     {"text_sentiment":{"$eq" : "null"} },                #'where' clause
#                                     {"text_sentiment":1,                                  #'select' clause - "_id":0 remove id field
#                                        "_id":0}
#                                      )
# (the closing bracket above used to be left uncommented, which was a syntax error)

#return fields where text sentiment fields exists
#     result = db.tweets_census.find(
#                                   {"text_sentiment":{"$exists" : 1}} ,                #'where' clause
#                                   {"text_sentiment":1,                                  #'select' clause - "_id":0 remove id field
#                                    "_id":0})

#return contents of text sentiment fields
#     result = db.tweets_census.find(
#                                   {"text_sentiment":{"$exists" : 1}} ,                #'where' clause
#                                   {"text_sentiment.polarity":1,
#                                    "text_sentiment.polarity_confidence":1,  #'select' clause - "_id":0 remove id field
#                                    "text_sentiment.subjectivity":1,
#                                    "text_sentiment.subjectivity_confidence":1,
#                                    "_id":0})


#     result = db.tweets_census.find(
#                                   {"text_sentiment":{"$exists" : 1}} ,                #'where' clause
#                                   {"text_sentiment":1,
#                                    "_id":0})

    #build list for text analysis
#     tweet_text = []

    for t in result:
#         tweet_text.append(t["text"])
        pp.pprint(t)


# Aggregation Library

In [30]:
## LESSON 5
from pymongo import MongoClient

#aggregate uses piping operator ie stages


def tweet_text():
    """Run a match/sort/limit aggregation on tweets_census and return the result.

    NOTE(review): the "$match" stage below has the same body as the "$group"
    stage of most_tweet(). "$sum" is an accumulator rather than a query
    operator, so this pipeline is expected to be rejected by the server -
    confirm what this function was meant to select before using it.
    """
    result = db.tweets_census.aggregate([
                                  { "$match": {"_id" : '$user_screen_name',
                                               "count" : {"$sum" : 1}}},
                                  {"$sort" : {"count" : -1}},
                                  {"$limit" : 10}
                                ])
    return result



def most_tweet():
    """Return the ten screen names with the most stored tweets.

    Groups tweets_census by user_screen_name, counts each group, sorts by
    count descending and keeps the first ten.
    """
    pipeline = [
        {"$group": {"_id" : '$user_screen_name',
                    "count" : {"$sum" : 1}}},
        {"$sort" : {"count" : -1}},
        {"$limit" : 10},
    ]
    return db.tweets_census.aggregate(pipeline)
                                               
    
def highest_ratio():
    """Return the tweet record with the highest follower-to-friend ratio.

    Only records with more than zero friends and followers are considered
    (which also avoids dividing by zero). The single result carries the
    computed "ratio" and the "screen_name".

    Returns the aggregation result (previously it was computed and discarded).
    """
    result = db.tweets_census.aggregate([
                                  {"$match" : { "user_friend": {"$gt" : 0},
                                               "user_follower": {"$gt" : 0}}},
                                  {"$project" : {"ratio" : {"$divide" :["$user_follower",
                                                                        "$user_friend"]},
                                                 "screen_name": "$user_screen_name"}},
                                  {"$sort": {"ratio": -1}},
                                  {"$limit": 1}
                                ])
    return result

    
def unique_hashtags_byuser():
    """Return the set of distinct hashtags used by each screen name.

    Unwinds the ent_hashtag array, groups by user_screen_name and collects
    the distinct hashtag texts; sorted by screen name, descending.

    Returns the aggregation result (previously it was computed and discarded).
    """
    result = db.tweets_census.aggregate([
                                  {"$unwind" : "$ent_hashtag"},
                                  {"$group" : {"_id" : "$user_screen_name",
                                               "unique_hashtags": {
                                                                   "$addToSet":"$ent_hashtag.text"}}},
                                  {"$sort": {"_id": -1}}
                                ])
    return result

def user_mentions():
    """Return the screen name whose tweets contain the most user mentions.

    Unwinds the ent_user_mention array (one row per mention), groups the rows
    by the tweeting user's screen name and keeps the largest count.

    Returns the aggregation result (previously it was computed and discarded).
    """
    result = db.tweets_census.aggregate([
                                  {"$unwind" : "$ent_user_mention"},
                                  # stored records hold the author as "user_screen_name";
                                  # there is no top-level "screen_name" field to group on
                                  {"$group" : {"_id" : "$user_screen_name",
                                               "count": {"$sum":1}}},
                                  {"$sort": {"count": -1}},
                                  {"$limit": 1}
                                ])
    return result
    
 
    
def unique_user_mentions():
    """Return the ten screen names that mention the most distinct users.

    Unwinds the ent_user_mention array, collects for each tweeting user the
    set of screen names they mentioned, then counts the members of each set.

    Returns the aggregation result (previously it was computed and discarded).
    """
    result = db.tweets_census.aggregate([
                                  {"$unwind" : "$ent_user_mention"},
                                  # stored records hold the author as "user_screen_name" and the
                                  # mentioned account inside each "ent_user_mention" entry;
                                  # there is no top-level "screen_name" field
                                  {"$group" : {
                                               "_id" : "$user_screen_name",
                                               "mset": {
                                                        "$addToSet":"$ent_user_mention.screen_name"}}},
                                  {"$unwind" : "$mset"},
                                  {"$group" : {"_id": "$_id", "count" : {"$sum" : 1}}},
                                  {"$sort": {"count": -1}},
                                  {"$limit" : 10}
                                ])
    return result



# Topic Modelling

In [87]:
from pymongo import MongoClient 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



#MongoDB connection
client = MongoClient('YOUR_INFO')
db = client.twitter01
collection = db.tweets_census


# CHANGE PARAMETERS AS REQUIRED
n_samples, n_features = 100, 1000
n_topics, n_top_words = 10, 20


# create sequence of strings to feed into SKlearn Topic Model
# GSR_text = myDF_GSR.select(['eventDescription_clean']).toPandas()
# GSR_text = list(chain.from_iterable(GSR_text.values.tolist()))

# fetch only the cleaned text of every record that has one
text_result = db.tweets_census.find(
    {"text_clean": {"$exists": 1}},      #'where' clause
    {"text_clean": 1, "_id": 0})         #'select' clause

#build list for text analysis
tweet_text = [t["text_clean"] for t in text_result]
    #pp.pprint(t)


# output human readable topic content
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    model         -- fitted model exposing ``components_`` (one weight row per topic)
    feature_names -- vocabulary, indexable by feature position
    n_top_words   -- number of words to print per topic
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards to get the largest weights first
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        words = [feature_names[pos] for pos in ranked]
        print(" ".join(words))
    print()
    


# create list of topic words for labelling
def list_top_words(model, feature_names, n_top_words):
    """Return the highest-weighted words of every topic as a list of lists.

    model         -- fitted model exposing ``components_`` (one weight row per topic)
    feature_names -- vocabulary, indexable by feature position
    n_top_words   -- number of words to keep per topic

    Returns one list per topic, each ordered from highest to lowest weight.
    """
    # argsort is ascending, so the reversed slice yields the largest weights first
    return [
        [feature_names[pos] for pos in weights.argsort()[:-n_top_words - 1:-1]]
        for weights in model.components_
    ]


    
# MODEL ONE - NMF
# Builds two bag-of-words representations of tweet_text (tf-idf for NMF, raw
# counts for LDA), fits one topic model on each and prints the top words.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(input='content',
                                   #max_df=0.95, 
                                   #min_df=0.1, 
                                   max_features=n_features,
                                   stop_words='english')


tfidf = tfidf_vectorizer.fit_transform(tweet_text)


print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(input='content',
                                #max_df=0.95, 
                                #min_df=2, 
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(tweet_text)

# NOTE(review): n_samples is only printed here; both vectorizers above are
# fitted on the whole of tweet_text, so the printed sample size may not match
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

nmf = NMF(n_components=n_topics, 
          random_state=1, 
          alpha=.1, 
          l1_ratio=.5).fit(tfidf)

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, 
                tfidf_feature_names, 
                n_top_words)


# MODEL 2: LDA
# NOTE(review): n_samples is only printed here as well (see the note above)
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# NOTE(review): the keyword names used below (n_topics, and alpha / get_feature_names
# above) depend on the installed scikit-learn version - confirm before upgrading
lda = LatentDirichletAllocation(n_topics=n_topics, 
                                max_iter=5,
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0).fit(tf)

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, 
                tf_feature_names, 
                n_top_words)



# build list of lists with top n topic words - used in labelling functions to follow
# (taken from the LDA model only)
topic_words = list_top_words(lda, 
                tf_feature_names, 
                n_top_words)


Extracting tf-idf features for NMF...
Extracting tf features for LDA...
Fitting the NMF model with tf-idf features,n_samples=100 and n_features=1000...

Topics in NMF model:
Topic #0:
matter lives joevargas yiannopoulos milo simple movement destroys black statistics facts ive hirap houdinilike horse hope japan homelessness homeless holder
Topic #1:
jumbo tune september chhattisgarhodishaampwbengal mpparimal jharkhandk india elephants census rt hfmasynergy homelessness http htt ht housing house hours hell houdinilike
Topic #2:
claim harassing completing canadians canada mdroletglobaltv cdnpoli statistics census htt ht housing horse http https idea house hours houdinilike homeless
Topic #3:
emoji game dm im lol use needs send darkskinchris describes used tweet favorite face thing saying available yazarsn yarasky playing
Topic #4:
shortest links statistics pays money make houdinilike horse hope hit homelessness hours homeless house holder housing hits zozj hispanic hispanics
Topic #5:
rt 