# Collect Tweets into MongoDB

## Install Python libraries

You may need to restart your Jupyter Notebook kernel after installing these libraries.

In [1]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-3.11.2-cp36-cp36m-manylinux2014_x86_64.whl (509 kB)
[K     |████████████████████████████████| 509 kB 11.8 MB/s eta 0:00:01
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.11.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install pymongo[srv]

Collecting dnspython<2.0.0,>=1.16.0; extra == "srv"
  Downloading dnspython-1.16.0-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 12.7 MB/s eta 0:00:01
[?25hInstalling collected packages: dnspython
Successfully installed dnspython-1.16.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
!pip install dnspython

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-3.9.0-py2.py3-none-any.whl (30 kB)
Collecting requests-oauthlib>=0.7.0
  Downloading requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Downloading oauthlib-3.1.0-py2.py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 18.5 MB/s eta 0:00:01
Installing collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-3.1.0 requests-oauthlib-1.3.0 tweepy-3.9.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install twitter

Collecting twitter
  Downloading twitter-1.18.0-py2.py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 4.0 MB/s  eta 0:00:01
[?25hInstalling collected packages: twitter
Successfully installed twitter-1.18.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


## Import Python libraries

In [6]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser
import pandas as pd

##  Load the Authorization Info

Save the database connection info and API keys in a `config.ini` file and use `configparser` to load the authorization info.

In [7]:
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Check that list so a missing config.ini fails with a clear
# message instead of a confusing KeyError on the section lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found (or could not be read) in the working directory")

# Twitter API credentials, read from the [mytwitter] section.
CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secret']
OAUTH_TOKEN       = config['mytwitter']['access_token']
# NOTE: spelled OATH_ (not OAUTH_); the authorization cells use this exact name, so it is kept.
OATH_TOKEN_SECRET = config['mytwitter']['access_secret']

# MongoDB connection string, read from the [mymongo] section.
mongod_connect = config['mymongo']['connection']

## Connect to the MongoDB Cluster

In [8]:
# Open the cluster connection with the connection string loaded from config.ini.
client = MongoClient(mongod_connect)
# Use (or create) the database named "gp29" and, inside it, the collection
# named "tweet_collection".
db = client["gp29"]
tweet_collection = db["tweet_collection"]
# Unique ascending index on the tweet "id" field so the same tweet is never stored twice.
tweet_collection.create_index([("id", pymongo.ASCENDING)], unique=True)

'id_1'

## Use the Streaming API to Collect Tweets

Authorize the Stream API 

In [9]:
# Build the OAuth handler from the consumer key/secret loaded from config.ini.
stream_auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# Attach the user access token and its secret to the handler.
stream_auth.set_access_token(OAUTH_TOKEN, OATH_TOKEN_SECRET)

# NOTE: `strem_api` (sic) — the misspelled name is read again when the stream is created, so it is kept as is.
strem_api = tweepy.API(stream_auth)

Define the query for the Stream API

In [10]:
track = ['COVID19'] # keywords to track: tweets that contain 'COVID19'

locations = [-78.9326449,38.4150904,-78.8816972,38.4450731] # bounding box for the location filter, presumably [west_lon, south_lat, east_lon, north_lat]; these coordinates are around Harrisonburg, VA (not Columbia, SC) -- TODO confirm

The collected tweets will contain 'COVID19' <span style="color:red;font-weight:bold"> OR </span> are located in Harrisonburg, VA

In [11]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that stores every received tweet in ``tweet_collection``."""

    def on_status(self, status):
        """Print the tweet id and insert the tweet's raw JSON into MongoDB.

        A tweet rejected as a duplicate key is skipped silently. Any other
        database error is reported and the stream keeps running. Exceptions
        that are not database errors (for example KeyboardInterrupt) are no
        longer swallowed.
        """
        print (status.id_str)
        try:
            tweet_collection.insert_one(status._json)
        except pymongo.errors.DuplicateKeyError:
            # This tweet is already stored; nothing to do.
            pass
        except pymongo.errors.PyMongoError as exc:
            # Best effort: report the failure but do not stop collecting.
            print('insert failed for tweet', status.id_str, ':', exc)

    def on_error(self, status_code):
        """Disconnect the stream on HTTP 420; return None for any other status code."""
        if status_code == 420:
            # Returning False in on_error disconnects the stream.
            return False
# Create the listener and a stream that authenticates with the handler held by `strem_api`.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = strem_api.auth, listener=myStreamListener)
# Start filtering by keyword; to filter by location instead, pass locations=locations.
# NOTE(review): this call appears to run until it is interrupted (the recorded run ends with KeyboardInterrupt).
myStream.filter(track=track)#  (locations = locations)   #Use either track or locations

1337081049046769668
1337081049248129030
1337081049784967168
1337081049994686467
1337081050217000960
1337081049877274624
1337081050133049352
1337081050695163907
1337081051823353856
1337081050984493059
1337081052146372611
1337081051789737987
1337081053375328256
1337081054449061890
1337081054939779073
1337081055069794308
1337081055749283842
1337081056089038849
1337081056126590976
1337081056709783552
1337081057724796929
1337081058249076737
1337081057980583936
1337081058702090243
1337081058462928896
1337081058832027649
1337081060144898049
1337081061080256513
1337081061264723973
1337081062707634178
1337081063185805312
1337081063261265924
1337081063793889281
1337081064217595917
1337081064737665024
1337081064913825794
1337081065140350976
1337081066558005251
1337081066427994113
1337081067036139524
1337081067942125576
1337081068227338240
1337081068499988480
1337081069066215429
1337081069145784321
1337081069208649729
1337081069724688386
1337081070181748736
1337081070513172482
1337081070832017408


KeyboardInterrupt: 

## Use the REST API to Collect Tweets

Authorize the REST API 

In [12]:
# Arguments are passed in this order: access token, access token secret, consumer key, consumer secret.
rest_auth = twitter.oauth.OAuth(OAUTH_TOKEN,OATH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
# Client object used for the REST search calls below.
rest_api = twitter.Twitter(auth=rest_auth)

Define the query for the REST API

In [13]:
count = 100 # number of tweets requested per call; NOTE(review): 100 is believed to be the API maximum and the default is believed to be lower -- confirm against the search API reference
geocode = "33.9303881,-81.7697333,100mi"  # define the location as "latitude,longitude,radius", near Columbia, SC
q = "COVID19"                               # define the keywords: tweets that contain 'COVID19'

The collected tweets will contain 'COVID19' <span style="color:red;font-weight:bold"> AND </span> are located within 100 miles of the geocode point near Columbia, SC

In [14]:
# Fetch the first page of results; q and geocode can be combined in one query.
search_results = rest_api.search.tweets(count=count, q=q, geocode=geocode)
statuses = search_results["statuses"]
if statuses:
    # The id of the last tweet on the page is the starting point for fetching earlier tweets.
    since_id_new = statuses[-1]['id']
else:
    # An empty result used to raise IndexError on statuses[-1]. Use 0 instead so
    # the follow-up pagination loop (which starts from since_id_old = 0) does nothing.
    since_id_new = 0
    print('No tweets matched the query.')
for statuse in statuses:
    try:
        tweet_collection.insert_one(statuse)
    except pymongo.errors.DuplicateKeyError:
        # Already stored (unique index on "id"); skip it without printing.
        continue
    except pymongo.errors.PyMongoError as exc:
        # Best effort: report the failure and move on to the next tweet.
        print('insert failed for tweet', statuse.get('id'), ':', exc)
        continue
    pprint(statuse['created_at'])  # print the date of each newly stored tweet

'Thu Dec 10 17:05:21 +0000 2020'
'Thu Dec 10 17:04:41 +0000 2020'
'Thu Dec 10 17:04:17 +0000 2020'
'Thu Dec 10 17:04:11 +0000 2020'
'Thu Dec 10 17:03:38 +0000 2020'
'Thu Dec 10 17:00:01 +0000 2020'
'Thu Dec 10 16:59:48 +0000 2020'
'Thu Dec 10 16:58:53 +0000 2020'
'Thu Dec 10 16:56:56 +0000 2020'
'Thu Dec 10 16:55:14 +0000 2020'
'Thu Dec 10 16:52:42 +0000 2020'
'Thu Dec 10 16:52:24 +0000 2020'
'Thu Dec 10 16:52:09 +0000 2020'
'Thu Dec 10 16:51:16 +0000 2020'
'Thu Dec 10 16:50:26 +0000 2020'
'Thu Dec 10 16:49:42 +0000 2020'
'Thu Dec 10 16:49:30 +0000 2020'
'Thu Dec 10 16:48:43 +0000 2020'
'Thu Dec 10 16:48:07 +0000 2020'
'Thu Dec 10 16:45:22 +0000 2020'
'Thu Dec 10 16:45:02 +0000 2020'
'Thu Dec 10 16:42:56 +0000 2020'
'Thu Dec 10 16:36:50 +0000 2020'
'Thu Dec 10 16:34:49 +0000 2020'
'Thu Dec 10 16:31:11 +0000 2020'
'Thu Dec 10 16:30:24 +0000 2020'
'Thu Dec 10 16:29:49 +0000 2020'
'Thu Dec 10 16:28:33 +0000 2020'
'Thu Dec 10 16:24:43 +0000 2020'
'Thu Dec 10 16:20:55 +0000 2020'
'Thu Dec 1

Continue fetching earlier tweets with the same query.
<p><span style="color:red;font-weight:bold">YOU WILL REACH YOUR RATE LIMIT VERY FAST</span></p>

In [15]:
# Walk backwards through the search results, one page per iteration.
# since_id_new holds the id of the oldest tweet seen so far; despite the
# "since_id" name, it is used as the upper bound (max_id) of the next request.
since_id_old = 0
while since_id_new != since_id_old:
    since_id_old = since_id_new
    # max_id is inclusive, so subtract 1 to avoid fetching the boundary tweet
    # again on every page.
    search_results = rest_api.search.tweets(count=count, q=q,
                        geocode=geocode, max_id=since_id_new - 1)
    statuses = search_results["statuses"]
    if not statuses:
        # No earlier tweets are left; statuses[-1] would raise IndexError here.
        break
    since_id_new = statuses[-1]['id']
    for statuse in statuses:
        try:
            tweet_collection.insert_one(statuse)
        except pymongo.errors.DuplicateKeyError:
            # Already stored (unique index on "id"); skip it without printing.
            continue
        except pymongo.errors.PyMongoError as exc:
            # Best effort: report the failure and move on to the next tweet.
            print('insert failed for tweet', statuse.get('id'), ':', exc)
            continue
        pprint(statuse['created_at'])  # print the date of each newly stored tweet

'Thu Dec 10 14:35:49 +0000 2020'
'Thu Dec 10 14:34:57 +0000 2020'
'Thu Dec 10 14:34:48 +0000 2020'
'Thu Dec 10 14:34:44 +0000 2020'
'Thu Dec 10 14:32:51 +0000 2020'
'Thu Dec 10 14:31:34 +0000 2020'
'Thu Dec 10 14:30:19 +0000 2020'
'Thu Dec 10 14:30:06 +0000 2020'
'Thu Dec 10 14:29:02 +0000 2020'
'Thu Dec 10 14:24:39 +0000 2020'
'Thu Dec 10 14:23:59 +0000 2020'
'Thu Dec 10 14:22:46 +0000 2020'
'Thu Dec 10 14:21:13 +0000 2020'
'Thu Dec 10 14:19:34 +0000 2020'
'Thu Dec 10 14:16:24 +0000 2020'
'Thu Dec 10 14:15:11 +0000 2020'
'Thu Dec 10 14:14:01 +0000 2020'
'Thu Dec 10 14:11:59 +0000 2020'
'Thu Dec 10 14:10:56 +0000 2020'
'Thu Dec 10 14:10:13 +0000 2020'
'Thu Dec 10 14:10:08 +0000 2020'
'Thu Dec 10 14:09:55 +0000 2020'
'Thu Dec 10 14:09:30 +0000 2020'
'Thu Dec 10 14:09:29 +0000 2020'
'Thu Dec 10 14:09:15 +0000 2020'
'Thu Dec 10 14:06:44 +0000 2020'
'Thu Dec 10 14:06:06 +0000 2020'
'Thu Dec 10 14:02:31 +0000 2020'
'Thu Dec 10 14:02:08 +0000 2020'
'Thu Dec 10 14:00:12 +0000 2020'
'Thu Dec 1

'Thu Dec 10 01:51:35 +0000 2020'
'Thu Dec 10 01:46:54 +0000 2020'
'Thu Dec 10 01:46:50 +0000 2020'
'Thu Dec 10 01:42:52 +0000 2020'
'Thu Dec 10 01:41:38 +0000 2020'
'Thu Dec 10 01:40:59 +0000 2020'
'Thu Dec 10 01:35:52 +0000 2020'
'Thu Dec 10 01:32:30 +0000 2020'
'Thu Dec 10 01:32:17 +0000 2020'
'Thu Dec 10 01:31:21 +0000 2020'
'Thu Dec 10 01:30:44 +0000 2020'
'Thu Dec 10 01:28:44 +0000 2020'
'Thu Dec 10 01:27:50 +0000 2020'
'Thu Dec 10 01:21:55 +0000 2020'
'Thu Dec 10 01:20:44 +0000 2020'
'Thu Dec 10 01:15:19 +0000 2020'
'Thu Dec 10 01:13:33 +0000 2020'
'Thu Dec 10 01:12:17 +0000 2020'
'Thu Dec 10 01:07:38 +0000 2020'
'Thu Dec 10 01:06:33 +0000 2020'
'Thu Dec 10 01:04:24 +0000 2020'
'Thu Dec 10 01:01:34 +0000 2020'
'Thu Dec 10 01:00:13 +0000 2020'
'Thu Dec 10 00:59:26 +0000 2020'
'Thu Dec 10 00:54:35 +0000 2020'
'Thu Dec 10 00:53:35 +0000 2020'
'Thu Dec 10 00:50:02 +0000 2020'
'Thu Dec 10 00:47:38 +0000 2020'
'Thu Dec 10 00:44:19 +0000 2020'
'Thu Dec 10 00:43:52 +0000 2020'
'Thu Dec 1

KeyboardInterrupt: 

## View the Collected Tweets

Print the number of tweets and unique twitter users

In [16]:
print(tweet_collection.estimated_document_count())# number of tweets collected (an estimate, as the method name says)

# Despite its name, user_cursor is used as a sized collection of distinct "user.id" values: len() is applied to it below.
user_cursor = tweet_collection.distinct("user.id")
print (len(user_cursor)) # number of unique Twitter users

216696
175266


Create a text index and print the Tweets containing specific keywords. 

In [17]:
# Text index on the "text" field, named 'text_index'; the $text keyword queries that follow rely on it.
tweet_collection.create_index([("text", pymongo.TEXT)], name='text_index', default_language='english') # create a text index


'text_index'

Create a cursor to query tweets with the created index

In [18]:
tweet_cursor = tweet_collection.find({"$text": {"$search": "vote"}}) # cursor over the tweets whose text matches the search term "vote"

Use pprint to display tweets

In [19]:

for document in tweet_cursor[0:10]: # display the first 10 tweets from the query
    print ('----')
    try:
        print ('name:', document["user"]["name"]) # user name
        print ('text:', document["text"])         # tweet text
    except UnicodeEncodeError:
        # The output stream could not encode a character of the name or text.
        print ("***error in encoding")
    except KeyError as missing:
        # Report a missing field as such, rather than mislabelling it as an encoding problem.
        print ("***missing field:", missing)

----
name: Jane West
text: Legal votes- not all votes just the legal votes.
----
name: Hugo
text: Vote by vote, box by box! ✊✊✊
----
name: Levigaming
text: @Rumrunner11 @dkm14 @JoeBiden @KamalaHarris Not popular votes. That's people's votes counted for the election not the popular vote
----
name: Danny
text: @BillOReilly No
Clinton got more votes in 2016, and Biden got more votes in this election. By more than 4 million votes.
----
name: JamesonLaw
text: RT @JamesonHalpern: What about all the sudden single vote ballots, ALL for Biden?  What about he reversal of 6,000 hacked votes from voting…
----
name: #AskWizkid #Election2020
text: RT @SierraRae1316: Like Obama said "voting is not about only voting for perfect person, it's about voting for a step forward."

Remember ne…
----
name: sarah8888
text: RT @vicksiern: @guypbenson WHO DO YOU THINK WON THE ELECTION WITH "LEGAL VOTES" not "illegal votes and corrupt software in voting machines…
----
name: Miss Mam
text: RT @vicksiern: @guypbens

In [20]:
tweet_cursor = tweet_collection.find({"$text": {"$search": "vote"}}) # re-create the cursor over tweets matching "vote"; presumably needed because the earlier display loop already sliced/consumed the previous cursor -- TODO confirm

Use pandas to display tweets

In [21]:
# Materialize every document the cursor yields and build one DataFrame row per tweet.
tweet_df = pd.DataFrame(data=list(tweet_cursor))
# Show only the first 10 rows.
tweet_df.head(10)

Unnamed: 0,_id,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,...,filter_level,lang,timestamp_ms,display_text_range,retweeted_status,extended_tweet,possibly_sensitive,extended_entities,withheld_in_countries,metadata
0,5fa6edf5f7fcb3d0c96b6ab8,Sat Nov 07 18:56:47 +0000 2020,1325150222591406082,1325150222591406082,Legal votes- not all votes just the legal votes.,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,...,low,en,1604775407823,,,,,,,
1,5fa6f2d7f7fcb3d0c96c5eed,Sat Nov 07 19:17:38 +0000 2020,1325155469095153664,1325155469095153664,"Vote by vote, box by box! ✊✊✊","<a href=""http://twitter.com/download/android"" ...",False,,,,...,low,en,1604776658687,,,,,,,
2,5fa6f45cf7fcb3d0c96caac6,Sat Nov 07 19:24:06 +0000 2020,1325157096866738176,1325157096866738176,@Rumrunner11 @dkm14 @JoeBiden @KamalaHarris No...,"<a href=""http://twitter.com/download/android"" ...",False,1.32515e+18,1.32514984607701e+18,39539485.0,...,low,en,1604777046778,"[44, 130]",,,,,,
3,5fa6ef93f7fcb3d0c96bbb7c,Sat Nov 07 19:03:41 +0000 2020,1325151958659821568,1325151958659821568,@BillOReilly No\nClinton got more votes in 201...,"<a href=""http://twitter.com/download/iphone"" r...",False,1.325122e+18,1.3251216526156513e+18,23970102.0,...,low,en,1604775821734,"[13, 120]",,,,,,
4,5fa6eeeaf7fcb3d0c96b9a96,Sat Nov 07 19:00:52 +0000 2020,1325151250342539265,1325151250342539265,RT @JamesonHalpern: What about all the sudden ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,...,low,en,1604775652858,,{'created_at': 'Sat Nov 07 18:55:07 +0000 2020...,,,,,
5,5fa6f629f7fcb3d0c96d04e8,Sat Nov 07 19:31:47 +0000 2020,1325159031183298562,1325159031183298562,"RT @SierraRae1316: Like Obama said ""voting is ...","<a href=""http://twitter.com/download/android"" ...",False,,,,...,low,en,1604777507955,,{'created_at': 'Sat Nov 07 19:27:21 +0000 2020...,,,,,
6,5fa6f1f1f7fcb3d0c96c31e4,Sat Nov 07 19:13:47 +0000 2020,1325154500601024514,1325154500601024514,RT @vicksiern: @guypbenson WHO DO YOU THINK WO...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,...,low,en,1604776427780,,{'created_at': 'Sat Nov 07 19:01:05 +0000 2020...,,,,,
7,5fa6f032f7fcb3d0c96bdab5,Sat Nov 07 19:06:21 +0000 2020,1325152629668909057,1325152629668909057,RT @vicksiern: @guypbenson WHO DO YOU THINK WO...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,...,low,en,1604775981715,,{'created_at': 'Sat Nov 07 19:01:05 +0000 2020...,,,,,
8,5fa6efccf7fcb3d0c96bc6b1,Sat Nov 07 19:04:38 +0000 2020,1325152198435639297,1325152198435639297,RT @LLinWood: Georgia vote will be a truthful ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,...,low,en,1604775878901,,{'created_at': 'Fri Nov 06 16:41:30 +0000 2020...,,,,,
9,5fa6ef88f7fcb3d0c96bb979,Sat Nov 07 19:03:31 +0000 2020,1325151916465115137,1325151916465115137,RT @vicksiern: @guypbenson WHO DO YOU THINK WO...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,...,low,en,1604775811674,,{'created_at': 'Sat Nov 07 19:01:05 +0000 2020...,,,,,


In [None]:
# Histogram of the favorite counts of the matched tweets. Series.hist() returns
# the matplotlib Axes, which is used to add a title and axis labels so the
# figure can be read on its own.
ax = tweet_df["favorite_count"].hist()
ax.set(title="Favorite counts of matched tweets", xlabel="favorite_count", ylabel="number of tweets");