# Collect Tweets into MongoDB

## Install Python libraries

You may need to restart your Jupyter Notebook kernel after installing these libraries.

In [1]:
!pip install pymongo

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install pymongo[srv]

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
!pip install dnspython

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
!pip install tweepy

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install twitter

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


## Import Python libraries

In [6]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser
import pandas as pd

##  Load the Authorization Info

Save the database connection info and API keys in a `config.ini` file, and use `configparser` to load the authorization info.

In [7]:
# Load the Twitter credentials and the MongoDB connection string from config.ini.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message instead of an
# opaque KeyError on the first section lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found (or is unreadable) in the working directory")

# Twitter API credentials, read from the [mytwitter] section.
CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secret']
OAUTH_TOKEN       = config['mytwitter']['access_token']
# NOTE: the name is spelled "OATH" (not "OAUTH"); it is kept as-is because the
# authorization cells below refer to it by this name.
OATH_TOKEN_SECRET = config['mytwitter']['access_secret']

# MongoDB connection string, read from the [mymongo] section.
mongod_connect = config['mymongo']['connection']

## Connect to the MongoDB Cluster

In [8]:
# Open the cluster connection with the connection string loaded from config.ini.
client = MongoClient(mongod_connect)
# Use (or create) the database named "gp19".
db = client["gp19"]
# Use (or create) the collection named "lab11"; all collected tweets are stored here.
final_collection = db["lab11"]
# Unique ascending index on the tweet "id" field so the same tweet cannot be stored twice.
final_collection.create_index([("id", pymongo.ASCENDING)], unique=True)

'id_1'

## Use the Streaming API to Collect Tweets

Authorize the Stream API 

In [None]:
# Build the OAuth handler for the Streaming API from the consumer key/secret,
# then attach the user's access token and token secret.
stream_auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
stream_auth.set_access_token(OAUTH_TOKEN, OATH_TOKEN_SECRET)

# NOTE: the variable is spelled "strem_api"; the stream-creation cell reads
# strem_api.auth, so the name must stay in sync with that cell.
strem_api = tweepy.API(stream_auth)

Define the query for the Stream API

In [None]:
track = ['election'] # keywords to track: tweets containing "election"

# Bounding box for Harrisonburg, VA.
# Presumably ordered [west_lon, south_lat, east_lon, north_lat] — confirm against the Streaming API docs.
locations = [-78.9326449,38.4150904,-78.8816972,38.4450731] # define the location, in Harrisonburg, VA

The collected tweets will contain 'election' <span style="color:red;font-weight:bold"> OR </span> are located in Harrisonburg, VA

In [None]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that stores each received tweet in ``final_collection``."""

    def on_status(self, status):
        """Print the tweet id and insert the raw tweet JSON into MongoDB.

        A tweet whose ``id`` is already stored is skipped silently: the unique
        index on ``id`` rejects it with ``DuplicateKeyError``. Any other
        database error is reported but does not stop the stream.
        """
        print (status.id_str)
        try:
            # The collection created in the connection cell is named
            # final_collection; the previous name (tweet_collection) was never
            # defined, so every insert failed and was silently swallowed.
            final_collection.insert_one(status._json)
        except pymongo.errors.DuplicateKeyError:
            # Already collected; nothing to do.
            pass
        except pymongo.errors.PyMongoError as exc:
            # Keep streaming, but make the failure visible.
            print ('could not store tweet', status.id_str, '-', exc)

    def on_error(self, status_code):
        """Disconnect the stream when Twitter answers with status code 420.

        Returns ``False`` for 420; for any other status code nothing is
        returned, leaving the decision to tweepy's default handling.
        """
        if status_code == 420:
            # returning False in on_error disconnects the stream
            return False
# Create the listener and a stream that reuses the credentials held by strem_api.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = strem_api.auth, listener=myStreamListener)
# Only the keyword filter is active; the location filter is left as a commented-out alternative.
myStream.filter(track=track)#  (locations = locations)   # use either track or locations

## Use the REST API to Collect Tweets

Authorize the REST API 

In [9]:
# Build the OAuth credentials for the REST API. Note the argument order used
# here: access token, access token secret, consumer key, consumer secret.
rest_auth = twitter.oauth.OAuth(OAUTH_TOKEN,OATH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
# REST client used by the search cells below.
rest_api = twitter.Twitter(auth=rest_auth)

Define the query for the REST API

In [10]:
count = 100 # number of tweets requested per call (original note: default and max is 100 — verify against the API docs)
# Search area as "latitude,longitude,radius".
# NOTE(review): these coordinates do not match the Harrisonburg, VA bounding box
# used in the streaming section (about 38.4, -78.9) — confirm the intended location.
geocode = "35.46984,-97.533904,100mi"  # define the search location and radius
q = "COVID-19"                               # define the keywords: tweets containing "COVID-19"

The collected tweets will contain the keyword defined in `q` <span style="color:red;font-weight:bold"> AND </span> be located within the area defined by `geocode`

In [11]:
# Fetch the first (most recent) page of tweets matching both q and geocode.
search_results = rest_api.search.tweets( count=count,q=q, geocode=geocode) #you can use both q and geocode
statuses = search_results["statuses"]
# Remember the id of the last tweet on this page; the pagination cell continues
# from it. When nothing matched, fall back to 0, which equals the starting
# value of since_id_old in the pagination cell, so that loop is skipped
# instead of this cell failing with an IndexError.
since_id_new = statuses[-1]['id'] if statuses else 0
for tweet in statuses:
    try:
        # Insert the individual tweet. Passing the whole `statuses` list here
        # made insert_one raise on every iteration, so nothing was stored.
        final_collection.insert_one(tweet)
        pprint(tweet['created_at'])# print the date of the collected tweets
    except pymongo.errors.DuplicateKeyError:
        # The unique index on "id" rejects tweets that were already collected.
        pass

Continue fetching earlier tweets with the same query.
<p><span style="color:red;font-weight:bold">YOU WILL REACH YOUR RATE LIMIT VERY FAST</span></p>

In [12]:
# Page backwards through older tweets for the same query.
# Despite their names, since_id_old / since_id_new hold the id of the last
# tweet of the previous / current page and are used as the max_id boundary.
since_id_old = 0
while(since_id_new != since_id_old):
    since_id_old = since_id_new
    # max_id is inclusive, so ask for ids strictly below the last tweet already
    # collected; otherwise every page re-fetches that boundary tweet.
    search_results = rest_api.search.tweets( count=count,q=q,
                        geocode=geocode, max_id= since_id_new - 1)
    statuses = search_results["statuses"]
    if not statuses:
        # No older tweets are available: stop instead of indexing an empty list.
        break
    since_id_new = statuses[-1]['id']
    for tweet in statuses:
        try:
            final_collection.insert_one(tweet)
            pprint(tweet['created_at']) # print the date of the collected tweets
        except pymongo.errors.DuplicateKeyError:
            # The unique index on "id" rejects tweets that were already collected.
            pass

'Thu Dec 17 19:19:27 +0000 2020'
'Thu Dec 17 19:18:28 +0000 2020'
'Thu Dec 17 19:17:13 +0000 2020'
'Thu Dec 17 19:16:09 +0000 2020'
'Thu Dec 17 19:15:31 +0000 2020'
'Thu Dec 17 19:11:15 +0000 2020'
'Thu Dec 17 19:08:29 +0000 2020'
'Thu Dec 17 19:01:56 +0000 2020'
'Thu Dec 17 19:00:55 +0000 2020'
'Thu Dec 17 19:00:01 +0000 2020'
'Thu Dec 17 19:00:00 +0000 2020'
'Thu Dec 17 18:59:43 +0000 2020'
'Thu Dec 17 18:56:49 +0000 2020'
'Thu Dec 17 18:55:49 +0000 2020'
'Thu Dec 17 18:54:40 +0000 2020'
'Thu Dec 17 18:53:46 +0000 2020'
'Thu Dec 17 18:51:58 +0000 2020'
'Thu Dec 17 18:50:38 +0000 2020'
'Thu Dec 17 18:49:54 +0000 2020'
'Thu Dec 17 18:47:05 +0000 2020'
'Thu Dec 17 18:46:35 +0000 2020'
'Thu Dec 17 18:46:04 +0000 2020'
'Thu Dec 17 18:44:54 +0000 2020'
'Thu Dec 17 18:44:35 +0000 2020'
'Thu Dec 17 18:42:55 +0000 2020'
'Thu Dec 17 18:42:03 +0000 2020'
'Thu Dec 17 18:41:23 +0000 2020'
'Thu Dec 17 18:40:00 +0000 2020'
'Thu Dec 17 18:39:18 +0000 2020'
'Thu Dec 17 18:38:07 +0000 2020'
'Thu Dec 1

'Thu Dec 17 12:27:32 +0000 2020'
'Thu Dec 17 12:22:47 +0000 2020'
'Thu Dec 17 12:22:46 +0000 2020'
'Thu Dec 17 12:21:55 +0000 2020'
'Thu Dec 17 12:20:57 +0000 2020'
'Thu Dec 17 12:17:19 +0000 2020'
'Thu Dec 17 12:14:13 +0000 2020'
'Thu Dec 17 12:13:51 +0000 2020'
'Thu Dec 17 12:10:18 +0000 2020'
'Thu Dec 17 11:58:57 +0000 2020'
'Thu Dec 17 11:55:29 +0000 2020'
'Thu Dec 17 11:55:11 +0000 2020'
'Thu Dec 17 11:55:09 +0000 2020'
'Thu Dec 17 11:54:25 +0000 2020'
'Thu Dec 17 11:53:35 +0000 2020'
'Thu Dec 17 11:39:37 +0000 2020'
'Thu Dec 17 11:38:17 +0000 2020'
'Thu Dec 17 11:30:05 +0000 2020'
'Thu Dec 17 11:26:57 +0000 2020'
'Thu Dec 17 11:24:43 +0000 2020'
'Thu Dec 17 11:17:53 +0000 2020'
'Thu Dec 17 11:15:00 +0000 2020'
'Thu Dec 17 11:12:38 +0000 2020'
'Thu Dec 17 11:12:38 +0000 2020'
'Thu Dec 17 11:08:10 +0000 2020'
'Thu Dec 17 11:00:01 +0000 2020'
'Thu Dec 17 10:50:00 +0000 2020'
'Thu Dec 17 10:47:27 +0000 2020'
'Thu Dec 17 10:35:42 +0000 2020'
'Thu Dec 17 10:16:25 +0000 2020'
'Thu Dec 1

'Wed Dec 16 22:11:59 +0000 2020'
'Wed Dec 16 22:10:52 +0000 2020'
'Wed Dec 16 22:06:12 +0000 2020'
'Wed Dec 16 22:05:36 +0000 2020'
'Wed Dec 16 22:03:06 +0000 2020'
'Wed Dec 16 22:01:10 +0000 2020'
'Wed Dec 16 22:01:09 +0000 2020'
'Wed Dec 16 22:01:03 +0000 2020'
'Wed Dec 16 22:00:00 +0000 2020'
'Wed Dec 16 21:58:05 +0000 2020'
'Wed Dec 16 21:52:55 +0000 2020'
'Wed Dec 16 21:52:21 +0000 2020'
'Wed Dec 16 21:52:14 +0000 2020'
'Wed Dec 16 21:52:02 +0000 2020'
'Wed Dec 16 21:51:30 +0000 2020'
'Wed Dec 16 21:51:20 +0000 2020'
'Wed Dec 16 21:49:22 +0000 2020'
'Wed Dec 16 21:44:15 +0000 2020'
'Wed Dec 16 21:44:06 +0000 2020'
'Wed Dec 16 21:42:56 +0000 2020'
'Wed Dec 16 21:42:50 +0000 2020'
'Wed Dec 16 21:42:42 +0000 2020'
'Wed Dec 16 21:42:21 +0000 2020'
'Wed Dec 16 21:40:25 +0000 2020'
'Wed Dec 16 21:39:52 +0000 2020'
'Wed Dec 16 21:39:25 +0000 2020'
'Wed Dec 16 21:38:27 +0000 2020'
'Wed Dec 16 21:36:09 +0000 2020'
'Wed Dec 16 21:32:45 +0000 2020'
'Wed Dec 16 21:30:28 +0000 2020'
'Wed Dec 1

'Wed Dec 16 01:55:34 +0000 2020'
'Wed Dec 16 01:39:17 +0000 2020'
'Wed Dec 16 01:34:44 +0000 2020'
'Wed Dec 16 01:32:06 +0000 2020'
'Wed Dec 16 01:31:55 +0000 2020'
'Wed Dec 16 01:28:00 +0000 2020'
'Wed Dec 16 01:26:37 +0000 2020'
'Wed Dec 16 01:18:05 +0000 2020'
'Wed Dec 16 01:18:03 +0000 2020'
'Wed Dec 16 01:16:02 +0000 2020'
'Wed Dec 16 01:07:26 +0000 2020'
'Wed Dec 16 01:02:26 +0000 2020'
'Wed Dec 16 01:01:03 +0000 2020'
'Wed Dec 16 00:51:39 +0000 2020'
'Wed Dec 16 00:49:54 +0000 2020'
'Wed Dec 16 00:49:42 +0000 2020'
'Wed Dec 16 00:45:35 +0000 2020'
'Wed Dec 16 00:44:21 +0000 2020'
'Wed Dec 16 00:35:29 +0000 2020'
'Wed Dec 16 00:34:52 +0000 2020'
'Wed Dec 16 00:32:20 +0000 2020'
'Wed Dec 16 00:30:18 +0000 2020'
'Wed Dec 16 00:24:32 +0000 2020'
'Wed Dec 16 00:21:43 +0000 2020'
'Wed Dec 16 00:20:35 +0000 2020'
'Wed Dec 16 00:18:35 +0000 2020'
'Wed Dec 16 00:17:42 +0000 2020'
'Wed Dec 16 00:17:13 +0000 2020'
'Wed Dec 16 00:16:19 +0000 2020'
'Wed Dec 16 00:16:13 +0000 2020'
'Wed Dec 1

'Tue Dec 15 16:56:28 +0000 2020'
'Tue Dec 15 16:54:49 +0000 2020'
'Tue Dec 15 16:54:32 +0000 2020'
'Tue Dec 15 16:53:29 +0000 2020'
'Tue Dec 15 16:52:57 +0000 2020'
'Tue Dec 15 16:47:54 +0000 2020'
'Tue Dec 15 16:47:30 +0000 2020'
'Tue Dec 15 16:45:42 +0000 2020'
'Tue Dec 15 16:42:53 +0000 2020'
'Tue Dec 15 16:40:02 +0000 2020'
'Tue Dec 15 16:38:58 +0000 2020'
'Tue Dec 15 16:37:47 +0000 2020'
'Tue Dec 15 16:36:43 +0000 2020'
'Tue Dec 15 16:33:08 +0000 2020'
'Tue Dec 15 16:30:00 +0000 2020'
'Tue Dec 15 16:30:00 +0000 2020'
'Tue Dec 15 16:29:44 +0000 2020'
'Tue Dec 15 16:29:36 +0000 2020'
'Tue Dec 15 16:28:05 +0000 2020'
'Tue Dec 15 16:26:25 +0000 2020'
'Tue Dec 15 16:19:17 +0000 2020'
'Tue Dec 15 16:18:34 +0000 2020'
'Tue Dec 15 16:16:23 +0000 2020'
'Tue Dec 15 16:15:15 +0000 2020'
'Tue Dec 15 16:12:41 +0000 2020'
'Tue Dec 15 16:12:06 +0000 2020'
'Tue Dec 15 16:12:04 +0000 2020'
'Tue Dec 15 16:10:23 +0000 2020'
'Tue Dec 15 16:09:02 +0000 2020'
'Tue Dec 15 16:07:29 +0000 2020'
'Tue Dec 1

'Tue Dec 15 00:52:36 +0000 2020'
'Tue Dec 15 00:49:25 +0000 2020'
'Tue Dec 15 00:48:56 +0000 2020'
'Tue Dec 15 00:48:35 +0000 2020'
'Tue Dec 15 00:47:10 +0000 2020'
'Tue Dec 15 00:47:07 +0000 2020'
'Tue Dec 15 00:46:41 +0000 2020'
'Tue Dec 15 00:46:33 +0000 2020'
'Tue Dec 15 00:46:08 +0000 2020'
'Tue Dec 15 00:45:40 +0000 2020'
'Tue Dec 15 00:42:22 +0000 2020'
'Tue Dec 15 00:40:00 +0000 2020'
'Tue Dec 15 00:39:06 +0000 2020'
'Tue Dec 15 00:39:05 +0000 2020'
'Tue Dec 15 00:30:00 +0000 2020'
'Tue Dec 15 00:29:29 +0000 2020'
'Tue Dec 15 00:28:14 +0000 2020'
'Tue Dec 15 00:26:04 +0000 2020'
'Tue Dec 15 00:24:43 +0000 2020'
'Tue Dec 15 00:17:52 +0000 2020'
'Tue Dec 15 00:16:21 +0000 2020'
'Tue Dec 15 00:16:17 +0000 2020'
'Tue Dec 15 00:08:13 +0000 2020'
'Tue Dec 15 00:05:48 +0000 2020'
'Tue Dec 15 00:04:24 +0000 2020'
'Tue Dec 15 00:03:36 +0000 2020'
'Mon Dec 14 23:59:07 +0000 2020'
'Mon Dec 14 23:58:12 +0000 2020'
'Mon Dec 14 23:58:02 +0000 2020'
'Mon Dec 14 23:55:32 +0000 2020'
'Mon Dec 1

'Mon Dec 14 19:44:34 +0000 2020'
'Mon Dec 14 19:44:27 +0000 2020'
'Mon Dec 14 19:44:12 +0000 2020'
'Mon Dec 14 19:44:12 +0000 2020'
'Mon Dec 14 19:43:53 +0000 2020'
'Mon Dec 14 19:43:03 +0000 2020'
'Mon Dec 14 19:42:56 +0000 2020'
'Mon Dec 14 19:42:36 +0000 2020'
'Mon Dec 14 19:42:29 +0000 2020'
'Mon Dec 14 19:42:18 +0000 2020'
'Mon Dec 14 19:42:10 +0000 2020'
'Mon Dec 14 19:41:42 +0000 2020'
'Mon Dec 14 19:41:02 +0000 2020'
'Mon Dec 14 19:40:59 +0000 2020'
'Mon Dec 14 19:40:21 +0000 2020'
'Mon Dec 14 19:40:07 +0000 2020'
'Mon Dec 14 19:37:59 +0000 2020'
'Mon Dec 14 19:36:56 +0000 2020'
'Mon Dec 14 19:34:46 +0000 2020'
'Mon Dec 14 19:34:08 +0000 2020'
'Mon Dec 14 19:34:02 +0000 2020'
'Mon Dec 14 19:33:31 +0000 2020'
'Mon Dec 14 19:32:27 +0000 2020'
'Mon Dec 14 19:31:28 +0000 2020'
'Mon Dec 14 19:28:41 +0000 2020'
'Mon Dec 14 19:28:31 +0000 2020'
'Mon Dec 14 19:28:04 +0000 2020'
'Mon Dec 14 19:26:23 +0000 2020'
'Mon Dec 14 19:25:22 +0000 2020'
'Mon Dec 14 19:24:54 +0000 2020'
'Mon Dec 1

KeyboardInterrupt: 

## View the Collected Tweets

Print the number of tweets and unique twitter users

In [14]:
# Estimated number of tweet documents stored in the collection.
tweet_total = final_collection.estimated_document_count()
print(tweet_total)

# Distinct values of the nested "user.id" field, i.e. one entry per Twitter user.
user_cursor = final_collection.distinct("user.id")
unique_user_total = len(user_cursor)
print(unique_user_total)

8721
5328


Create a text index and print the Tweets containing specific keywords. 

In [15]:
# Build an English-language text index named "text_index" on the tweet "text"
# field; the $text queries in the following cells rely on it.
final_collection.create_index([("text", pymongo.TEXT)], name='text_index', default_language='english') # create a text index


'text_index'

Create a cursor to query tweets with the created index

In [16]:
tweet_cursor = final_collection.find({"$text": {"$search": "covid19"}}) # cursor over tweets whose text matches "covid19"

Use pprint to display tweets

In [17]:

for document in tweet_cursor[0:10]: # display the first 10 tweets from the query
    try:
        print ('----')
        # pprint (document) # uncomment to print the entire tweet document

        print ('name:', document["user"]["name"]) # user name
        print ('text:', document["text"])         # tweets
    except Exception:
        # Report the problem and continue with the next tweet. Catching
        # Exception rather than using a bare `except:` lets Ctrl-C
        # (KeyboardInterrupt) still stop the cell.
        print ("***error in encoding")

----
name: Staffordshire Live
text: #COVID19 https://t.co/ih0WEGjZwY
----
name: Patricia A. McGowan
text: #COVID19 ...#Virginia
https://t.co/CveNaiMFzw
----
name: Staffordshire Live
text: #prison #COVID19 https://t.co/9I1yUKcLPf
----
name: Patricia A. McGowan
text: #COVID19 ...#Virginia 
https://t.co/hd3kTt12Uu
----
name: Staffordshire Live
text: #COVID19 #COVIDfines #lockdown https://t.co/wAlkAF9pUx
----
name: Staffordshire Live
text: Today's #COVID19 update https://t.co/8oD78jA9Og
----
name: 🏳️‍🌈🇺🇸⚖️🗝☀️💡🌊
text: @JordanChariton @natespuewell @RandPaul had #COVID19...
https://t.co/jSavxzW31v
----
name: Patricia A. McGowan
text: #COVID19 testing ... #Virginia #Chesterfield 
https://t.co/NCQxIa7kp5
----
name: Patricia A. McGowan
text: #COVID19 in #Virginia now ... Please #WearAMask 
https://t.co/FsYej81aJj
----
name: VCU Internal Medicine Clerkship
text: RT @VCUHealth: As protective measures against #COVID19 continue, we're here to answer some of your questions on how to celebrate the #h

In [18]:
# Run the query again to obtain a fresh cursor for the pandas cell below.
tweet_cursor = final_collection.find({"$text": {"$search": "covid19"}}) # cursor over tweets whose text matches "covid19"

Use pandas to display tweets

In [None]:
# Pull every matching tweet document out of the cursor and load them into a DataFrame.
matching_tweets = list(tweet_cursor)
tweet_df = pd.DataFrame(matching_tweets)
tweet_df.head(10)  # display the first 10 tweets

In [None]:
# Distribution of the "favorite_count" column across the matched tweets.
tweet_df["favorite_count"].hist() # create a histogram showing the favorite count