In [1]:
%load_ext dotenv
%dotenv 

# Init

### Imports

In [3]:
import tweepy
from pymongo import MongoClient
import os

### Logging

In [4]:
import logging

logger = logging.getLogger("tweepy")
logger.setLevel(logging.DEBUG)
handler = logging.FileHandler(filename="tweepy.log")
logger.addHandler(handler)

### Connect to MongoDB Server

In [5]:
mongo_conn = MongoClient(os.environ["MONGO_CONN"])
# create a twitter_demo database
db = mongo_conn.twitter_demo

# Get random sample of users

In [6]:
from pymongo import ASCENDING

In [7]:
# create a collection for the users inside the mongodb database
user_collection = db.user_collection
user_collection.create_index([('id', ASCENDING)], unique=True)

'id_1'

Subclass `StreamingClient` which receives the real-time incoming Tweets and saves the authors into the MongoDB Collection we created 

In [32]:
class UserSaverClient(tweepy.StreamingClient):
        
    def on_includes(self, includes):
        print(includes)
        user_collection.insert_one(includes['users'][0].data)

In [33]:
streaming_client = UserSaverClient(bearer_token=os.environ["BEARER_TOKEN"], return_type=dict, max_retries=5)

### Create a filter rule

`-is:retweet` excludes retweets

`lang:en` filters for english tweets

`followers_count:50` filters for twitter accounts that have at least 50 followers

In [32]:
streaming_client.add_rules(tweepy.StreamRule("pink -is:retweet lang:en followers_count:50"))

Response(data=[StreamRule(value='pink -is:retweet lang:en followers_count:50', tag=None, id='1612538568815116289')], includes={}, errors=[], meta={'sent': '2023-01-09T19:55:30.168Z', 'summary': {'created': 1, 'not_created': 0, 'valid': 1, 'invalid': 0}})

In [34]:
streaming_client.get_rules()

{'data': [{'id': '1612538568815116289',
   'value': 'pink -is:retweet lang:en followers_count:50'}],
 'meta': {'sent': '2023-01-09T20:38:11.124Z', 'result_count': 1}}

In [35]:
# streaming_client.delete_rules('1612532159331401728')

### Filter
stop after some time

In [35]:
streaming_client.filter(expansions='author_id')

{'users': [<User id=29324689 name=Greg username=RealistSpeaking>]}
{'users': [<User id=1385696951224602625 name=Nietzschean Bugs Bunny username=whiteracemvp>]}
{'users': [<User id=2446458362 name=Andrew Smith username=theeandrewsmith>, <User id=1188197814205140992 name=dubs username=wannabe_idiot>, <User id=220484646 name=#GayWeHo West Hollywood Los Angeles @WeHoLove username=WeHoLove>]}
{'users': [<User id=131686901 name=(꒪▿꒪) everything is terrible username=sacaitlin>, <User id=28960709 name=Mark Julio 『マークマン』 username=MarkMan23>, <User id=3182648054 name=Shibuya Kaho username=Shibukaho>]}
{'users': [<User id=1334996274328850434 name=~Stephanie~ username=Stephanie_9226>, <User id=1224985201933197312 name=Giantess Elizabeth username=FemaleMacro>]}
{'users': [<User id=1308415935737454593 name=ʚ chellica ɞ username=offichell>, <User id=1307486668987994114 name=nuc(leo)tide username=leo_mastix>]}
{'users': [<User id=1458382284999241728 name=ً username=hyiegfs>, <User id=15443710689028177

KeyboardInterrupt: 

In [8]:
user_collection.estimated_document_count()

7

In [9]:
for document in user_collection.find():
    print(document)

{'_id': ObjectId('63bc7b418068e9e73f2d564b'), 'id': '29324689', 'name': 'Greg', 'username': 'RealistSpeaking'}
{'_id': ObjectId('63bc7b438068e9e73f2d564c'), 'id': '1385696951224602625', 'name': 'Nietzschean Bugs Bunny', 'username': 'whiteracemvp'}
{'_id': ObjectId('63bc7b438068e9e73f2d564d'), 'id': '2446458362', 'name': 'Andrew Smith', 'username': 'theeandrewsmith'}
{'_id': ObjectId('63bc7b448068e9e73f2d564e'), 'id': '131686901', 'name': '(꒪▿꒪) everything is terrible', 'username': 'sacaitlin'}
{'_id': ObjectId('63bc7b448068e9e73f2d564f'), 'id': '1334996274328850434', 'name': '~Stephanie~', 'username': 'Stephanie_9226'}
{'_id': ObjectId('63bc7b458068e9e73f2d5650'), 'id': '1308415935737454593', 'name': 'ʚ chellica ɞ', 'username': 'offichell'}
{'_id': ObjectId('63bc7b468068e9e73f2d5651'), 'id': '1458382284999241728', 'name': 'ً', 'username': 'hyiegfs'}


## Get Tweets of those users

In [10]:
# create a collection for the tweets
tweets_collection = db.tweets_collection
tweets_collection.create_index([('id', ASCENDING)], unique=True)
tweets_collection.create_index([('author_id', ASCENDING)], unique=False)

'id_1'

In [23]:
# create a tweepy client
api = tweepy.Client(bearer_token=os.environ["BEARER_TOKEN"])

Get 100 tweets per user from their timeline. Additional fields to pull: creation time, public metrics.

In [12]:
res = api.get_users_tweets(id='29324689', max_results=100, tweet_fields=['created_at', 'public_metrics'])

In [26]:
for tweet in res.data:
    # print(tweet.data)
    data = tweet.data
    data['author_id'] = '29324689'
    print(data)

{'id': '1612698644666552322', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 53}, 'created_at': '2023-01-10T06:31:34.000Z', 'text': 'stay mad 😎\U0001faf5🏻 https://t.co/OgsGfh1qP4', 'edit_history_tweet_ids': ['1612698644666552322'], '_id': ObjectId('63bd2c3dc087e1a6a40bb2ff'), 'author_id': '29324689'}
{'id': '1612553384325959681', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 22}, 'created_at': '2023-01-09T20:54:21.000Z', 'text': 'OMG im so sorry but you need to work harder', 'edit_history_tweet_ids': ['1612553384325959681'], '_id': ObjectId('63bd2c3dc087e1a6a40bb300'), 'author_id': '29324689'}
{'id': '1612552947522736128', 'public_metrics': {'retweet_count': 1534, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 0}, 'created_at': '2023-01-09T20:52:37.000Z', 'text': 'RT @billboardcharts: The Billboard Global Excl. U.S. top 10 (chart d

In [30]:
for user in user_collection.find():
    uid = user['id']
    res = api.get_users_tweets(id=uid, max_results=100, tweet_fields=['created_at', 'public_metrics'])
    for tweet in res.data:
        data = tweet.data
        data['author_id'] = uid
        tweets_collection.insert_one(data)

In [31]:
tweets_collection.estimated_document_count()

688

In [38]:
# filter tweets of a specific user
for tweet in tweets_collection.find({"author_id":'131686901'}):
    print(tweet)

{'_id': ObjectId('63bd2e88c087e1a6a40bb737'), 'public_metrics': {'retweet_count': 5, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 0}, 'text': 'RT @Kemelwoo: https://t.co/p6qpkIg2gK', 'id': '1612735773803008009', 'created_at': '2023-01-10T08:59:06.000Z', 'edit_history_tweet_ids': ['1612735773803008009'], 'author_id': '131686901'}
{'_id': ObjectId('63bd2e88c087e1a6a40bb738'), 'public_metrics': {'retweet_count': 63, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 0}, 'text': 'RT @timeextension64: "Mario, where are you?!!!"\n\n"Mario Bros., new from Atari!"\n\nhttps://t.co/WX2t5tDW56 https://t.co/ydFLWbvuvE', 'id': '1612735059781615618', 'created_at': '2023-01-10T08:56:16.000Z', 'edit_history_tweet_ids': ['1612735059781615618'], 'author_id': '131686901'}
{'_id': ObjectId('63bd2e88c087e1a6a40bb739'), 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'impression_count': 22}, 'text': '@shangy629 楽しみ