In [1]:
# Collect relevant tweets through the Twitter API.
import json
import tweepy as tw

In [2]:
# IMPORTANT: enter proper access credential in config_twitter.py file
import config_twitter

In [3]:
# function to establish an initial API connection, respecting the rate limit
def connect_api_client():
    auth = tw.OAuthHandler(config_twitter.consumer_key, config_twitter.consumer_secret)
    auth.set_access_token(config_twitter.access_token, config_twitter.access_token_secret)
    # https://docs.tweepy.org/en/stable/getting_started.html#api
    api = tw.API(auth, wait_on_rate_limit=True)
    try:
        # returns False if credentials could not be verified
        # https://docs.tweepy.org/en/stable/api.html#API.verify_credentials
        api.verify_credentials()
        user = api.verify_credentials()
        if not user:
            raise("Credentials could not be verified: Please check config.py")
        print(f"Connected to Twitter API as {user.name}")
    except Exception as e:
        raise e
    return api

In [4]:
api = connect_api_client()

Connected to Twitter API as test_taro


In [5]:
# construct a search query
query = 'marry OR "married" OR "divorce" OR "divorced" OR "wife" OR "husband" -filter:retweets'

In [6]:
# decide how many tweets to query
###TODO increase this value later to collect a good dataset (try 2000 for instance)
ntweets = 10000

In [7]:
# search and collect relevant tweets
# https://docs.tweepy.org/en/stable/cursor_tutorial.html
# https://docs.tweepy.org/en/stable/code_snippet.html
tweets = [tweet._json for tweet in tw.Cursor(api.search_tweets, q=query, lang="en", tweet_mode='extended').items(ntweets)]
len(tweets)

Rate limit reached. Sleeping for: 816
Rate limit reached. Sleeping for: 822
Rate limit reached. Sleeping for: 819


10000

In [8]:
# example tweet content (json structure)
tweets[0]

{'created_at': 'Sun Jul 31 16:08:51 +0000 2022',
 'id': 1553774700660232192,
 'id_str': '1553774700660232192',
 'full_text': "Ask Netflix, and another streaming service to pick up The Time Traveler's Wife - Sign the Petition! https://t.co/09mcPR7pqH via @UKChange",
 'truncated': False,
 'display_text_range': [0, 137],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'UKChange',
    'name': 'Change.org UK',
    'id': 397998902,
    'id_str': '397998902',
    'indices': [128, 137]}],
  'urls': [{'url': 'https://t.co/09mcPR7pqH',
    'expanded_url': 'https://chng.it/hcqWdQ4J',
    'display_url': 'chng.it/hcqWdQ4J',
    'indices': [100, 123]}]},
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_scre

In [9]:
# save tweets data to json file
file_out = f"raw_tweet_data_{ntweets}.json"
with open(file_out, mode='w') as f:
    f.write(json.dumps(tweets, indent=2))