<a href="https://colab.research.google.com/github/sabinagio/ASOBAL-Hackathon/blob/main/twitter_api_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter API data collection - proof of concept

In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

In [2]:
os.environ['TOKEN'] = 'AAAAAAAAAAAAAAAAAAAAAFwFgQEAAAAAOjRNg8PpoZ29uTo2Z%2By0K7DEM3s%3DI1NuBq4xOCsYtDMg8nos58Io1kQKgdW80FSEKaaFcSK69TueFy'

In [3]:
def auth():
    return os.getenv('TOKEN')

In [4]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [5]:
def create_url(keyword, max_results=100):
    
    search_url = "https://api.twitter.com/2/tweets/search/recent" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword, 
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id', 
                    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld', 
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified', 
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type', 
                    'next_token': {}}
    return (search_url, query_params)

In [6]:
def connect_to_endpoint(url, headers, params, next_token=None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
        
    return response.json()

In [7]:
#Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = '''fear OR fearful OR afraid OR scared OR terrified OR worry OR worried OR anxiety OR anxious \
  OR distress OR concern OR dismay OR strain OR stress OR tension -"nothing to fear" -"fear not" -"don't worry" -"no worries" lang:en'''

In [8]:
### Small idea space
# perhaps use "I fear", "My fear" 
# remove items that have "fear not"
# How do we find what people fear? 
# Do not include "nothing to fear"

In [9]:
url = create_url(keyword)
json_response = connect_to_endpoint(url[0], headers, url[1])

Endpoint Response Code: 200


In [10]:
json_response

{'data': [{'entities': {'urls': [{'start': 139,
      'end': 162,
      'url': 'https://t.co/elXp0H8lKe',
      'expanded_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-never-worried-about-using-sunscreen-promotes-sun-safety-after-finding-tennis-ball-sized-lump-under-his-arm.html?utm_source=dlvr.it&utm_medium=twitter',
      'display_url': 'ireland-live.ie/news/health/89…',
      'images': [{'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=orig',
        'width': 1200,
        'height': 700},
       {'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=150x150',
        'width': 150,
        'height': 150}],
      'status': 200,
      'title': 'Former landscape gardener who “never worried about using sunscreen” promotes sun safety after finding tennis ball-sized lump under his arm',
      'unwound_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-neve

In [11]:
# Figure out how to convert the JSON response to Pandas dataframe
json_response.keys()

dict_keys(['data', 'includes', 'meta'])

In [12]:
json_response["data"][0].keys()

dict_keys(['entities', 'reply_settings', 'conversation_id', 'text', 'created_at', 'public_metrics', 'context_annotations', 'id', 'author_id', 'possibly_sensitive', 'source'])

In [13]:
len(json_response["data"])

10

In [14]:
json_response["data"][0]

{'entities': {'urls': [{'start': 139,
    'end': 162,
    'url': 'https://t.co/elXp0H8lKe',
    'expanded_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-never-worried-about-using-sunscreen-promotes-sun-safety-after-finding-tennis-ball-sized-lump-under-his-arm.html?utm_source=dlvr.it&utm_medium=twitter',
    'display_url': 'ireland-live.ie/news/health/89…',
    'images': [{'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=orig',
      'width': 1200,
      'height': 700},
     {'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=150x150',
      'width': 150,
      'height': 150}],
    'status': 200,
    'title': 'Former landscape gardener who “never worried about using sunscreen” promotes sun safety after finding tennis ball-sized lump under his arm',
    'unwound_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-never-worried-about-using-sunscreen-prom

> Entity annotations (NER): Entities are comprised of people, places, products, and organizations. Entities are delivered as part of the entity payload section. They are programmatically assigned based on what is explicitly mentioned (named-entity recognition) in the Tweet text.

> Entity annotations are programmatically defined entities that are nested within the entities field and are reflected as annotations in the payload. Each annotation has a confidence score and an indication of where in the Tweet text the entities were identified (start and end fields).

> The entity annotations can have the following types:
> - Person - Barack Obama, Daniel, or George W. Bush
> - Place - Detroit, Cali, or "San Francisco, California"
> - Product - Mountain Dew, Mozilla Firefox
> - Organization - Chicago White Sox, IBM
> - Other - Diabetes, Super Bowl 50

In [29]:
json_response["data"][0]["entities"]

{'urls': [{'start': 139,
   'end': 162,
   'url': 'https://t.co/elXp0H8lKe',
   'expanded_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-never-worried-about-using-sunscreen-promotes-sun-safety-after-finding-tennis-ball-sized-lump-under-his-arm.html?utm_source=dlvr.it&utm_medium=twitter',
   'display_url': 'ireland-live.ie/news/health/89…',
   'images': [{'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=orig',
     'width': 1200,
     'height': 700},
    {'url': 'https://pbs.twimg.com/news_img/1563050287015038976/oPhQFWf7?format=jpg&name=150x150',
     'width': 150,
     'height': 150}],
   'status': 200,
   'title': 'Former landscape gardener who “never worried about using sunscreen” promotes sun safety after finding tennis ball-sized lump under his arm',
   'unwound_url': 'https://www.ireland-live.ie/news/health/895480/former-landscape-gardener-who-never-worried-about-using-sunscreen-promotes-sun-safety-after-find

> Context annotations: Derived from the analysis of a Tweet’s text and will include a domain and entity pairing which can be used to discover Tweets on topics that may have been previously difficult to surface. At present, we’re using a list of 80+ domains to categorize Tweets. A CSV file of the available context annotation entities is available for download at our [Github repository](https://github.com/twitterdev/twitter-context-annotations).

In [28]:
json_response["data"][0]["context_annotations"]

[{'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy view into the Semantic Core knowledge graph'},
  'entity': {'id': '847900493514891265',
   'name': 'Sports',
   'description': 'Sports'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy view into the Semantic Core knowledge graph'},
  'entity': {'id': '847903248933502976',
   'name': 'Tennis',
   'description': 'Tennis'}},
 {'domain': {'id': '65',
   'name': 'Interests and Hobbies Vertical',
   'description': 'Top level interests and hobbies groupings, like Food or Travel'},
  'entity': {'id': '850395585941086209',
   'name': 'Beauty',
   'description': 'Beauty'}},
 {'domain': {'id': '67',
   'name': 'Interests and Hobbies',
   'description': 'Interests, opinions, and behaviors of individuals, groups, or cultures; like Speciality Cooking or Theme Parks'},
  'entity': {'id': '855109058793160704',
   'name': 'Sunscreen',
   'description': 'Sunscre

In [16]:
json_response["includes"]['users'][1].keys()

dict_keys(['username', 'created_at', 'name', 'description', 'public_metrics', 'verified', 'id'])

In [17]:
json_response["includes"]['users'][0]

{'username': 'IrelandLiveNew',
 'created_at': '2021-12-08T14:56:07.000Z',
 'name': 'Ireland Live',
 'description': '💻📱 Covering the latest Ireland news and events.  \n⚡️ Powered by Iconic Media Group\n👉 https://t.co/NAHozB5C4G\n📧 email: news@ireland-live.ie',
 'public_metrics': {'followers_count': 142,
  'following_count': 665,
  'tweet_count': 30559,
  'listed_count': 0},
 'verified': False,
 'id': '1468595202189578241'}

In [18]:
len(json_response["includes"]['users'])

11

In [19]:
json_response["meta"].keys()

dict_keys(['newest_id', 'oldest_id', 'result_count', 'next_token'])

In [20]:
json_response["meta"]

{'newest_id': '1563050281319137280',
 'oldest_id': '1563050275308326913',
 'result_count': 10,
 'next_token': 'b26v89c19zqg8o3fpz8l5fb35ijbceu0dqk1129btgnel'}

In [21]:
# Collect tweets into a dataframe
tweets = json_response["data"]
columns_list = list(tweets[0].keys())
columns_list.extend(list(tweets[0]["public_metrics"].keys()))
columns_list.remove("public_metrics")
tweet_data = pd.DataFrame(columns=columns_list)

for tweet in tweets:
  # Extract the public metrics data
  public_metrics = tweet["public_metrics"]
  for key in public_metrics.keys():
    tweet[key] = public_metrics[key]
  tweet.pop("public_metrics")

  # Add tweet to dataframe with unpacked public metrics
  tweet_data = tweet_data.append(tweet, ignore_index=True)

In [22]:
tweet_data

Unnamed: 0,entities,reply_settings,conversation_id,text,created_at,context_annotations,id,author_id,possibly_sensitive,source,retweet_count,reply_count,like_count,quote_count,attachments,referenced_tweets,in_reply_to_user_id
0,"{'urls': [{'start': 139, 'end': 162, 'url': 'h...",everyone,1563050281319137280,Former landscape gardener who “never worried a...,2022-08-26T06:26:41.000Z,"[{'domain': {'id': '131', 'name': 'Unified Twi...",1563050281319137280,1468595202189578241,False,dlvr.it,0,0,0,0,,,
1,"{'urls': [{'start': 109, 'end': 132, 'url': 'h...",everyone,1563050281029406721,RT @IAm_DylanJames: POV: how I handle my brain...,2022-08-26T06:26:41.000Z,,1563050281029406721,65872682,False,Twitter for iPhone,3,0,0,0,{'media_keys': ['16_1563023918830583808']},"[{'type': 'retweeted', 'id': '1563023923981078...",
2,"{'urls': [{'start': 168, 'end': 191, 'url': 'h...",everyone,1563050279964409858,Words myself. withdraw from relationships. The...,2022-08-26T06:26:41.000Z,,1563050279964409858,2172044504,False,Echofon Android,0,0,0,0,{'media_keys': ['3_1563050278190186497']},,
3,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",everyone,1562861550888620033,@News4Laughs @joncoopertweets @CalltoActivism ...,2022-08-26T06:26:41.000Z,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1563050279318454272,1456228008499499008,False,Twitter for Android,0,0,0,0,,"[{'type': 'replied_to', 'id': '156286723012724...",1.34394884078993e+18
4,"{'mentions': [{'start': 3, 'end': 12, 'usernam...",everyone,1563050278550573057,RT @KingoTDF: 10/20\nThe physical damage infli...,2022-08-26T06:26:41.000Z,,1563050278550573057,1496282789498716160,True,Twitter for iPhone,419,0,0,0,,"[{'type': 'retweeted', 'id': '1561000811232219...",
5,,everyone,1563050278206578690,There's no cause for concern,2022-08-26T06:26:41.000Z,,1563050278206578690,1448654585170575360,False,"Cheap Bots, Done Quick!",0,0,0,0,,,
6,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",everyone,1563050278165057538,RT @surreal127clips: taeyong’s life flashing b...,2022-08-26T06:26:41.000Z,"[{'domain': {'id': '65', 'name': 'Interests an...",1563050278165057538,153349060,False,Twitter for Android,18,0,0,0,,"[{'type': 'retweeted', 'id': '1563049697152942...",
7,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",everyone,1563050276994826240,RT @arya_kanti: The one who insulted our worsh...,2022-08-26T06:26:40.000Z,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1563050276994826240,1451203404106289155,False,Twitter for Android,24,0,0,0,,"[{'type': 'retweeted', 'id': '1563032313914884...",
8,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",everyone,1563050275870453760,RT @SilverSupa: Just started school this week ...,2022-08-26T06:26:40.000Z,,1563050275870453760,1562394762954477569,False,Twitter for Android,1,0,0,0,,"[{'type': 'retweeted', 'id': '1562459242220597...",
9,"{'urls': [{'start': 88, 'end': 111, 'url': 'ht...",everyone,1563050275308326913,RT @kurikondeshu: Not afraid cause u r here wi...,2022-08-26T06:26:40.000Z,,1563050275308326913,1104353054768291840,False,Twitter for iPhone,455,0,0,0,{'media_keys': ['3_1562988389128818688']},"[{'type': 'retweeted', 'id': '1562988398716936...",


In [26]:
# Collect user info into a dataframe
users = json_response["includes"]["users"]
columns_list = list(users[0].keys())
columns_list.extend(list(users[0]["public_metrics"].keys()))
columns_list.remove("public_metrics")
user_data = pd.DataFrame(columns=columns_list)

for user in users:
    # Extract the public metrics data
    public_metrics = user["public_metrics"]
    for key in public_metrics.keys():
      user[key] = public_metrics[key]
    user.pop("public_metrics")

    # Add user to dataframe with unpacked public metrics
    user_data = user_data.append(user, ignore_index=True)

In [27]:
user_data

Unnamed: 0,username,created_at,name,description,verified,id,followers_count,following_count,tweet_count,listed_count
0,IrelandLiveNew,2021-12-08T14:56:07.000Z,Ireland Live,💻📱 Covering the latest Ireland news and events...,False,1468595202189578241,142,665,30559,0
1,HeathHog73,2009-08-15T10:21:58.000Z,💡🍒 Heather 🍒 🇩🇰🧜‍♀️,"Beauty lover, Strictly & Footie! Superfan of ❤...",False,65872682,1788,936,17435,73
2,ybomibom,2013-11-03T11:50:15.000Z,bo mi,Roleplayer of Yoon Bomi from A-pink | 93L,False,2172044504,91,85,18718,0
3,JohnHorns1971,2021-11-04T11:53:30.000Z,John Hornsby,Conservative capitalist who loves his country ...,False,1456228008499499008,8,22,193,0
4,News4Laughs,2020-12-29T15:56:44.000Z,Roland Jôintz,News4Laughs® Award winning news Photog & award...,False,1343948840789929986,665,570,43258,2
5,Martawit8,2022-02-23T00:37:19.000Z,Marta,Visit https://t.co/BaTnaGb7IB. https://t.co/8A...,False,1496282789498716160,1640,1254,262167,1
6,lovejoy_lyric,2021-10-14T14:19:13.000Z,lovejoy lyric bot,Lovejoy lyric bot! Questions + Lyric changes: ...,False,1448654585170575360,37,6,14908,0
7,baekwonheart,2010-06-08T09:23:44.000Z,툥f🐳,👑 SJ 👑 EXO 👑 SEVENTEEN 👑 NCT 👑 TAEYEON 👑\n|¦ r...,False,153349060,134,374,86961,1
8,gyanam09642592,2021-10-21T15:07:47.000Z,ज्ञानम দ্বিতীয়,जय मां सरस्वती,False,1451203404106289155,1192,3022,11580,0
9,Sumnao22,2022-08-24T11:02:22.000Z,Ploy FC B 2019 AFAIK,I making profile joining this Twitter account ...,False,1562394762954477569,1,17,136,0


In [24]:
### CODE FROM AN ARTICLE - TO BE USED FOR INSPIRATION IN GETTING RECURRENT REQUESTS

#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = "xbox lang:en"
start_list =    ['2021-01-01T00:00:00.000Z',
                 '2021-02-01T00:00:00.000Z',
                 '2021-03-01T00:00:00.000Z']

end_list =      ['2021-01-31T00:00:00.000Z',
                 '2021-02-28T00:00:00.000Z',
                 '2021-03-31T00:00:00.000Z']
max_results = 500

#Total number of tweets we collected from the loop
total_tweets = 0

# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvFile.close()

for i in range(0,len(start_list)):

    # Inputs
    count = 0 # Counting tweets per time period
    max_count = 100 # Max tweets per time period
    flag = True
    next_token = None
    
    # Check if flag is true
    while flag:
        # Check if max_count reached
        if count >= max_count:
            break
        print("-------------------")
        print("Token: ", next_token)
        url = create_url(keyword, start_list[i],end_list[i], max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']

        if 'next_token' in json_response['meta']:
            # Save the token to use for next call
            next_token = json_response['meta']['next_token']
            print("Next Token: ", next_token)
            if result_count is not None and result_count > 0 and next_token is not None:
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)                
        # If no next token exists
        else:
            if result_count is not None and result_count > 0:
                print("-------------------")
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)
            
            #Since this is the final request, turn flag to false to move to the next time period.
            flag = False
            next_token = None
        time.sleep(5)
print("Total number of results: ", total_tweets)

-------------------
Token:  None


TypeError: ignored