# Using Twitter's Academic API to search for tweets (with geotags)

In [18]:
# Importing libraries
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
from ast import literal_eval

def unnest_json(dataframe, column):
    dataframe_new = json_normalize(dataframe[column].apply(literal_eval))
    return dataframe_new

In [19]:
# Accessing the Academic API 
def connect_to_endpoint(bearer_token, query, next_token=None):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    # add additional parameters as needed
    params = {
        'expansions' : "author_id,referenced_tweets.id,geo.place_id,in_reply_to_user_id,referenced_tweets.id.author_id",
        'tweet.fields' : "attachments,author_id,context_annotations,created_at,entities,public_metrics",
        'user.fields' : "created_at,username,verified,description,entities,id,location,name,public_metrics,url",
        'place.fields' : "contained_within,country,country_code,full_name,geo,id,name,place_type"}
    # replace appropriate start and end times below, in our study, we collected tweets date back to 2013
    if (next_token is not None):
        url = "https://api.twitter.com/2/tweets/search/all?max_results=500&query={}&start_time=2013-01-01T00:00:00Z&end_time=2021-03-31T23:59:59.000Z&next_token={}".format(query, next_token)
    else:
        url = "https://api.twitter.com/2/tweets/search/all?max_results=500&start_time=2013-01-01T00:00:00Z&end_time=2021-03-31T23:59:59.000Z&query={}".format(query)
    response = requests.request("GET", url, params=params, headers=headers)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
count = 0
result_count = 0
flag = True
# Replace with your own bearer token from your academic project in developer portal
bearer_token = "XXXX"
tmp = pd.DataFrame()
while flag:
    # Replace the count below with the number of Tweets you want to stop at. 
    # Note: running without the count check will result in getting more Tweets
    # that will count towards the Tweet cap
    if count >= 7000000:
        break
    json_response = connect_to_endpoint(bearer_token, '("belt and road" OR "one belt one road" OR "new silk road" OR %23beltandroad OR %23beltandroadinitiative OR %23cpec OR %23obor) lang:en -is:retweet -is:reply has:geo')
    while 'next_token' in json_response['meta']:
        next_token = json_response['meta']['next_token']
        result_count = json_response['meta']['result_count']
        #print(next_token)
        if result_count is not None and result_count > 0 and next_token is not None:
            df_tweet = pd.DataFrame(json_response['data'])
            df_user = pd.DataFrame(json_response['includes']['users'])
            df_places = pd.DataFrame(json_response['includes']['places'])
            df_full = pd.merge(df_tweet, df_user, how = 'left', left_on = 'author_id', right_on = 'id', suffixes=('_tweet', '_user'))
            df_full['geo'] = df_full['geo'].fillna({i: {} for i in df_full.index})
            df_full['geo_id'] =  df_full['geo'].apply(lambda x: x.get('place_id'))
            df_full = pd.merge(df_full, df_places, how = 'outer', left_on = 'geo_id', right_on = 'id', suffixes=('_meta', '_geo'))
            tmp = tmp.append(df_full)
            count += result_count
            print("{} tweets saved in the file".format(count))
            json_response = connect_to_endpoint(bearer_token, '("belt and road" OR "one belt one road" OR "new silk road" OR %23beltandroad OR %23beltandroadinitiative OR %23cpec OR %23obor) lang:en -is:retweet -is:reply has:geo', next_token)
    else:
        flag = False
print("Finished searching for tweets!Total Tweet IDs saved: {}".format(count))
tmp.to_csv("tweets.csv", encoding="utf-8",index=False, escapechar="\r")

500 tweets saved in the file
998 tweets saved in the file
1492 tweets saved in the file
1984 tweets saved in the file
2469 tweets saved in the file
2950 tweets saved in the file
3422 tweets saved in the file
3899 tweets saved in the file
Finished searching for tweets!Total Tweet IDs saved: 3899


In [21]:
len(tmp)

3899

## Finding top ranked hashtags in the tweets. This step could also help extend the list of keywords used to collect tweets (building up the queries by adding top-ranked hashtags)

In [None]:
import advertools
text = tmp.text.values.tolist()
hashtag_summary = advertools.extract_hashtags(text)
top_hashtags = hashtag_summary['top_hashtags'][0:20]
top_hashtags