In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
# To add wait time between requests
import time
# For matching string expressions
import re
# For generating summary statistics
import statistics as st

# Define rootpath
rp = 'C:\\Users\\sgmmahon\\Documents\\GitHub\\iom_project\\'
mp = 'methods\\accessing_tweets\\'
dp = 'data\\tweet_data\\' 

# Bearer token for the Twitter API.
# SECURITY: never paste the real token into the notebook - anything saved or committed
# here is readable by everyone with access to the file. The token that used to be
# hardcoded on this line must be treated as compromised and revoked/regenerated.
# Supply the token from outside the notebook instead, e.g. run `set TOKEN=...` (Windows)
# or `export TOKEN=...` (Unix) before launching Jupyter; auth() reads it from the environment.
#os.environ['TOKEN'] = 'BEARER-TOKEN-HERE'
if not os.environ.get('TOKEN'):
    # Warn early instead of failing later with an opaque 401 from the API
    print("WARNING: environment variable TOKEN is not set - Twitter API requests will be rejected.")

#### Define search terms

In [2]:
# Create functions which concatenate lists of search terms into a single query string
def cnct(x):
    """Join the search terms in ``x`` into a single string separated by ' OR '."""
    separator = " OR "
    return separator.join(x)
def cnctwb(x):
    """Join the search terms in ``x`` with ' OR ' and wrap the result in round brackets."""
    joined_terms = " OR ".join(x)
    return "({})".format(joined_terms)

In [3]:
# https://1000mostcommonwords.com/1000-most-common-ukrainian-words/

# Common Ukrainian words used as OR-ed keywords for the tweet search.
# NOTE(review): several entries look truncated (e.g. "еякі", "чере", "аначений", "скаати",
# "аклинань", "емля", "мінення", "ову", "мать") - presumably characters were lost when the
# list was copied; verify against the source page before relying on them.
# NOTE(review): some words appear more than once (e.g. "що", "це", "було", "великий", "від");
# the duplicates are passed through to the query unchanged.
ua_common_words = ["як", "Я", "його", "що", "він", "було", "для", "на", "є", "еякі", "вони", "бути", "у", "один", "мати", "це", 
                   "від", "по", "гаряча", "слово", "але", "що", "деякі", "вогонь", "це", "ви", "або", "було", "план", "и", "до", 
                   "і", "кішка", "в", "ми", "може", "чере", "другий", "були", "які", "зробити", "їх", "час", "якщо", "буде", "як", 
                   "аначений", "вона", "кожен", "скаати", "робить", "набір", "три", "хотіти", "повітря", "добре", "також", "грати",
                   "невеликої", "кінець", "ставити", "додому", "читати", "рука", "порт", "великий", "аклинань", "додавати", 
                   "навіть", "емля", "тут", "повинні", "великий", "високий", "таких", "слідувати", "акт", "чому", "спитаєте", 
                   "чоловіки", "мінення", "пішов", "світло", "вид", "від", "потрібно", "будинок", "картинка", "спробуйте", "нам", 
                   "ову", "тварин", "точка", "мать", "світ", "рядом", "будувати", "самостійно", "емля", "батько"]

# Rebind the name from the list of words to a single bracketed "(w1 OR w2 OR ...)" query string
ua_common_words = cnctwb(ua_common_words)

In [444]:
# https://en.openrussian.org/list/all

# Common Russian words used as OR-ed keywords for the tweet search.
# NOTE(review): many entries carry a combining acute accent (dictionary stress mark, e.g. 'э́то')
# and are passed to the query verbatim - confirm the search endpoint still matches ordinary,
# unaccented tweet text, or strip the marks before building the query.
# NOTE(review): 'быт' is possibly a truncation of 'быть' - verify against the source list.
ru_common_words = ['и', 'в', 'не', 'на', 'что', 'тот', 'быт', 'с', 'а', 'весь', 'как', 'по', 'но', 'э́то', 'к', 'у', 'из', 
                'за', 'так', 'же', 'сказа́ть', 'э́тот', 'кото́рый', 'мочь', 'о', 'челове́к', 'ещё', 'бы', 'тако́й', 'то́лько', 
                'себя́', 'како́й', 'для', 'уже́', 'когда́', 'кто', 'вот', 'да', 'год', 'знать', 'е́сли', 'до', 'говори́ть', 'и́ли', 
                'мой', 'вре́мя', 'рука́', 'са́мый', 'нет', 'ни', 'стать', 'большо́й', 'друго́й', 'свой', 'де́ло', 'под', 'где', 
                'что́бы', 'ну', 'сам', 'есть', 'раз', 'чём', 'там', 'глаз', 'пе́рвый', 'день', 'жизнь', 'тут', 'ничто́', 
                'пото́м', 'о́чень', 'ли', 'при', 'хоте́ть', 'на́до', 'голова́', 'без', 'ви́деть', 'тепе́рь', 'идти́', 'друг', 'сейча́с', 
                'стоя́ть', 'дом', 'то́же', 'по́сле', 'мо́жно', 'сло́во', 'че́рез', 'ме́сто', 'ду́мать', 'здесь', 'спроси́ть', 'лицо́', 
                'тогда́', 'до́лжный', 'ведь', 'но́вый', 'ка́ждый']

# Rebind the name from the list of words to a single bracketed "(w1 OR w2 OR ...)" query string
ru_common_words = cnctwb(ru_common_words)

#### Define functions

In [4]:
# Define function which retrieves token from the environment
def auth():
    """Return the value of the ``TOKEN`` environment variable, or ``None`` when it is unset."""
    token = os.environ.get('TOKEN')
    return token

# Define function that uses bearer token to create headers used to access the API
def create_headers(bearer_token):
    """Build the HTTP headers dict carrying ``bearer_token`` as a Bearer authorization."""
    authorization_value = "Bearer {0}".format(bearer_token)
    return dict(Authorization=authorization_value)

# Function to create url to make GET request
def create_url(query, start_time, end_time, max_results = 10):
    """Return ``(search_url, query_params)`` for a full-archive tweet search request.

    query       -- search query string sent as the 'query' parameter
    start_time  -- value sent as the 'start_time' parameter
    end_time    -- value sent as the 'end_time' parameter
    max_results -- value sent as the 'max_results' parameter (default 10)

    The 'next_token' entry is only a placeholder here; connect_to_endpoint
    overwrites it before each request.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Required parameters (change params based on the endpoint you are using)
    request_params = dict(query=query,
                          start_time=start_time,
                          end_time=end_time,
                          max_results=max_results)

    # Additional parameters which can be requested optionally
    request_params['expansions'] = 'author_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id,geo.place_id,entities.mentions.username'
    request_params['tweet.fields'] = 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities,possibly_sensitive'
    request_params['user.fields'] = 'id,name,username,created_at,description,public_metrics,verified,location,url'
    request_params['place.fields'] = 'full_name,id,country,country_code,contained_within,geo,name,place_type'

    # ID to call next page of tweets
    request_params['next_token'] = {}

    return endpoint, request_params

# Function to make GET request
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 60):
    """Send a GET request to ``url`` and return the decoded JSON response.

    url        -- endpoint to call
    headers    -- HTTP headers (e.g. the dict built by create_headers)
    params     -- query parameters (e.g. the dict built by create_url); its
                  'next_token' entry is overwritten in place with ``next_token``
    next_token -- pagination token for the page to fetch, or None for the first page
    timeout    -- seconds to wait for the server before giving up; without it a
                  stalled connection would block the notebook indefinitely

    Raises Exception(status_code, response_text) for any non-200 response.
    """
    params['next_token'] = next_token   # params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

# Function to obtain variable names
def get_var_name(variable):
    """Return the name of the first global variable whose value equals ``variable``.

    Globals are scanned in definition order; a global matches when it is the
    very same object as ``variable`` or compares equal to it. Returns None when
    nothing matches.

    The namespace is read directly instead of through eval(): eval() also sees
    this function's own locals, so globals called ``name`` or ``variable`` would
    have been shadowed. Globals whose comparison cannot be reduced to a single
    True/False (e.g. DataFrames or arrays) are skipped instead of aborting the
    whole search.
    """
    # Snapshot the namespace so it cannot change size while we iterate
    for name, value in list(globals().items()):
        if value is variable:
            return name
        try:
            if bool(value == variable):
                return name
        except (TypeError, ValueError):
            # Comparison has no single truth value for this global - not a match
            continue
    return None


# Placeholder written to the CSV whenever a field is absent from the API response
_BLANK = " "


def _index_places(includes):
    """Map place_id -> summary dict for every place listed in the response ``includes``."""
    places = {}
    for place in includes.get('places', []):
        bbox = place.get('geo', {}).get('bbox')
        if bbox and len(bbox) == 4:
            bbox_text = ' '.join(str(coord) for coord in bbox)
            # bbox is ordered [west_long, south_lat, east_long, north_lat],
            # so latitudes sit at indices 1/3 and longitudes at indices 0/2
            centre_lat  = st.mean([bbox[1], bbox[3]])
            centre_long = st.mean([bbox[0], bbox[2]])
        else:
            bbox_text, centre_lat, centre_long = _BLANK, _BLANK, _BLANK
        # setdefault keeps the first occurrence when a place is listed twice
        places.setdefault(place['id'], {
            'place_name'     : place.get('name', _BLANK),
            'full_place_name': place.get('full_name', _BLANK),
            'bbox'           : bbox_text,
            'lat'            : centre_lat,
            'long'           : centre_long,
            'place_type'     : place.get('place_type', _BLANK),
            'country_code'   : place.get('country_code', _BLANK),
            'country'        : place.get('country', _BLANK)})
    return places


def _index_users(includes):
    """Map user_id -> summary dict for every user listed in the response ``includes``."""
    users = {}
    for user in includes.get('users', []):
        metrics = user.get('public_metrics', {})
        # setdefault keeps the first occurrence when a user is listed twice
        users.setdefault(user['id'], {
            'username'       : user.get('username', _BLANK),
            'user_name'      : user.get('name', _BLANK),
            'followers_count': metrics.get('followers_count', _BLANK),
            'following_count': metrics.get('following_count', _BLANK),
            'tweet_count'    : metrics.get('tweet_count', _BLANK),
            'listed_count'   : metrics.get('listed_count', _BLANK),
            'user_url'       : user.get('url', _BLANK),
            'user_loc'       : user.get('location', _BLANK),
            'user_desc'      : user.get('description', _BLANK)})
    return users


def _tweet_to_row(tweet, places, users):
    """Flatten one tweet object into the list of CSV column values (same order as the header)."""
    # 1-3. Tweet ID, time created, author ID
    tweet_id   = tweet['id']
    created_at = dateutil.parser.parse(tweet['created_at'])
    author_id  = tweet['author_id']

    # 4. Geolocation
    geo = tweet.get('geo')
    if geo:
        place_id = geo.get('place_id', _BLANK)
        place    = places.get(place_id, {})
        if 'coordinates' in geo:
            # GeoJSON points are ordered [longitude, latitude]
            long, lat    = geo['coordinates']['coordinates'][:2]
            exact_coords = True
        else:
            # No exact point: fall back to the centre of the place bounding box
            lat          = place.get('lat', _BLANK)
            long         = place.get('long', _BLANK)
            exact_coords = False
    else:
        # No geographical information: every geographical column is blank
        place_id, place = _BLANK, {}
        lat, long, exact_coords = _BLANK, _BLANK, _BLANK

    # 6. Tweet metrics
    metrics = tweet['public_metrics']

    # 8. Author details (blank when the author is missing from the includes)
    user = users.get(author_id, {})

    # 12. Referenced tweets (only the first reference is recorded)
    referenced = tweet.get('referenced_tweets')
    if referenced:
        referenced_tweets_type = referenced[0]['type']
        referenced_tweets_id   = referenced[0]['id']
    else:
        referenced_tweets_type = _BLANK
        referenced_tweets_id   = _BLANK

    # 14-16. Entities: annotations, mentions and the first linked URL
    entities   = tweet.get('entities', {})
    urls       = entities.get('urls')
    linked_url = urls[0].get('expanded_url', _BLANK) if urls else _BLANK

    return [tweet_id, created_at, author_id,
            place_id, place.get('place_name', _BLANK), place.get('full_place_name', _BLANK),
            lat, long, exact_coords,
            place.get('bbox', _BLANK), place.get('place_type', _BLANK),
            place.get('country_code', _BLANK), place.get('country', _BLANK),
            tweet.get('lang', _BLANK),
            metrics['retweet_count'], metrics['reply_count'], metrics['like_count'], metrics['quote_count'],
            tweet['text'],
            user.get('username', _BLANK), user.get('user_name', _BLANK),
            user.get('followers_count', _BLANK), user.get('following_count', _BLANK),
            user.get('tweet_count', _BLANK), user.get('listed_count', _BLANK),
            user.get('user_url', _BLANK), user.get('user_loc', _BLANK), user.get('user_desc', _BLANK),
            tweet.get('source', _BLANK), tweet.get('conversation_id', _BLANK), tweet.get('reply_settings', _BLANK),
            referenced_tweets_type, referenced_tweets_id, tweet.get('in_reply_to_user_id', _BLANK),
            entities.get('annotations', _BLANK), entities.get('mentions', _BLANK), linked_url,
            tweet.get('possibly_sensitive', _BLANK)]


# Function to append results to a csv
def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet in ``json_response`` to ``fileName``.

    json_response -- decoded search response; tweets are read from 'data' and
                     author/place details from 'includes' ('users', 'places')
    fileName      -- path of the CSV file, opened in append mode (created if missing)

    Fields missing from the response are written as a single space. The 'lat'
    and 'long' columns hold latitude and longitude respectively: both the place
    bounding box and exact point coordinates arrive longitude-first and are
    reordered here (earlier versions of this function wrote them swapped).
    Prints the number of rows written; returns None.
    """
    includes = json_response.get('includes', {})
    places   = _index_places(includes)
    users    = _index_users(includes)

    # Number of tweets written from this response
    counter = 0

    # Open OR create the target CSV file; the with-block closes it even on error
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)
        for tweet in json_response.get('data', []):
            csvWriter.writerow(_tweet_to_row(tweet, places, users))
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)


# Function that uses looping to make multiple calls, to ensure the desired number of tweets during each period have been collected 
def call_tweets(keywords,start_list,end_list,max_results,max_count,pathway,wait_seconds = 5):
    """Collect tweets for each search period and append them to the CSV at ``pathway``.

    keywords     -- query string passed to create_url
    start_list   -- start timestamps, one per search period
    end_list     -- end timestamps, paired element-wise with ``start_list``
    max_results  -- max tweets requested per API call
    max_count    -- stop paging a period once at least this many tweets were collected;
                    checked before each request, so a period can overshoot by up to
                    one page of results
    pathway      -- CSV file to append to; the header row is written only when the
                    file does not exist yet or is empty
    wait_seconds -- pause after every request (default 5)

    Raises ValueError when ``start_list`` and ``end_list`` differ in length.
    Prints progress while running; returns None.
    """
    if len(start_list) != len(end_list):
        raise ValueError("start_list and end_list must have the same length")

    #Inputs for tweets
    bearer_token = auth()
    headers = create_headers(bearer_token)

    #Total number of tweets we collected from the loop
    total_tweets = 0

    # Create the file if needed; only a new/empty file gets the header row so that
    # repeated calls on the same pathway do not scatter header rows through the data
    needs_header = not os.path.exists(pathway) or os.path.getsize(pathway) == 0
    with open(pathway, "a", newline="", encoding='utf-8') as csvFile:
        if needs_header:
            csv.writer(csvFile).writerow(
                ['tweet_id', 'created_at', 'author_id', 
                 'place_id', 'place_name', 'full_place_name', 'lat', 'long', 'exact_coords', 'bbox', 'place_type', 'country_code', 'country', 
                 'lang', 'retweet_count', 'reply_count', 'like_count', 'quote_count', 'text', 
                 'username', 'user_name', 'followers_count', 'following_count', 'tweet_count', 'listed_count', 'user_url', 'user_loc', 'user_desc', 
                 'source', 'conversation_id', 'reply_settings', 'referenced_tweets_type', 'referenced_tweets_id', 'in_reply_to_user_id', 
                 'annotations', 'mentions', 'linked_url', 'possibly_sensitive'])

    # Loop over the search periods, paging through each until max_count tweets
    # have been collected or the API reports no further page
    for start_time, end_time in zip(start_list, end_list):

        count = 0          # Counting tweets per time period
        next_token = None  # None requests the first page

        # The request is identical for every page of a period except for next_token,
        # which connect_to_endpoint sets on the params dict before each call
        search_url, query_params = create_url(keywords, start_time, end_time, max_results)

        while count < max_count:

            # Call tweets
            print("-------------------")
            print("Token: ", next_token)
            json_response = connect_to_endpoint(search_url, headers, query_params, next_token)

            # Get number of tweets called and the token of the next page (None on the last page)
            meta = json_response['meta']
            result_count = meta.get('result_count')
            next_token = meta.get('next_token')
            if next_token is not None:
                print("Next Token: ", next_token)

            # If results have been returned, append them to the csv and print progress
            if result_count is not None and result_count > 0:
                print("Start Date: ", start_time)
                append_to_csv(json_response, pathway)
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")

            # Pause between requests to stay within the API rate limits
            time.sleep(wait_seconds)

            # No next token means this was the final page: move to the next time period
            if next_token is None:
                break

    # Print total number of tweets called            
    print("Total number of results: ", total_tweets)

In [None]:
# Key elements of twitter V2 API call
#json_response['data']               # Info about tweets
#json_response['includes']['users']  # Info about users mentioned in tweets
#json_response['includes']['places'] # Info about places attached to tweets
#json_response['includes']['tweets'] # Info about tweets that interact with primary tweets (e.g. replies, quotes)
#json_response['errors']             # Errors that occurred when calling tweets
#json_response['meta']               # Meta data like newest and oldest tweets, total tweets called and next token if paginating

#### Call Ukraine tweets

In [6]:
# Inputs for the request
# (ua_common_words is already a single "(w1 OR w2 ...)" string, so this is the complete query)
keywords_list = ua_common_words + ' -is:retweet place_country:UA'

# Define other input parameters
# Day numbers are zero-padded ("2021-07-01", not "2021-07-1") so every timestamp is a
# valid RFC 3339 date-time, matching the format used for the Russian search periods.
# NOTE(review): range(1,31) covers 1-30 July only, although the file name below suggests
# the whole month up to 1 August - confirm whether 31 July should be included.
start_list  = ['2021-07-{:02d}T01:00:00.000Z'.format(i) for i in range(1,31)] # Start of search period (can enter list if searching multiple distinct periods)
end_list    = ['2021-07-{:02d}T13:00:00.000Z'.format(i) for i in range(1,31)] # End   of search period (can enter list if searching multiple distinct periods)
max_results = 500                          # Max tweets returned per call
max_count   = 300                          # Max tweets called in total per time period
pathway     = rp + dp + "tweets/ukraine_tweets_01072021_01082021.csv" # Where to save / what to call resulting csv file

call_tweets(keywords_list,start_list,end_list,max_results,max_count,pathway)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdj6oycva7db6725ffk4ty6i44m0t
Start Date:  2021-07-1T01:00:00.000Z
# of Tweets added from this response:  491
Total # of Tweets added:  491
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdj6p6ukstwd3k4y1m7h0u3gtcnst
Start Date:  2021-07-2T01:00:00.000Z
# of Tweets added from this response:  484
Total # of Tweets added:  975
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdj73znkgncgegeowb1exjj2bl5z1
Start Date:  2021-07-3T01:00:00.000Z
# of Tweets added from this response:  497
Total # of Tweets added:  1472
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdj7461qqj10njziqm1bdjbyzdzzx
Start Date:  2021-07-4T01:00:00.000Z
# of Tweets added from this response:  497
Total # of Tweets added:  1969
------------------

#### Call Russian tweets

In [None]:
# Inputs for the request
# (ru_common_words is already a single "(w1 OR w2 ...)" string, so this is the complete query)
keywords_list = ru_common_words + ' -is:retweet place_country:RU'

# Number of times this cell has been run in the current kernel; used below to give each
# test run its own output file. Starts from 0 when the counter does not exist yet, so the
# cell also works on a fresh kernel (a bare `call_count += 1` raised NameError there).
call_count = globals().get('call_count', 0) + 1

# Define other input parameters
start_list  = ['2021-06-30T13:00:00.000Z','2021-07-01T13:00:00.000Z','2021-07-02T13:00:00.000Z',
               '2021-07-03T13:00:00.000Z','2021-07-04T13:00:00.000Z','2021-07-05T13:00:00.000Z',
               '2021-07-06T13:00:00.000Z','2021-07-07T13:00:00.000Z','2021-07-08T13:00:00.000Z','2021-07-09T13:00:00.000Z'] # Start of search period (can enter list if searching multiple distinct periods)
end_list    = ['2021-07-01T13:00:00.000Z','2021-07-02T13:00:00.000Z','2021-07-03T13:00:00.000Z',
               '2021-07-04T13:00:00.000Z','2021-07-05T13:00:00.000Z','2021-07-06T13:00:00.000Z',
               '2021-07-07T13:00:00.000Z','2021-07-08T13:00:00.000Z','2021-07-09T13:00:00.000Z','2021-07-10T13:00:00.000Z'] # End   of search period (can enter list if searching multiple distinct periods)
max_results = 10                         # Max tweets returned per call
max_count   = 3                         # Max tweets called in total per time period
pathway     = rp + dp + "tweets/russian_tweets_test_" + str(call_count) + ".csv" # Where to save / what to call resulting csv file

# Test run: only the first search period is requested
call_tweets(keywords_list,start_list[0:1],end_list[0:1],max_results,max_count,pathway)