#### Import packages

In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
from datetime import timedelta, date
# To add wait time between requests
import time
# For matching string expressions
import re
# For generating summary statistics
import statistics as st
# For loading in locally saved environmental variables
from dotenv import load_dotenv
# For sending SMS messages via the Twilio API
from twilio.rest import Client

# Load in environmental variables
load_dotenv()

# Define Twilio credentials and client
account_sid = os.environ['TWILIO_ACCOUNT_SID']
auth_token  = os.environ['TWILIO_AUTH_TOKEN']
client = Client(account_sid, auth_token)

# Define rootpaths
rp  = 'C:\\Users\\sgmmahon\\Documents\\GitHub\\iom_project\\'
rp2 = 'C:\\Users\\sgmmahon\\OneDrive - The University of Liverpool\\PhD\\Data\\Twitter Data\\'
mp  = 'methods\\accessing_tweets\\'
dp  = 'data\\tweet_data\\'

#### Define search terms

In [2]:
# Create functions which concatenates vectors
def cnct (x): return(" OR ".join(x))
def cnctwb (x): return("(" + " OR ".join(x) + ")")

In [3]:
# The query for this search is a list of the 100 most common Ukrainian words
# https://1000mostcommonwords.com/1000-most-common-ukrainian-words/

ua_common_words = ["як", "Я", "його", "що", "він", "було", "для", "на", "є", "еякі", "вони", "бути", "у", "один", "мати", "це", 
                   "від", "по", "гаряча", "слово", "але", "що", "деякі", "вогонь", "це", "ви", "або", "було", "план", "и", "до", 
                   "і", "кішка", "в", "ми", "може", "чере", "другий", "були", "які", "зробити", "їх", "час", "якщо", "буде", "як", 
                   "аначений", "вона", "кожен", "скаати", "робить", "набір", "три", "хотіти", "повітря", "добре", "також", "грати",
                   "невеликої", "кінець", "ставити", "додому", "читати", "рука", "порт", "великий", "аклинань", "додавати", 
                   "навіть", "емля", "тут", "повинні", "великий", "високий", "таких", "слідувати", "акт", "чому", "спитаєте", 
                   "чоловіки", "мінення", "пішов", "світло", "вид", "від", "потрібно", "будинок", "картинка", "спробуйте", "нам", 
                   "ову", "тварин", "точка", "мать", "світ", "рядом", "будувати", "самостійно", "емля", "батько"]

ua_common_words = cnctwb(ua_common_words)

#### Define functions

In [4]:
# Function to create a list of date objects between two specified dates
def daterange(date1, date2):
    for n in range(int ((date2 - date1).days)+1):
        yield date1 + timedelta(n)

# Function to create list of string datetimes in a format that the Twitter API can accept
def get_datetimes(start_dt, end_dt, time):
    
    # Create empty list to populate with datetime strings
    datetimes = []
    
    # Append dates with the specified time attached
    for dt in daterange(start_dt, end_dt):
        datetimes.append(dt.strftime("%Y-%m-%d") + time)
    
    # Return list of datetime strings
    return(datetimes)

# Define function which retrieves twitter token from the environment
def auth():
    return os.getenv('TWITTER_TOKEN')

# Define function that uses bearer token to create headers used to access the API
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

# Function to create url to make GET request
def create_url(query, start_time, end_time, max_results = 10, user_nm = None):
    
    search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from

    # If not a username seach, make call using 'common_words' defined above
    if user_nm == None:
        # Define params you want returned
        query_params = {# Required parameters
            'query': query,
            'start_time': start_time,
            'end_time': end_time,
            'max_results': max_results,
            # Additional parameters which can be requested optionally
            'expansions': 'author_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id,geo.place_id,entities.mentions.username',
            'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities,possibly_sensitive',
            'user.fields': 'id,name,username,created_at,description,public_metrics,verified,location,url',
            'place.fields': 'full_name,id,country,country_code,contained_within,geo,name,place_type',
            # ID to call next page of tweets
            'next_token': {}}
    # If a username seach, make call using the username
    else:
        # Define params you want returned
        query_params = {# Required parameters
            'query': '(from:' + query + ' -is:retweet)',
            'start_time': start_time,
            'end_time': end_time,
            'max_results': max_results,
            # Additional parameters which can be requested optionally
            'expansions': 'author_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id,geo.place_id,entities.mentions.username',
            'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities,possibly_sensitive',
            'user.fields': 'id,name,username,created_at,description,public_metrics,verified,location,url',
            'place.fields': 'full_name,id,country,country_code,contained_within,geo,name,place_type',
            # ID to call next page of tweets
            'next_token': {}}
    return (search_url, query_params)

# Function to make GET request
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

# Function to obtain variable names
def get_var_name(variable):
 for name in globals():
     if eval(name) == variable:
        return(name)


# Function to append results to a csv
def append_to_csv(json_response, fileName):

    # A counter variable
    counter = 0

    # Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    
    # If at least one tweet called contains geographic information
    if 'places' in json_response['includes']:
        # Create a dataframe of places called within this batch of tweets
        places = pd.DataFrame(columns=['place_id','place_name','full_place_name','bbox','lat','long','place_type','country_code','country'])
        # For loop which appends information about each place to the places dataframe 
        for place in json_response['includes']['places']:
            # Collate all place information as dictionary
            place_data = {'place_id'       :place['id'          ],
                          'place_name'     :place['name'        ],
                          'full_place_name':place['full_name'   ],
                          'bbox'           :[' '.join(str(coord) for coord in place['geo']['bbox' ])],
                          'lat'            :st.mean([place['geo']['bbox'][0],place['geo']['bbox'][2]]),
                          'long'           :st.mean([place['geo']['bbox'][1],place['geo']['bbox'][3]]),
                          'place_type'     :place['place_type'  ],
                          'country_code'   :place['country_code'],
                          'country'        :place['country'     ]}
        
            # Convert dictionary to a single-row dataframe
            place_data = pd.DataFrame(place_data)
            # Append place information to the places dataframe
            places = places.append(place_data)
    
        # Drop duplicate places and reset index of places
        places = places.drop_duplicates(subset=['place_id']).reset_index().drop('index',axis=1)
        
    #print(places)
    
    # Create a dataframe of users mentioned in this batch of tweets
    users = pd.DataFrame(columns=['user_id','username','user_name','followers_count','following_count',
                                  'tweet_count','listed_count','user_url','user_loc','user_desc'])
    
    # For loop which appends information about each user to the users dataframe
    for user in json_response['includes']['users']:
        # Collate all user information as dictionary
        user_data = {'user_id'        :user['id'         ],
                     'username'       :user['username'   ],
                     'user_name'      :user['name'       ],
                     'user_url'       :user['url'        ],
                     'user_desc'      :user['description'],
                     'followers_count':user['public_metrics']['followers_count'],
                     'following_count':user['public_metrics']['following_count'],
                     'tweet_count'    :user['public_metrics']['tweet_count'    ],
                     'listed_count'   :user['public_metrics']['listed_count'   ]}
    
        if 'location' in user:
            user_data.update({'user_loc': [user['location']]})
        else:
            user_data.update({'user_loc': [' ']})
        
        # Convert dictionary to a single-row dataframe
        user_data = pd.DataFrame(user_data)
        # Append place information to the places dataframe
        users = users.append(user_data)
    
    # Drop duplicate entries and reset index of users
    users = users.drop_duplicates(subset=['user_id']).reset_index().drop('index',axis=1)
    
    #Loop through each tweet
    for tweet in json_response['data']:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Tweet ID
        tweet_id = tweet['id']

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Author ID
        author_id = tweet['author_id']
        
        # 4. Geolocation
        
        # If tweet contains geographical information
        if ('geo' in tweet):
            
            # Subset places dataframe to only include information about place mentioned in tweet
            place = places[places['place_id'] == tweet['geo']['place_id']]
            
            # Check whether place ID tagged in tweet is included in places listed in the response
            # (if not, place will have no rows, so can't be subsetted)
            try:
                place.loc[place.index[0],:]
            # If selecting the first row place causes an error...
            except:
                # Keep tweet ID and assign all other variables as empty
                place_id        = tweet['geo']['place_id']
                place_name      = ''
                full_place_name = ''
                bbox            = ''
                place_type      = ''
                country_code    = ''
                country         = ''
            # If place is includes in json response's listed places...
            else:
                # Assign geographical variables using place information.
                place_id        = tweet['geo']['place_id']
                place_name      = place.loc[place.index[0],'place_name']
                full_place_name = place.loc[place.index[0],'full_place_name']
                bbox            = place.loc[place.index[0],'bbox']
                place_type      = place.loc[place.index[0],'place_type']
                country_code    = place.loc[place.index[0],'country_code']
                country         = place.loc[place.index[0],'country']
            
            
            
            # If tweet contains exact coordinates, provide them and assign exact_coords as True
            if ('coordinates' in tweet['geo']):
                lat             = tweet['geo']['coordinates']['coordinates'][0]
                long            = tweet['geo']['coordinates']['coordinates'][1]
                exact_coords    = True
            # If tweet doesn't contains exact coordinates, provide centre of place bounding box and assign exact_coords as False
            else:
                # Check whether place ID tagged in tweet is included in places listed in the response
                # (if not, place will have no rows, so can't be subsetted)
                try:
                    place.loc[place.index[0],:]
                # If selecting the first row of place causes an error, assign all variables as empty
                except:
                    lat             = ''
                    long            = ''
                    exact_coords    = False
                # If place is includes in json response's listed places, assign geographical variables
                else:
                    lat             = place.loc[place.index[0],'lat']
                    long            = place.loc[place.index[0],'long']
                    exact_coords    = False
        
        # If no geographical information provided, assign all geographical variables as blank
        else:
            place_id        = " "
            place_name      = " "
            full_place_name = " "
            bbox            = " "
            place_type      = " "
            country_code    = " "
            country         = " "
            lat             = " "
            long            = " "
            exact_coords    = " "

        # 5. Language
        lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count   = tweet['public_metrics']['reply_count']
        like_count    = tweet['public_metrics']['like_count']
        quote_count   = tweet['public_metrics']['quote_count']
        
        # 7. Tweet text
        text = tweet['text']
        
        # 8. Users
        
        # Subset users dataframe to only include information about user mentioned who tweeted
        user = users[users['user_id'] == tweet['author_id']]
        
        # Assign geographical variables unsing place information
        username        = user.loc[user.index[0],'username']
        user_name       = user.loc[user.index[0],'user_name']
        followers_count = user.loc[user.index[0],'followers_count']
        following_count = user.loc[user.index[0],'following_count']
        tweet_count     = user.loc[user.index[0],'tweet_count']
        listed_count    = user.loc[user.index[0],'listed_count']
        user_url        = user.loc[user.index[0],'user_url']
        user_loc        = user.loc[user.index[0],'user_loc']
        user_desc       = user.loc[user.index[0],'user_desc']
        
        # 9. Source
        source = tweet['source']

        # 10. Conversation_id
        conversation_id = tweet['conversation_id']

        # 11. Reply settings
        reply_settings = tweet['reply_settings']

        # 12. Referenced tweets
        if ('referenced_tweets' in tweet):   
            referenced_tweets_type = tweet['referenced_tweets'][0]['type']
            referenced_tweets_id   = tweet['referenced_tweets'][0]['id']
        else:
            referenced_tweets_type = " "
            referenced_tweets_id   = " "

        # 13. In reply to user id
        if ('in_reply_to_user_id' in tweet):  
            in_reply_to_user_id = tweet['in_reply_to_user_id']
        else:
            in_reply_to_user_id = " "
            
        # 14. Entities
        if ('entities' in tweet and 'annotations' in tweet['entities']):
            annotations = tweet['entities']['annotations']
        else:
            annotations = " "
        
        # 15. Mentions
        if ('entities' in tweet and 'mentions' in tweet['entities']):
            mentions = tweet['entities']['mentions']
        else:
            mentions = " "
        
        # 16. URLs
        if ('entities' in tweet and 'urls' in tweet['entities']):
            linked_url = tweet['entities']['urls'][0]['expanded_url']
        else:
            linked_url = " "
            
        # 17. Possibly sensitive
        if('possibly_sensitive' in tweet):
            possibly_sensitive = tweet['possibly_sensitive']
        else:
            possibly_sensitive = " "
        
        # Assemble all data in a list
        res = [tweet_id, created_at, author_id, 
               place_id, place_name, full_place_name, lat, long, exact_coords, bbox, place_type, country_code, country,
               lang, retweet_count, reply_count, like_count, quote_count, text, 
               username, user_name, followers_count, following_count, tweet_count, listed_count, user_url, user_loc, user_desc,
               source, conversation_id, reply_settings, referenced_tweets_type, referenced_tweets_id, in_reply_to_user_id, 
               annotations, mentions, linked_url, possibly_sensitive]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)


# Function that uses looping to make multiple calls, to ensure the desired number of tweets during each period have been collected 
def call_tweets(keywords,start_list,end_list,max_results,max_count,pathway,user_nm = None):
    
    #Inputs for tweets
    bearer_token = auth()
    headers = create_headers(bearer_token)
    
    # If csv file does not exist, create and add header row
    if not os.path.isfile(pathway):
        # Create csv file
        csvFile = open(pathway, 'w', newline='', encoding='utf-8')
        # Format for writing
        csvWriter = csv.writer(csvFile)
        # Write headers in first row    
        csvWriter.writerow(['tweet_id', 'created_at', 'author_id', 
                            'place_id', 'place_name', 'full_place_name', 'lat', 'long', 'exact_coords', 'bbox', 'place_type', 'country_code', 'country', 
                            'lang', 'retweet_count', 'reply_count', 'like_count', 'quote_count', 'text', 
                            'username', 'user_name', 'followers_count', 'following_count', 'tweet_count', 'listed_count', 'user_url', 'user_loc', 'user_desc', 
                            'source', 'conversation_id', 'reply_settings', 'referenced_tweets_type', 'referenced_tweets_id', 'in_reply_to_user_id', 
                            'annotations', 'mentions', 'linked_url', 'possibly_sensitive'])
        # Close csv file
        csvFile.close()

    #Total number of tweets we collected from the loop
    total_tweets = 0
    
    # For loop which calls tweets until the max number per period (max_results) has been reached
    for i in range(0,len(start_list)):
        
        # Inputs
        count = 0 # Counting tweets per time period
        flag = True
        next_token = None
        
        # Check if flag is true
        while flag:
            
            # Check if max_count reached
            if count >= max_count:
                break
            
            # Call tweets
            print("-------------------")
            print("Token: ", next_token)
            url = create_url(keywords, start_list[i],end_list[i], max_results, user_nm = user_nm)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            
            # Get number of tweets called
            result_count = json_response['meta']['result_count']
            
            # If 'next_token' provided
            if 'next_token' in json_response['meta']:
                # Save the token to use for next call
                next_token = json_response['meta']['next_token']
                # Print next_token
                print("Next Token: ", next_token)
                # If results have been successfully called and additional results are ready
                if result_count is not None and result_count > 0 and next_token is not None:
                    # Append results to csv and print progress
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, pathway)
                    count += result_count
                    total_tweets += result_count
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    time.sleep(5.5)                
            # If no next token exists
            else:
                # If results have been returned
                if result_count is not None and result_count > 0:
                    print('Next token:  None')
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, pathway)
                    count += result_count
                    total_tweets += result_count
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    time.sleep(5.5)
                # If no results returned
                else:
                    print("Start Date: ", start_list[i])
                    print("# of Tweets added from this response: 0")
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    time.sleep(1.1)
            
                # Since this is the final request, turn flag to false to move to the next time period.
                flag = False
                next_token = None
                
    # Print total number of tweets called            
    print("Total number of results: ", total_tweets)

# Function to call tweets whilst catching and treating errors
def call_tweets_try_except(keywords,mn,mx,start_list,end_list,max_results,max_count,pathway):
    # Create empty list to populate with users for whom data could not be retrieved
    failed_date_requests = []
    
    # Set the user error counter to 0. This object records how many consequtive users tweets could not be obtained for.
    # When it reaches 3, the function stops requesting tweets and sends and SMS reporting the errors.
    date_err = 0
    
    # For loop which iterates through users
    for dt in range(mn,mx):
        
        # Get the start and end time stamps to be requested in this loop iteration
        strt = []
        end  = []
        strt.append(start_list[dt])
        end.append(end_list[dt])
        
        # Extract date from end list
        try:
            int(start_list[dt][8:10])
        except:
            day = start_list[dt][0:9]
        else:
            day = start_list[dt][0:10]
        
        # Print the iteration number and username of the defined user
        print(' ')
        print('Date: ' + day)
        
        # Set the error counter to 0. This object records the number of times the call_tweets function has returned an 
        # exception for a given date. After 3 consecutive failures, the function will move onto the next date.
        err = 0
        
        # While loop which keeps attempting to call tweets until err >= 4
        while err < 4:
            # Attempt to call tweets
            try:
                call_tweets(keywords,strt,end,max_results,max_count,pathway)
            # If an exception occurs...
            except Exception as e:
                # Add 1 to the error counter
                err += 1
                # If less than 3 errors have occured for this date...
                if err < 4:
                    # Print error message
                    print('An error ocurred. Waiting 30s then trying again...')
                    print('Error message:: ' + str(e))
                    # Wait for 30s before trying again
                    time.sleep(30)
                # If 3 or more errors have occured for this date...
                else:
                    # Add 1 to the user error counter
                    date_err += 1
                    # Add the date to the list for which data could not be obtained.
                    failed_date_requests.append(day)
                    # If less then 3 consecutive dates have failed to return data...
                    if date_err < 3:
                        # Print error message
                        print('Persistent errors. Trying next date...')
                        print('Error message:: ' + str(e))
                    # If 3 or more consecutive dates have failed to return data...
                    else:
                        # Print error message reporting the users who have failed to return data
                        print('Persistent errors across dates ' + str(dt - 2) + '-' + str(dt) + '. Sending error message as SMS then halting requests...')
                        print('Error message:: ' + str(e))
                        # Send SMS reporting the error
                        message = client.messages \
                                        .create(
                                                body='TWITTER API UPDATE. Persistent errors across dates ' + str(dt - 2) + '-' + str(dt) + '. Sending error message as SMS then halting requests. Error message:: ' + str(e),
                                                from_='+441618505668',
                                                to='+447989165819'
                                                )
                        # Stop function
                        return
            # If tweets successfully called...
            else:
                # Reset user error counter to 0
                date_err = 0
                # Break out of while loop and move onto the next date
                break
                
    # Once all tweets have been successfully called, report the dates for which data could not be collected.
    print('')
    print('Finished calling tweets. Requests failed for the following dates: ' + str(failed_date_requests)[1:(len(str(failed_date_requests))-1)])
    
    # Send SMS with the same information
    message = client.messages \
                    .create(
                            body='TWITTER API UPDATE. Finished calling tweets. Requests failed for the following dates: ' + str(failed_date_requests)[1:(len(str(failed_date_requests))-1)],
                            from_='+441618505668',
                            to='+447989165819'
                            )    

# Function to call users' tweet whilst catching and treating errors
def call_users_try_except(usrs,mn,mx,start_list,end_list,max_results,max_count,pathway):
    # Create empty list to populate with users for whom data could not be retrieved
    failed_user_requests = []
    
    # Set the user error counter to 0. This object records how many consequtive users tweets could not be obtained for.
    # When it reaches 3, the function stops requesting tweets and sends and SMS reporting the errors.
    usr_err = 0
    
    # For loop which iterates through users
    for usr in range(mn,mx):
        
        # Extract the username for the user to be requested in this loop iteration
        user = usrs[usr]
        
        # Print the iteration number and username of the defined user
        print(' ')
        print('User ' + str(usr) + ': ' + user)
        
        # Set the error counter to 0. This object records the number of times the call_tweets function has returned an 
        # exception for a given user. After 3 consecutive failures, the function will move onto the next user.
        err = 0
        
        # While loop which keeps attempting to call tweets until err >= 4
        while err < 4:
            # Attempt to call tweets
            try:
                call_tweets(user,start_list,end_list,max_results,max_count,pathway,user_nm = True)
            # If an exception occurs...
            except Exception as e:
                # Add 1 to the error counter
                err += 1
                # If less than 3 errors have occured for this user...
                if err < 4:
                    # Print error message
                    print('An error ocurred. Waiting 30s then trying again...')
                    print('Error message:: ' + str(e))
                    # Wait for 30s before trying again
                    time.sleep(30)
                # If 3 or more errors have occured for this user...
                else:
                    # Add 1 to the user error counter
                    usr_err += 1
                    # Add the user's username to the list of users for whome data could not be obtained.
                    failed_user_requests.append(user)
                    # If less then 3 consecutive users have failed to return data...
                    if usr_err < 3:
                        # Print error message
                        print('Persistent errors. Trying next user...')
                        print('Error message:: ' + str(e))
                    # If 3 or more consecutive users have failed to return data...
                    else:
                        # Print error message reporting the users who have failed to return data
                        print('Persistent errors across users ' + str(usr - 2) + '-' + str(usr) + '. Sending error message as SMS then halting requests...')
                        print('Error message:: ' + str(e))
                        # Send SMS reporting the error
                        message = client.messages \
                                        .create(
                                                body='TWITTER API UPDATE. Persistent errors across users ' + str(usr - 2) + '-' + str(usr) + '. Sending error message as SMS then halting requests. Error message:: ' + str(e),
                                                from_='+441618505668',
                                                to='+447989165819'
                                                )
                        # Stop function
                        return
            # If tweets successfully called...
            else:
                # Reset user error counter to 0
                usr_err = 0
                # Break out of while loop and move onto the next user
                break
                
    # Once all tweets have been successfully called, report the users for whome data could not be collected.
    print('Finished calling tweets. Requests failed for the following users: ' + str(failed_user_requests)[1:(len(str(failed_user_requests))-1)])
    
    # Send SMS with the same information
    message = client.messages \
                    .create(
                            body='TWITTER API UPDATE. Finished calling tweets. Requests failed for the following users: ' + str(failed_user_requests)[1:(len(str(failed_user_requests))-1)],
                            from_='+441618505668',
                            to='+447989165819'
                            )

# Function to import and format csv files
def import_csv(x):
    # Read in tweets as dataframe
    x = pd.read_csv(x, low_memory = False)
    # Drop index column if added
    if 'Unnamed: 0' in x.columns:
        x = x.drop(['Unnamed: 0'], axis=1)
    # Convert date variable from str to datetime if included
    if 'created_at' in x.columns:
        x['created_at'] = pd.to_datetime(x['created_at'])
    # Return df
    return x

#### Call Tweets

In [None]:
### Inputs for the request
#keywords_list = ua_common_words + ' -is:retweet place_country:UA'
#
## Define other input parameters
#start_list  = get_datetimes(date(2021, 6, 1), date(2022, 3, 20), 'T00:00:00.00+02:00') # Start of search period (can enter list if searching multiple distinct periods)
#end_list    = get_datetimes(date(2021, 6, 1), date(2022, 3, 20), 'T23:59:59.99+02:00') # End   of search period (can enter list if searching multiple distinct periods)
#max_results = 500                          # Max tweets returned per call
#max_count   = 5000                         # Max tweets called in total per time period
#pathway     = rp2 + 'ukraine_tweets_01062021_20032022.csv' # Where to save / what to call resulting csv file
#
#call_tweets_try_except(keywords_list,0,len(start_list),start_list,end_list,max_results,max_count,pathway)

In [75]:
### Remove duplicate tweets that were created calling certain days multiple times
#
## Define pathway
#pathway     = rp2 + 'ukraine_tweets_01062021_20032022.csv' # Where to save / what to call resulting csv file
#
## Import tweets, remove duplicates, sort by datetime (earliest first) and reset index
#tweet_sample = import_csv(pathway).drop_duplicates() \
                                  #.sort_values(by='created_at').reset_index().drop(['index'], axis=1)
#
## Save adjusted results
#tweet_sample.to_csv(pathway)

#### Call Tweets for Usernames

In [5]:
# Import tweets from previous call
tweet_sample = import_csv(rp2 + 'ukraine_tweets_01062021_20032022.csv')

# Extract list of unique users
#usrs = list(set(tweet_sample[['author_id']].apply(lambda x: '%.5f' % x, axis = 1).apply(lambda x: str(int(float(x))))))
usrs = list(set(tweet_sample.loc[:,'username'].tolist()))

# Add geographical filter if needed
usrs_has_geo = list(map(lambda x: x + ' has:geo', usrs))

In [None]:
# Define other input parameters
start_list  = ['2021-06-01T00:00:00Z','2021-11-01T00:00:00Z'] # Start of search period (can enter list if searching multiple distinct periods)
end_list    = ['2021-10-31T23:59:59Z','2022-03-20T23:59:59Z'] # End   of search period (can enter list if searching multiple distinct periods)
range_min   = 0                            # What point in the username list to start
range_max   = len(usrs_has_geo)            # What point in the username list to end
max_results = 500                          # Max tweets returned per call
max_count   = 10000                        # Max tweets called in total per time period
pathway     = rp + dp + "tweets/user_name_tweets.csv" # Where to save / what to call resulting csv file

call_users_try_except(usrs,range_min,range_max,start_list,end_list,max_results,max_count,pathway)

In [27]:
#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
next_token = None

url = create_url(usrs[57], '2021-07-01T01:00:00.000Z', '2021-07-31T13:00:00.000Z', 500, user_nm = True)

json_response = connect_to_endpoint(url[0], headers, url[1], next_token)

Endpoint Response Code: 200


In [32]:
import itertools
test = []
for i in itertools.chain(range(5, 6), [20, 30]):
    test.append(usrs[i])
print('The follows users were bad: ' + str(test)[1:(len(str(test))-1)])

The follows users were bad: 'CalabriaTOP', 'duram_p', 'po_z_nyakov'


In [17]:
test = [1,2,3,4,5,6]
print('The follows users were bad: ' + str(test)[1:(len(str(test))-1)])

The follows users were bad: 1, 2, 3, 4, 5, 6


In [None]:
# Define other input parameters
start_list  = ['2021-07-01T01:00:00.000Z','2022-02-01T01:00:00.000Z'] # Start of search period (can enter list if searching multiple distinct periods)
end_list    = ['2021-07-31T13:00:00.000Z','2022-03-09T13:00:00.000Z'] # End   of search period (can enter list if searching multiple distinct periods)
max_results = 500                          # Max tweets returned per call
max_count   = 400                          # Max tweets called in total per time period
pathway     = rp + dp + "tweets/user_name_tweets.csv" # Where to save / what to call resulting csv file

for usr in range(0,len(usrnms)):
    print('User: ' + str(usr))
    user = usrs[usr]
    call_tweets(user,start_list,end_list,max_results,max_count,pathway,user_nm = True)

#### Key elements of twitter V2 API call 

json_response['data']               # Info about tweets

json_response['includes']['users']  # Info about users mentioned in tweets

json_response['includes']['places'] # Info about places attached to tweets

json_response['includes']['tweets'] # Info about tweets that interact with primary tweets (e.g. replies, quotes)

json_response['errors']             # Errors that occured when calling tweets

json_response['meta']               # Meta data like newest and oldest tweets, total tweets called and next token if paginating

### Define Russian Query

In [None]:
# https://en.openrussian.org/list/all

ru_common_words = ['и', 'в', 'не', 'на', 'что', 'тот', 'быт', 'с', 'а', 'весь', 'как', 'по', 'но', 'э́то', 'к', 'у', 'из', 
                'за', 'так', 'же', 'сказа́ть', 'э́тот', 'кото́рый', 'мочь', 'о', 'челове́к', 'ещё', 'бы', 'тако́й', 'то́лько', 
                'себя́', 'како́й', 'для', 'уже́', 'когда́', 'кто', 'вот', 'да', 'год', 'знать', 'е́сли', 'до', 'говори́ть', 'и́ли', 
                'мой', 'вре́мя', 'рука́', 'са́мый', 'нет', 'ни', 'стать', 'большо́й', 'друго́й', 'свой', 'де́ло', 'под', 'где', 
                'что́бы', 'ну', 'сам', 'есть', 'раз', 'чём', 'там', 'глаз', 'пе́рвый', 'день', 'жизнь', 'тут', 'ничто́', 
                'пото́м', 'о́чень', 'ли', 'при', 'хоте́ть', 'на́до', 'голова́', 'без', 'ви́деть', 'тепе́рь', 'идти́', 'друг', 'сейча́с', 
                'стоя́ть', 'дом', 'то́же', 'по́сле', 'мо́жно', 'сло́во', 'че́рез', 'ме́сто', 'ду́мать', 'здесь', 'спроси́ть', 'лицо́', 
                'тогда́', 'до́лжный', 'ведь', 'но́вый', 'ка́ждый']

ru_common_words = cnctwb(ru_common_words)

### Call Russian Tweets

In [None]:
# Inputs for the request
keywords_list = ru_common_words + ' -is:retweet place_country:RU'

call_count += 1

# Define other input parameters
start_list  = ['2021-06-30T13:00:00.000Z','2021-07-01T13:00:00.000Z','2021-07-02T13:00:00.000Z',
               '2021-07-03T13:00:00.000Z','2021-07-04T13:00:00.000Z','2021-07-05T13:00:00.000Z',
               '2021-07-06T13:00:00.000Z','2021-07-07T13:00:00.000Z','2021-07-08T13:00:00.000Z','2021-07-09T13:00:00.000Z'] # Start of search period (can enter list if searching multiple distinct periods)
end_list    = ['2021-07-01T13:00:00.000Z','2021-07-02T13:00:00.000Z','2021-07-03T13:00:00.000Z',
               '2021-07-04T13:00:00.000Z','2021-07-05T13:00:00.000Z','2021-07-06T13:00:00.000Z',
               '2021-07-07T13:00:00.000Z','2021-07-08T13:00:00.000Z','2021-07-09T13:00:00.000Z','2021-07-10T13:00:00.000Z'] # End   of search period (can enter list if searching multiple distinct periods)
max_results = 10                         # Max tweets returned per call
max_count   = 3                         # Max tweets called in total per time period
pathway     = rp + dp + "tweets/russian_tweets_test_" + str(call_count) + ".csv" # Where to save / what to call resulting csv file

call_tweets(keywords_list,start_list[0:1],end_list[0:1],max_results,max_count,pathway)