# Collecting Tweets Using Academic API V2

Based on code from [here](https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a). A full list of the Academic Twitter API V2 query parameters can be found [here](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) and [here](https://developer.twitter.com/en/docs/twitter-api/enterprise/rules-and-filtering/operators-by-product).

### 1. Import libraries and define pathway variables

In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
# To add wait time between requests
import time
# For matching string expressions
import re

# Define rootpath
rp = 'C:\\Users\\sgmmahon\\Documents\\GitHub\\iom_project\\'
mp = 'methods\\accessing_tweets\\'
dp = 'data\\tweet_data\\' 

### 2. Define functions

In [2]:
# Add Bearer Token as environmental variable
#os.environ['TOKEN'] = 'BEARER-TOKEN-HERE'
os.environ['TOKEN'] = "AAAAAAAAAAAAAAAAAAAAABLt8wAAAAAAbE8U4tJJtQffIvLTtuHWVzm0nqI%3DIinQPSHHGhYjXwG71L4AHkUnUmlP9X8npfKQ0S6Nk07nBSF8AY"

# Define function which retrieves token from the environment
def auth():
    return os.getenv('TOKEN')

# Define function that uses bearer token to create headers used to access the API
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

# Function to create url to make GET request
def create_url(keyword, start_date, end_date, max_results = 10):
    
    search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {# Required parameters
                    'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    # Additional parameters which can be requested optionally
                    'expansions': 'author_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id,geo.place_id,entities.mentions.username',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities,possibly_sensitive',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified,location,url',
                    'place.fields': 'full_name,id,country,country_code,contained_within,geo,name,place_type',
                    # ID to call next page of tweets
                    'next_token': {}}
    return (search_url, query_params)

# Function to make GET request
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

### 2. Build search terms

In [3]:
# Create functions which concatenates vectors
def cnct (x): return(" OR ".join(x))
def cnctwb (x): return("(" + " OR ".join(x) + ")")

In [4]:
# UK Search Terms# UK Search Terms

# Migrant terms
uk_neutral_migrant_terms  = ["immigrant", "immigration", "migrant", "migration", "\"asylum seeker\"", "refugee", "\"undocumented worker\"", "\"guest worker\"", 
                             "\"EU worker\"", "\"non-UK workers\"", "\"foreign worker\"", "(human smuggling)", "(human trafficking)"]
uk_negative_migrant_terms = ["illegals", "foreigner", "\"illegal alien\"", "\"illegal worker\""]
 
# Racial terms
uk_negative_racial_terms  = ["islamophob", "sinophob", "\"china flu\"", "\"kung flu\"", "\"china virus\"", "\"chinese virus\"", "shangainese"]

# Twitter accounts
uk_pro_migrant_account_1  = ["@UNmigration", "@IOM_UN", "@IOMatUN", "@IOMatEU", "@IOM_UK", "@IOMResearch", "@IOM_GMDAC", "@hrw", "@Right_to_Remain",
                             "@CommonsHomeAffs", "@fcukba", "@Mark_George_QC", "@MigrantVoiceUK", "@MigrantChildren", "@MigrantHelp", "@thevoiceofdws"]
uk_pro_migrant_account_2  = ["@WORCrights", "@UbuntuGlasgow", "@MigrantsUnionUK", "@migrants_rights", "@MigrantsMRC", "@Consenant_UK", "@RomaSupport",
                             "@MigrantsLawProj", "@MigRightsScot", "@IRMOLondon", "@HighlySkilledUK", "@WeBelong19", "@Project17UK"]
uk_neutral_account        = ["@ukhomeoffice", "@pritipatel", "@UKHomeSecretary", "@EUHomeAffairs", "@MigrMatters", "@MigObs"]
uk_anti_migrant_account   = ["@Nigel_Farage", "@MigrationWatch"]

# Hashtags
uk_positive_hashtags      = ["#RefugeesWelcome", "#MigrantsWelcome", "#LeaveNoOneBehind", "#FreedomForImmigrants", "#illegalmigantsUK", "#LondonIsOpen",
                             "#EndHostileEnvironment", "#FamiliesBelongTogether"]
uk_neutral_hashtags       = ["#Pritiuseless", "#migrationEU", "#immigration", "#migration", "#immigrant", "#migrant", "#immigrate", "#migrate", "#refugees",
                             "#NigelFarage", "#ImmigrationReform"]
uk_negative_hashtags      = ["#illegals", "#foreigner", "#foreigners", "#illegalalien", "#illegalaliens", "#illegalworker", "#illegalworkers", "#KeepThemOut",
                             "#OurCountry", "#SendThemBack", "#migrantsnotwelcome", "#refugeesnotwelcome", "#illegals", "#ChinaVirus", "#chinaflu", "#kungflu",
                             "#chinesevirus", "#TheyHaveToGoBack", "#DeportThemAll"]
uk_event_hashtags         = ["#Moria", "#CampFire", "#closethecamps"]

# Define final search queries
uk_terms    = cnctwb([cnct(uk_neutral_migrant_terms), cnct(uk_negative_migrant_terms), cnct(uk_negative_racial_terms)])
uk_accounts = cnctwb([cnct(uk_pro_migrant_account_1), cnct(uk_pro_migrant_account_2), cnct(uk_neutral_account), cnct(uk_anti_migrant_account)])
uk_hashtags = cnctwb([cnct(uk_positive_hashtags), cnct(uk_neutral_hashtags), cnct(uk_negative_hashtags), cnct(uk_event_hashtags)])

# Append all search term into single list
uk_search_terms = uk_neutral_migrant_terms + uk_negative_migrant_terms + uk_negative_racial_terms + uk_pro_migrant_account_1 + \
                  uk_pro_migrant_account_2 + uk_neutral_account + uk_positive_hashtags + uk_neutral_hashtags + uk_negative_hashtags + uk_event_hashtags


# USA Search Terms

# Migrant terms
usa_neutral_migrant_terms  = ["immigrant", "immigration", "migrant", "migration", "\"asylum seeker\"", "refugee", "\"undocumented worker\"", "\"guest worker\"", 
                              "\"foreign worker\"", "(human smuggling)", "(human trafficking)"]
usa_negative_migrant_terms = ["illegals", "foreigner", "\"illegal alien\"", "\"illegal worker\""]
 
# Racial terms
usa_negative_racial_terms  = ["islamophob", "sinophob", "\"china flu\"", "\"kung flu\"", "\"china virus\"", "\"chinese virus\"", "shangainese"]

# Twitter accounts
usa_pro_migrant_account_1  = ["@UNmigration", "@IOM_UN", "@IOMatUN", "@IOMatEU", "@IOM_UK", "@IOMResearch", "@IOM_GMDAC", "@hrw", "@NIJC", "@CIYJA", 
                              "@ImmAdvocates", "@NWIRP", "@RAICESTEXAS", "@ImmJusticeNOW", "@icirr", "@IAmAnImmigrant", "@Am4ImmJustice", "@NILCJusticeFund"]
usa_pro_migrant_account_2  = ["@CIRCimmigrant", "@FLImmigrant", "@ImmFamTogether", "@ImmJustice", "@NICE4Workers", "@CA4ImmiJustice", "@immigrantarc", "@Join_SIM",
                              "@SDIRC", "@RMIAN_org", "@NJAIJ", "@NVImmigrants", "@VA_Immigrants"]
usa_neutral_account        = ["@ICEgov", "@PhillyOIA", "@iandraffairs", "@LAC4Immigrants", "@CoreCivic"]
usa_anti_migrant_account   = ["@EuropidWhites"]

# Hashtags
usa_positive_hashtags      = ["#RefugeesWelcome", "#MigrantsWelcome", "#LeaveNoOneBehind", "#FreedomForImmigrants", "#illegalmigantsUSA", "#KillTheImmigrationBill", 
                              "#ImmigrantsMakeAmericaGreat", "#NoWall", "#NoWallEver", "#NoBan", "#FamiliesBelongTogether", "#stopICEcold", "#EndRemainInMexico"]
usa_neutral_hashtags       = ["#ICE", "#immigration", "#migration", "#immigrant", "#migrant", "#immigrate", "#migrate", "#refugees", "#ImmigrationReform"]
usa_negative_hashtags      = ["#illegals", "#foreigner", "#foreigners", "#illegalalien", "#illegalaliens", "#illegalworker", "#illegalworkers", "#KeepThemOut",
                              "#OurCountry", "#SendThemBack", "#migrantsnotwelcome", "#refugeesnotwelcome", "#illegals", "#ChinaVirus", "#chinaflu", "#kungflu",
                              "#chinesevirus", "#TheyHaveToGoBack", "#DeportThemAll"]
usa_event_hashtags         = ["#Moria", "#closethecamps", "#divestfromdetention"]

# Define final search queries
usa_terms    = cnctwb([cnct(usa_neutral_migrant_terms), cnct(usa_negative_migrant_terms), cnct(usa_negative_racial_terms)])
usa_accounts = cnctwb([cnct(usa_pro_migrant_account_1), cnct(usa_pro_migrant_account_2), cnct(usa_neutral_account), cnct(usa_anti_migrant_account)])
usa_hashtags = cnctwb([cnct(usa_positive_hashtags), cnct(usa_neutral_hashtags), cnct(usa_negative_hashtags), cnct(usa_event_hashtags)])

# Append all search term into single list
usa_search_terms = usa_neutral_migrant_terms + usa_negative_migrant_terms + usa_negative_racial_terms + usa_pro_migrant_account_1 + \
                   usa_pro_migrant_account_2 + usa_neutral_account + usa_positive_hashtags + usa_neutral_hashtags + usa_negative_hashtags + usa_event_hashtags



# Country Search Terms
uk_add_terms     = ' lang:en place_country:GB'
usa_add_terms    = ' lang:en place_country:US'

### 3. Make single request

In [5]:
# Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = uk_terms + uk_add_terms
start_time = "2021-03-01T00:00:00.000Z"
end_time = "2021-03-31T00:00:00.000Z"
max_results = 15

In [None]:
# Create url for request
url = create_url(keyword, start_time,end_time, max_results)

# Make request
json_response = connect_to_endpoint(url[0], headers, url[1])

# Show results for first 6 tweets
#print(json.dumps(json_response['data'][0:5], indent=4, sort_keys=True))

In [7]:
# Function to append results to a csv
def append_to_csv(json_response, fileName):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Loop through each tweet
    for tweet in json_response['data']:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Author ID
        author_id = tweet['author_id']

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = " "

        # 4. Tweet ID
        tweet_id = tweet['id']

        # 5. Language
        lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        # 7. Source
        source = tweet['source']

        # 8. Tweet text
        text = tweet['text']

        # 9. Conversation_id
        conversation_id = tweet['conversation_id']

        # 10. Reply settings
        reply_settings = tweet['reply_settings']

        # 11. Referenced tweets
        if ('referenced_tweets' in tweet):   
            referenced_tweets_type = tweet['referenced_tweets'][0]['type']
            referenced_tweets_id   = tweet['referenced_tweets'][0]['id']
        else:
            referenced_tweets_type = " "
            referenced_tweets_id   = " "

        # 12. In reply to user id
        if ('in_reply_to_user_id' in tweet):  
            in_reply_to_user_id = tweet['in_reply_to_user_id']
        else:
            in_reply_to_user_id = " "
            
        # 13. Entities
        if ('annotations' in tweet['entities']):
            annotations = tweet['entities']['annotations']
        else:
            annotations = " "
        
        # 14. Mentions
        if ('mentions' in tweet['entities']):
            mentions = tweet['entities']['mentions']
        else:
            mentions = " "
        
        # 15. URLs
        if ('urls' in tweet['entities']):
            linked_url = tweet['entities']['urls'][0]['expanded_url']
        else:
            linked_url = " "
            
        # 16. Possibly sensitive
        if('possibly_sensitive' in tweet):
            possibly_sensitive = tweet['possibly_sensitive']
        else:
            possibly_sensitive = " "
        
        # Assemble all data in a list
        res = [author_id, created_at, geo, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text,
               conversation_id, reply_settings, referenced_tweets_type, referenced_tweets_id, in_reply_to_user_id,
               annotations,mentions,linked_url,possibly_sensitive]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 

In [8]:
# Create file
csvFile = open(rp + dp + "test_tweets.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet',
                    'coversation_id','reply_settings','referenced_tweets_type','referenced_tweets_id','in_reply_to_user_id',
                    'annotations','mentions','linked_url','possibly_sensitive'])
csvFile.close()

# Append rows to csv
append_to_csv(json_response, rp + dp + "test_tweets.csv")

# of Tweets added from this response:  15


### 4. Loop multiple requests

#### 4a. Define functions

In [5]:
# Function to obtain variable names
def get_var_name(variable):
 for name in globals():
     if eval(name) == variable:
        return(name)


# Function to append results to a csv
def append_to_csv(json_response, fileName, searchterm):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Loop through each tweet
    for tweet in json_response['data']:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Author ID
        author_id = tweet['author_id']

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = " "

        # 4. Tweet ID
        tweet_id = tweet['id']

        # 5. Language
        lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count   = tweet['public_metrics']['reply_count']
        like_count    = tweet['public_metrics']['like_count']
        quote_count   = tweet['public_metrics']['quote_count']

        # 7. Source
        source = tweet['source']

        # 8. Tweet text
        text = tweet['text']

        # 9. Conversation_id
        conversation_id = tweet['conversation_id']

        # 10. Reply settings
        reply_settings = tweet['reply_settings']

        # 11. Referenced tweets
        if ('referenced_tweets' in tweet):   
            referenced_tweets_type = tweet['referenced_tweets'][0]['type']
            referenced_tweets_id   = tweet['referenced_tweets'][0]['id']
        else:
            referenced_tweets_type = " "
            referenced_tweets_id   = " "

        # 12. In reply to user id
        if ('in_reply_to_user_id' in tweet):  
            in_reply_to_user_id = tweet['in_reply_to_user_id']
        else:
            in_reply_to_user_id = " "
            
        # 13. Entities
        if ('entities' in tweet and 'annotations' in tweet['entities']):
            annotations = tweet['entities']['annotations']
        else:
            annotations = " "
        
        # 14. Mentions
        if ('entities' in tweet and 'mentions' in tweet['entities']):
            mentions = tweet['entities']['mentions']
        else:
            mentions = " "
        
        # 15. URLs
        if ('entities' in tweet and 'urls' in tweet['entities']):
            linked_url = tweet['entities']['urls'][0]['expanded_url']
        else:
            linked_url = " "
            
        # 16. Possibly sensitive
        if('possibly_sensitive' in tweet):
            possibly_sensitive = tweet['possibly_sensitive']
        else:
            possibly_sensitive = " "
            
        # 17. Search type
        search_type = get_var_name(searchterm)
        
        # Assemble all data in a list
        res = [author_id, created_at, geo, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text,
               conversation_id, reply_settings, referenced_tweets_type, referenced_tweets_id, in_reply_to_user_id,
               annotations,mentions,linked_url,possibly_sensitive,search_type]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 


# Function that uses looping to make multiple calls, to ensure the desired number of tweets during each period have been collected 
def call_tweets(keywords,start_list,end_list,max_results,max_count,pathway):
    
    #Inputs for tweets
    bearer_token = auth()
    headers = create_headers(bearer_token)

    #Total number of tweets we collected from the loop
    total_tweets = 0

    # Create file
    csvFile = open(pathway, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    # Create headers for the data you want to save, in this example, we only want save these columns in our dataset
    csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet',
                        'coversation_id','reply_settings','referenced_tweets_type','referenced_tweets_id','in_reply_to_user_id',
                        'annotations','mentions','linked_url','possibly_sensitive','search_type'])
    csvFile.close()

    # For loop which cycles through different search terms
    for keyword in keywords:
    
        # Define search term type list
        search_term_type = ['terms','accounts',]
    
        # For loop which calls tweets until the max number per period (max_results) has been reached
        for i in range(0,len(start_list)):

            # Inputs
            count = 0 # Counting tweets per time period
            flag = True
            next_token = None
    
            # Check if flag is true
            while flag:
                # Check if max_count reached
                if count >= max_count:
                    break
                print("-------------------")
                print("Token: ", next_token)
                url = create_url(keyword, start_list[i],end_list[i], max_results)
                json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
                result_count = json_response['meta']['result_count']

                if 'next_token' in json_response['meta']:
                    # Save the token to use for next call
                    next_token = json_response['meta']['next_token']
                    print("Next Token: ", next_token)
                    if result_count is not None and result_count > 0 and next_token is not None:
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, pathway, keyword)
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)                
                # If no next token exists
                else:
                    if result_count is not None and result_count > 0:
                        print("-------------------")
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, pathway, keyword)
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)
            
                    #Since this is the final request, turn flag to false to move to the next time period.
                    flag = False
                    next_token = None
                time.sleep(5)
        print("Total number of results: ", total_tweets)

#### 4a. Call UK tweets

In [None]:
# Concatenate search terms into a list of single objects
#uk_terms_add    = uk_terms    + uk_add_terms
#uk_accounts_add = uk_accounts + uk_add_terms
#uk_hashtags_add = uk_hashtags + uk_add_terms
#keywords_list   = [uk_terms_add, uk_accounts_add, uk_hashtags_add]
#
## Define other input parameters
#start_list  = ['2020-03-01T00:00:00.000Z'] # Start of search period (can enter list if searching multiple distinct periods)
#end_list    = ['2021-03-08T00:00:00.000Z'] # End   of search period (can enter list if searching multiple distinct periods)
#max_results = 500                          # Max tweets returned per call
#max_count   = 500000                       # Max tweets called in total per time period
#pathway     = rp + dp + "tweets/uk_all_tweets_01032020_08032020.csv" # Where to save / what to call resulting csv file
#
#call_tweets(keywords_list,start_list,end_list,max_results,max_count,pathway)

#### 4b. Call USA tweets

In [None]:
# Concatenate search terms into a list of single objects
#usa_terms_add    = usa_terms    + usa_add_terms
#usa_accounts_add = usa_accounts + usa_add_terms
#usa_hashtags_add = usa_hashtags + usa_add_terms
#keywords_list    = [usa_terms_add, usa_accounts_add, usa_hashtags_add]
#
## Define other input parameters
#start_list  = ['2020-03-01T00:00:00.000Z'] # Start of search period (can enter list if searching multiple distinct periods)
#end_list    = ['2021-03-08T00:00:00.000Z'] # End   of search period (can enter list if searching multiple distinct periods)
#max_results = 500                          # Max tweets returned per call
#max_count   = 500000                       # Max tweets called in total per time period
#pathway     = rp + dp + "tweets/usa_all_tweets_01032020_08032020.csv" # Where to save / what to call resulting csv file
#
#call_tweets(keywords_list,start_list,end_list,max_results,max_count,pathway)

## 5. Clean text for vader analysis

#### 5a. Define functions

In [37]:
# Function which converts reduced tweets to a dataframe and preps data for vader lexicon
def to_df_vader(df):
    
    # Create column for vader input
    df['VADER_text'] = df['tweet'].apply(lambda x: re.sub('@(.*?) ', '@anonymous ', x)) # Change accounts for '@anonymous'
    # Remove urls and line breaks
    df['VADER_text'] = df['VADER_text'].apply(lambda x: re.sub(r"http\S+", "http://url_removed", str(x)) )\
                                       .apply(lambda x: re.sub('http(.*?) ', "http://url_removed", str(x)) )\
                                       .apply(lambda x: re.sub('\n', ' ', str(x)) )
    
    # Subset to remove tweets dicussing bird or data migration
    nature_and_data = ['bird', 'ornithology', '#wildlife', '#deer', '#buck', '#antlers', '#nature', 
                       'github', 'Azure', 'Microsoft', 'Ubuntu', 'Python', 'SQL', 'cloud', 'bigdata', 'big data', 'virtual machine', 'DevOps']
    # Define phrases prevent tweets from being falsely labeled as noise
    exclude_words  = ['cloud cuckoo land', 'two birds one stone', 'two birds with one stone', '@Nigel_Farage', 'NigelFarage', 'Nigel Farage', 'refugee']
    # Remove tweets which contain 'nature_and_data', but not 'exclude_words'
    df = df[~df['tweet'].replace(exclude_words,'', regex=True)
                       .str.contains('|'.join(nature_and_data), case = False)].reset_index(drop = True)
    
    # Return final dataframe
    return(df)

#### 5b. Process UK tweets

In [34]:
# Import tweets
uk_all_tweets_01032020_08032020 = pd.read_csv(rp + dp + "tweets/uk_all_tweets_01032020_08032020.csv")

# Create vader text
uk_all_tweets_01032020_08032020 = to_df_vader(uk_all_tweets_01032020_08032020)

# Save output
uk_all_tweets_01032020_08032020.to_csv(rp + dp + "tweets/uk_all_tweets_01032020_08032020_vader.csv")

#### 5c. Process US tweets

In [38]:
# Import tweets
usa_all_tweets_01032020_08032020 = pd.read_csv(rp + dp + "tweets/usa_all_tweets_01032020_08032020.csv")

# Create vader text
usa_all_tweets_01032020_08032020 = to_df_vader(usa_all_tweets_01032020_08032020)

# Save output
usa_all_tweets_01032020_08032020.to_csv(rp + dp + "tweets/usa_all_tweets_01032020_08032020_vader.csv")

In [10]:
uk_all_tweets_01032020_08032020 = pd.read_csv(rp + dp + "tweets/uk_all_tweets_01032020_08032020_vader.csv")

In [121]:
def get_ann(ann):
    if ann != ' ':
        try:
            ann = json.loads(ann.replace("'", "\""))[0]
            if ann['type'] == 'Person':
                ann = ann['normalized_text']
            else:
                ann = ' '
        except:
            ann = 'Failed'
    return(ann)



uk_all_tweets_01032020_08032020['people'] = uk_all_tweets_01032020_08032020['annotations'].map(lambda x: get_ann(x) )

In [128]:
uk_all_tweets_01032020_08032020['people'].value_counts()

              94048
Nigel          1880
Trump           922
Farage          915
Boris           674
              ...  
Jay ho            1
Al Murray         1
Burton            1
Nationlist        1
shilton           1
Name: people, Length: 4262, dtype: int64

In [133]:
uk_all_tweets_01032020_08032020.loc[uk_all_tweets_01032020_08032020['people'] == 'British Pakistanis']

Unnamed: 0.1,Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,...,reply_settings,referenced_tweets_type,referenced_tweets_id,in_reply_to_user_id,annotations,mentions,linked_url,possibly_sensitive,VADER_text,people
2,2,1230637606129602560,2021-03-07 21:33:01+00:00,778909dfad43f3d6,1368676084150067206,en,0,0,0,0,...,everyone,replied_to,1368545775869964288,198305643,"[{'start': 63, 'end': 80, 'probability': 0.3357, 'type': 'Person', 'normalized_text': 'British Pakistanis'}, {'start': 117, 'end': 130, 'probability': 0.4778, 'type': 'Organization', 'normalized_text': 'Labour Council'}, {'start': 166, 'end': 168, 'probability': 0.5731, 'type': 'Organization', 'normalized_text': 'BBC'}]","[{'start': 0, 'end': 11, 'username': 'OletaBanks', 'id': '198305643'}, {'start': 12, 'end': 28, 'username': 'markber75420991', 'id': '1080551719317262339'}, {'start': 29, 'end': 41, 'username': 'BBCPanorama', 'id': '15944663'}, {'start': 42, 'end': 49, 'username': 'BBCOne', 'id': '871686942'}]",,False,"@anonymous @anonymous @anonymous @anonymous Yes we know, British Pakistanis raped the young white girls in all Labour Council run areas, I am an immigrant, but BBC 💩stirring again, especially after the debacle we had with the hoors BLM",British Pakistanis


In [129]:
with pd.option_context("display.max_rows", 5000):
    print(uk_all_tweets_01032020_08032020['people'].value_counts())

                                                         94048
Nigel                                                     1880
Trump                                                      922
Farage                                                     915
Boris                                                      674
Priti Patel                                                425
Priti                                                      379
Nigel Farage                                               346
Johnson                                                    257
Patel                                                      243
Tory                                                       208
trump                                                      152
Cummings                                                   148
Biden                                                      142
Boris Johnson                                              134
Covid                                                  

In [119]:
string = uk_all_tweets_01032020_08032020.annotations[490].replace("\'", "\"").replace("'", "\"")
re.sub(r"(?<=[^\W\d_])'(?=[^\W\d_])", "test", string)

'[{"start": 101, "end": 113, "probability": 0.9988, "type": "Person", "normalized_text": "James O"Brien"}]'

In [117]:
import re

s = "'didn't'"
print(s.sub("'","\""))

AttributeError: 'str' object has no attribute 'sub'

In [120]:
for i in range(2000):
    ann = uk_all_tweets_01032020_08032020.annotations[i]
    if ann != ' ':
        try:
            ann = json.loads(ann.replace("\'", "\"").replace("'", "\""))[0]
            if ann['type'] == 'Person':
                ann = ann['normalized_text']
                print(str(i),ann)
            else:
                print(str(i), False)
        except:
            print(str(i),'Failed')
    else:
        print(str(i), False)

0 False
1 False
2 British Pakistanis
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 Tony Blair
11 False
12 Giles
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
30 False
31 False
32 False
33 False
34 False
35 False
36 False
37 False
38 False
39 False
40 False
41 False
42 False
43 False
44 False
45 False
46 False
47 Farage
48 Ron Holder
49 False
50 False
51 False
52 Carpetbag
53 False
54 False
55 False
56 False
57 False
58 False
59 Trump
60 Ron Holder
61 False
62 False
63 False
64 False
65 False
66 False
67 False
68 False
69 Priti Patel
70 False
71 False
72 False
73 False
74 False
75 False
76 False
77 False
78 False
79 False
80 False
81 Ruth Davidson
82 False
83 False
84 False
85 False
86 False
87 False
88 False
89 False
90 False
91 False
92 False
93 False
94 False
95 False
96 False
97 False
98 False
99 False
100 Jonathan Gullis
101 False
102 False
103 False
104 False
105

In [72]:
for i in range(2000):
        test = uk_all_tweets_01032020_08032020.annotations[i]
        if test != ' ':
            if json.loads(test.replace("'", "\""))[0]['type'] == 'Person':
                print(str(i),json.loads(test.replace("'", "\""))[0]['normalized_text'])            
        else:
            print(str(i), False)

1 False
2 British Pakistanis
3 False
6 False
9 False
10 Tony Blair
12 Giles
13 False
14 False
19 False
20 False
21 False
23 False
26 False
31 False
36 False
37 False
41 False
42 False
43 False
45 False
47 Farage
48 Ron Holder
49 False
50 False
51 False
52 Carpetbag
54 False
55 False
56 False
57 False
58 False
59 Trump
60 Ron Holder
61 False
64 False
65 False
66 False
68 False
69 Priti Patel
70 False
71 False
73 False
74 False
76 False
77 False
81 Ruth Davidson
82 False
86 False
87 False
89 False
90 False
93 False
95 False
97 False
98 False
99 False
100 Jonathan Gullis
102 False
103 False
106 Yvette Cooper
108 Biden
112 False
116 False
118 False
120 False
122 Johnson
124 False
125 Ady
126 False
127 False
128 selena
129 False
130 False
134 False
135 False
136 False
138 Boris Johnson
139 False
140 False
142 Blair
143 Tony Blair
144 False
147 False
149 False
150 Swanee Hunt
151 False
152 False
153 False
155 False
157 False
160 False
162 False
165 False
166 Dhes
168 False
170 False
171 Fals

JSONDecodeError: Expecting ',' delimiter: line 1 column 98 (char 97)