#### import the things

In [2]:
import tweepy
from datetime import datetime, timedelta, timezone
import pandas as pd
import json
import os
import random
import time

In [3]:
with open(r"..\path.txt") as p:
    path = p.readlines()[0]
import sys
sys.path.append(path)

import data_acquisition_tools as dat

%load_ext autoreload
%autoreload 2

#### load hashtags from previous step

In [4]:
hashtags = pd.read_pickle( r'..\data_files\common_hashtags.pkl')

In [5]:
hashtags

{'aids',
 'disclose',
 'endaids',
 'endhivepidemic',
 'endhivstigma',
 'hiv',
 'hivhometest',
 'hivstigma',
 'hivtestweek',
 'hivtreatment',
 'plhiv',
 'plwhiv',
 'prep',
 'sciencenotstigma',
 'uequalsu',
 'vaccinatethemostvulnerable',
 'vaccinatethemostvulneranle'}

In [6]:
#had to remove #disclose due to the recent DISCLOSE act from US congress
#removed the last one as I didn't realize initially it's a typo
hashtags.remove('disclose')
hashtags.remove('vaccinatethemostvulneranle')

In [7]:
#format the hashtags to add operators and hash symbol where necessary, as well as language filter
#need to remove keyword covid because otherwise it dominates the results
add_hash = ["#{}".format(h) if (h=='aids') or (h=='prep') else h for h in hashtags]
add_or = " OR ".join(add_hash)
query = "({}) -covid lang:en".format(add_or)
print(len(query))
print(query)

223
(endaids OR hivstigma OR sciencenotstigma OR vaccinatethemostvulnerable OR plwhiv OR endhivepidemic OR hivtestweek OR #aids OR hiv OR uequalsu OR hivhometest OR hivtreatment OR endhivstigma OR #prep OR plhiv) -covid lang:en


In [24]:
#need a shorter query for the full archive API
query2 = '(endaids OR hivstigma OR #aids OR hiv OR uequalsu OR hivtreatment OR endhivstigma OR #prep OR plhiv) -covid lang:en'
print(len(query2))
print(query2)

115
(endaids OR hivstigma OR #aids OR hiv OR uequalsu OR hivtreatment OR endhivstigma OR #prep OR plhiv) -covid lang:en


#### do a test search on the standard api (with generous rate limits) to make sure everything works

In [7]:
fname = r'..\data_files\test_file.json'

In [8]:
test = dat.api_search(query, fname, n_items=500, return_dict=True)
del test['query']

In [9]:
test[0]['created_at']

'Tue Apr 06 02:43:02 +0000 2021'

In [10]:
test[499]['created_at']

'Tue Apr 06 01:50:37 +0000 2021'

In [11]:
test[499]['full_text']

'RT @DrMikeStanton: HIV vaccine might be right around the corner.'

In [13]:
langs = []
for key in test.keys():
    langs.append(test[key]['lang'])

from collections import Counter
lang_counts = Counter(langs)
print(lang_counts.most_common(5))

[('en', 500)]


I had some trouble in previous pulls with the lang command in the query not working properly, but this seems to have worked. 

#### pull tweets

In order to account for the way topics cluster on twitter (due to intra-community discussion of the same topic over a short period of time), as well as potential time zone biases, every 2500 tweets (5 pulls), I set the timestamp to go back 2 days and randomize start time by subtracting a random number of hours (up to 23).

In [14]:
#set parameters to begin collecting tweets that are at least 1 week old
random.seed(42)
h = random.randint(0,23)
last_dt, last_str = dat.timestamp_to_toDate((datetime.now(timezone.utc) - timedelta(days=7, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))

n = 1
saved_params = (last_dt, last_str, n)
saved_params

(datetime.datetime(2021, 3, 29, 6, 43, 47, tzinfo=datetime.timezone.utc),
 '202103290643',
 1)

In [None]:
#collect 20,000 tweets
last_dt, last_str = saved_params[:2]

n = saved_params[2]
while n <= 40:
    if n % 5 == 0:
        h = random.randint(0,23)
        last_dt, last_str = dat.timestamp_to_toDate((last_dt - timedelta(days=2, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))
        print("going back 2 days")
    try:
        print(last_dt, "n={}".format(n))
        fname = os.path.join(r'..\data_files', 'main_search{}.json'.format(n))
        search = dat.api_advanced_search(query, fname, which_api='30day', todate=last_str, n_items=500, return_dict=True)
        last_dt, last_str = dat.timestamp_to_toDate(search[499]['created_at'])    
        n += 1
    except Exception as e:
        saved_params = last_dt, last_str, n
        print(saved_params)
        print("exception = ", e)
        print("...sleeping...")
        time.sleep(60)
        continue

In [17]:
#set parameters to begin collecting tweets that are at least 1 week old
random.seed(42)
h = random.randint(0,23)
last_dt, last_str = dat.timestamp_to_toDate((datetime.now(timezone.utc) - timedelta(days=31, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))

n = 1
saved_params = (last_dt, last_str, n)
saved_params

(datetime.datetime(2021, 3, 6, 6, 11, 56, tzinfo=datetime.timezone.utc),
 '202103060611',
 1)

In [25]:
#collect 5,000 tweets using the full archive API
#start 1 month ago, go back 5 days between every pull
last_dt, last_str = saved_params[:2]

n = saved_params[2]
while n <= 10:
    h = random.randint(0,23)
    last_dt, last_str = dat.timestamp_to_toDate((last_dt - timedelta(days=5, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))
    print("going back 5 days")
    try:
        print(last_dt, "n={}".format(n))
        fname = os.path.join(r'..\data_files', 'main_search{}.json'.format(n))
        search = dat.api_advanced_search(query2, fname, which_api='full', todate=last_str, n_items=500, return_dict=True)
        last_dt, last_str = dat.timestamp_to_toDate(search[499]['created_at'])    
        n += 1
    except Exception as e:
        saved_params = last_dt, last_str, n
        print(saved_params)
        print("exception = ", e)
        print("...sleeping...")
        time.sleep(60)
        continue

going back 5 days
2021-02-24 03:11:56+00:00 n=1
(datetime.datetime(2021, 2, 24, 3, 11, 56, tzinfo=datetime.timezone.utc), '202102240311', 1)
exception =  Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/tweets/search/fullarchive/fullarchive2.json?toDate=202102240311&query=%28endaids+OR+hivstigma+OR+%23aids+OR+hiv+OR+uequalsu+OR+hivtreatment+OR+endhivstigma+OR+%23prep+OR+plhiv%29+-covid+lang%3Aen (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002161EF49400>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
...sleeping...
going back 5 days
2021-02-18 04:11:56+00:00 n=1
going back 5 days
2021-02-12 18:20:19+00:00 n=2
going back 5 days
2021-02-07 10:13:51+00:00 n=3
going back 5 days
2021-02-02 01:19:33