#### import the things

In [1]:
import tweepy
from datetime import datetime, timedelta, timezone
import pandas as pd
import json
import os
import random
import time

In [2]:
with open(r"..\path.txt") as p:
    path = p.readlines()[0]
import sys
sys.path.append(path)

import data_acquisition_tools as dat

%load_ext autoreload
%autoreload 2

#### load hashtags from previous step

In [3]:
hashtags = pd.read_pickle( r'..\data_files\common_hashtags.pkl')

In [4]:
hashtags

{'aids',
 'disclose',
 'endaids',
 'endhivepidemic',
 'endhivstigma',
 'hiv',
 'hivhometest',
 'hivstigma',
 'hivtestweek',
 'hivtreatment',
 'plhiv',
 'plwhiv',
 'prep',
 'sciencenotstigma',
 'uequalsu',
 'vaccinatethemostvulnerable',
 'vaccinatethemostvulneranle'}

In [5]:
#had to remove #disclose due to the recent DISCLOSE act from US congress
#removed the last one as I didn't realize initially it's a typo
hashtags.remove('disclose')
hashtags.remove('vaccinatethemostvulneranle')

In [6]:
#format the hashtags to add operators and hash symbol where necessary, as well as language filter
#need to remove keyword covid because otherwise it dominates the results
add_hash = ["#{}".format(h) if (h=='aids') or (h=='prep') else h for h in hashtags]
add_or = " OR ".join(add_hash)
query = "({}) -covid lang:en".format(add_or)
print(len(query))
print(query)

223
(uequalsu OR hivstigma OR hivtreatment OR hivhometest OR sciencenotstigma OR #prep OR endhivepidemic OR endaids OR plwhiv OR #aids OR hiv OR vaccinatethemostvulnerable OR hivtestweek OR plhiv OR endhivstigma) -covid lang:en


#### do a test search on the standard api (with generous rate limits) to make sure everything works

In [7]:
fname = r'..\data_files\test_file.json'

In [8]:
test = dat.api_search(query, fname, n_items=500, return_dict=True)
del test['query']

In [9]:
test[0]['created_at']

'Tue Apr 06 02:43:02 +0000 2021'

In [10]:
test[499]['created_at']

'Tue Apr 06 01:50:37 +0000 2021'

In [11]:
test[499]['full_text']

'RT @DrMikeStanton: HIV vaccine might be right around the corner.'

In [13]:
langs = []
for key in test.keys():
    langs.append(test[key]['lang'])

from collections import Counter
lang_counts = Counter(langs)
print(lang_counts.most_common(5))

[('en', 500)]


I had some trouble in previous pulls with the lang command in the query not working properly, but this seems to have worked. 

#### pull tweets

In order to account for the way topics cluster on twitter (due to intra-community discussion of the same topic over a short period of time), as well as potential time zone biases, every 2500 tweets (5 pulls), I set the timestamp to go back 2 days and randomize start time by subtracting a random number of hours (up to 23).

In [14]:
#set parameters to begin collecting tweets that are at least 1 week old
random.seed(42)
h = random.randint(0,23)
last_dt, last_str = dat.timestamp_to_toDate((datetime.now(timezone.utc) - timedelta(days=7, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))

n = 1
saved_params = (last_dt, last_str, n)
saved_params

(datetime.datetime(2021, 3, 29, 6, 43, 47, tzinfo=datetime.timezone.utc),
 '202103290643',
 1)

In [15]:
#collect 20,000 tweets
last_dt, last_str = saved_params[:2]

n = saved_params[2]
while n <= 40:
    if n % 5 == 0:
        h = random.randint(0,23)
        last_dt, last_str = dat.timestamp_to_toDate((last_dt - timedelta(days=2, hours=h)).strftime("%a %b %d %H:%M:%S %z %Y"))
        print("going back 2 days")
    try:
        print(last_dt, "n={}".format(n))
        fname = os.path.join(r'..\data_files', 'main_search{}.json'.format(n))
        search = dat.api_advanced_search(query, fname, which_api='30day', todate=last_str, n_items=500, return_dict=True)
        last_dt, last_str = dat.timestamp_to_toDate(search[499]['created_at'])    
        n += 1
    except Exception as e:
        saved_params = last_dt, last_str, n
        print(saved_params)
        print("exception = ", e)
        print("...sleeping...")
        time.sleep(60)
        continue

2021-03-29 06:43:47+00:00 n=1
2021-03-29 04:58:08+00:00 n=2
2021-03-29 02:55:13+00:00 n=3
2021-03-29 00:56:26+00:00 n=4
going back 2 days
2021-03-26 19:50:02+00:00 n=5
2021-03-26 18:27:47+00:00 n=6
2021-03-26 17:04:45+00:00 n=7
(datetime.datetime(2021, 3, 26, 17, 4, 45, tzinfo=datetime.timezone.utc), '202103261704', 7)
exception =  {'message': 'Exceeded rate limit', 'sent': '2021-04-06T02:44:55+00:00', 'transactionId': '000265bc0033697f'}
...sleeping...
2021-03-26 17:04:45+00:00 n=7
2021-03-26 15:58:32+00:00 n=8
2021-03-26 14:43:43+00:00 n=9
going back 2 days
2021-03-24 13:49:06+00:00 n=10
2021-03-24 12:52:57+00:00 n=11
2021-03-24 11:44:02+00:00 n=12
2021-03-24 10:33:54+00:00 n=13
(datetime.datetime(2021, 3, 24, 10, 33, 54, tzinfo=datetime.timezone.utc), '202103241033', 13)
exception =  {'message': 'Exceeded rate limit', 'sent': '2021-04-06T02:46:17+00:00', 'transactionId': '00333bcf00758e10'}
...sleeping...
2021-03-24 10:33:54+00:00 n=13
2021-03-24 09:00:17+00:00 n=14
going back 2 day

going back 2 days
2021-01-29 09:23:32+00:00 n=40
(datetime.datetime(2021, 1, 29, 9, 23, 32, tzinfo=datetime.timezone.utc), '202101290923', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202101290923'. 'toDate' must be on or after 202103050306", 'sent': '2021-04-06T03:07:04+00:00', 'transactionId': '004c808100212fc8'}
...sleeping...
going back 2 days
2021-01-26 16:23:32+00:00 n=40
(datetime.datetime(2021, 1, 26, 16, 23, 32, tzinfo=datetime.timezone.utc), '202101261623', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202101261623'. 'toDate' must be on or after 202103050307", 'sent': '2021-04-06T03:08:05+00:00', 'transactionId': '0023364e00b80e99'}
...sleeping...
going back 2 days
2021-01-24 10:23:32+00:00 n=40
(datetime.datetime(2021, 1, 24, 10, 23, 32, tzinfo=datetime.timezone.utc), '202101241023', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202101241023

going back 2 days
2020-12-06 02:23:32+00:00 n=40
(datetime.datetime(2020, 12, 6, 2, 23, 32, tzinfo=datetime.timezone.utc), '202012060223', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202012060223'. 'toDate' must be on or after 202103050328", 'sent': '2021-04-06T03:29:08+00:00', 'transactionId': '007ea72000c5061f'}
...sleeping...
going back 2 days
2020-12-03 14:23:32+00:00 n=40
(datetime.datetime(2020, 12, 3, 14, 23, 32, tzinfo=datetime.timezone.utc), '202012031423', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202012031423'. 'toDate' must be on or after 202103050329", 'sent': '2021-04-06T03:30:09+00:00', 'transactionId': '004088fe00da3428'}
...sleeping...
going back 2 days
2020-12-01 11:23:32+00:00 n=40
(datetime.datetime(2020, 12, 1, 11, 23, 32, tzinfo=datetime.timezone.utc), '202012011123', 40)
exception =  {'message': "There were errors processing your request: Invalid 'toDate':'202012011123

KeyboardInterrupt: 