In [None]:
import json
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import errno
import os
from collections import defaultdict

In [None]:
def get_lookup_dict(type='train', target='tweet'):
    """Build tweet<->author lookup tables from a folder of per-record JSON files.

    Every regular file in ``./{type}_{target}`` is parsed as JSON and
    contributes one (tweet id, author id) pair. Both mappings are also written
    to ``./tweet2author_{type}_{target}.json`` and
    ``./author2tweet_{type}_{target}.json`` in the working directory.

    Args:
        type: First half of the folder name (e.g. 'train', 'dev'). The name
            shadows the builtin but is kept so keyword callers keep working.
        target: Second half of the folder name; also selects the fields read:
            'tweet' -> tweet id from 'id', author id from 'author_id';
            'user'  -> tweet id from 'pinned_tweet_id', author id from 'id'.
            Any other value yields '' for both ids.

    Returns:
        ``(tweet2author, author2tweet)``: ``tweet2author`` maps a tweet id to
        the first author id seen for it, ``author2tweet`` maps an author id to
        the list of its tweet ids. A missing field is recorded as ''.
    """
    path = f'./{type}_{target}'
    field_names = {
        'tweet': ('id', 'author_id'),
        'user': ('pinned_tweet_id', 'id'),
    }
    tweet_key, author_key = field_names.get(target, (None, None))

    # Default factories match the value types actually stored in each table.
    tweet2author = defaultdict(str)
    author2tweet = defaultdict(list)

    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open()
        # would fail on them.
        if not os.path.isfile(file_path):
            continue
        # JSON is read as UTF-8 rather than the platform default encoding.
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)

        tweet_id = data[tweet_key] if tweet_key is not None and tweet_key in data else ''
        author_id = data[author_key] if author_key is not None and author_key in data else ''

        author2tweet[author_id].append(tweet_id)
        # Keep the first author seen for a tweet id.
        tweet2author.setdefault(tweet_id, author_id)

    with open(f'./tweet2author_{type}_{target}.json', 'w') as f:
        json.dump(tweet2author, f)
    with open(f'./author2tweet_{type}_{target}.json', 'w') as f:
        json.dump(author2tweet, f)
    return tweet2author, author2tweet

In [None]:
def get_id_chunks(**kwargs):
    """Collect ids from a mapping or a text file and split them into chunks.

    Keyword Args:
        size: Maximum number of ids per chunk (default 100). Must be >= 1.
        source: Where the ids come from:
            * a dict (or any dict subclass): ids are taken from the values;
              a list value contributes its non-empty items, a non-empty
              string value contributes itself, other value types are ignored;
            * a str: path of a text file with comma-separated ids, one or
              more per line;
            * anything else (including the default ``None``): no ids.

    Returns:
        A list of lists, each holding at most ``size`` ids, in source order.
        Empty-string placeholders are never emitted.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    size = kwargs.get('size', 100)
    source = kwargs.get('source', None)
    if size < 1:
        raise ValueError(f'size must be a positive integer, got {size!r}')

    ids = []
    if isinstance(source, dict):
        for value in source.values():
            if isinstance(value, list):
                # Drop only the '' placeholders, not the whole list.
                ids.extend(item for item in value if item != '')
            elif isinstance(value, str) and value != '':
                ids.append(value)
    elif isinstance(source, str):
        with open(source) as f:
            for line in f:
                # Blank lines and trailing commas would otherwise yield '' ids.
                tokens = (token.strip() for token in line.split(','))
                ids.extend(token for token in tokens if token != '')

    return [ids[start:start + size] for start in range(0, len(ids), size)]

In [None]:
def make_requests(url, params, headers, folder='train_tweet', proxies=None):
    """GET ``url`` once (twice when rate limited) and persist the JSON answer.

    The request goes through a session whose adapter retries failed
    connections up to 3 times with a 0.5 backoff factor. When the answer
    signals rate limiting (HTTP status 429, or a JSON body whose 'status'
    field is 429) the function sleeps 15 minutes and repeats the request once.

    Args:
        url: Endpoint to query.
        params: Query-string parameters.
        headers: HTTP headers sent with the request.
        folder: Passed through to ``handle_response``.
        proxies: Optional proxies mapping for ``requests``; ``None`` means no
            explicit proxies.

    Returns:
        Whatever ``handle_response`` returns for the decoded JSON body.

    Raises:
        ValueError: (or the JSON decode error subclass raised by ``requests``)
            if the final response body is not valid JSON.
    """
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        def fetch():
            return session.get(
                url,
                headers=headers,
                params=params,
                proxies=proxies)

        response = fetch()
        # Check the HTTP status first so a 429 with a non-JSON body still
        # triggers the back-off instead of a decode error.
        rate_limited = response.status_code == 429
        if not rate_limited:
            data = response.json()
            rate_limited = isinstance(data, dict) and data.get('status') == 429

        if rate_limited:
            current_time = time.strftime('%H:%M:%S',time.localtime(time.time()))
            print(f'{current_time}: rate limit exceeded, the program will sleep for 15 minutes')
            time.sleep(60*15)
            data = fetch().json()

    return handle_response(data, folder)

def handle_response(response, folder='train_tweet'):
    """Persist the records of a decoded API response, or log the response.

    Args:
        response: Decoded JSON body. When it contains a 'data' entry, that
            entry is iterated and every item must provide an 'id'.
        folder: Existing directory (relative to the working directory) that
            receives one ``{id}.json`` file per item; also used to name the
            log file ``./log_{folder}.txt``.

    Returns:
        The number of items written, or 0 when the response has no 'data'
        entry (in which case the whole response is appended to the log file).
    """
    if 'data' in response:
        records = response['data']
        for item in records:
            item_id = item['id']
            with open(f'./{folder}/{item_id}.json', 'w') as f:
                json.dump(item, f)
        return len(records)

    with open(f'./log_{folder}.txt', 'a') as f:
        f.write(json.dumps(response, indent=4))
        # Terminate each entry so consecutive dumps do not run together.
        f.write('\n')
    return 0
    
    
def data_harvester(chunks, folder, params, headers, target='tweets'):
    """Request every chunk of ids from the Twitter v2 API and store the results.

    Args:
        chunks: Iterable of id lists; each list becomes one request whose
            'ids' parameter is the comma-joined list. Empty lists are skipped.
        folder: Output directory, created if missing and passed on to
            ``make_requests``.
        params: Base query parameters. The mapping is copied per request and
            is never modified.
        headers: HTTP headers passed on to ``make_requests``.
        target: Endpoint name appended to ``https://api.twitter.com/2/``.

    A failing request is reported on stdout and the harvest continues with the
    next chunk. A progress line is printed each time the number of ids seen
    reaches a further multiple of 5000.
    """
    report_every = 5000
    next_report = report_every
    total = 0
    retrieved = 0

    os.makedirs(f'./{folder}', exist_ok=True)

    for c in chunks:
        if not c:
            # Nothing to request; also keeps `total` non-zero for the rate.
            continue
        total += len(c)

        # Work on a copy so the caller's (often shared) params stay untouched.
        request_params = dict(params)
        request_params['ids'] = ','.join(c).replace('\n','')
        try:
            retrieved += make_requests(
                f'https://api.twitter.com/2/{target}/',
                headers=headers,
                params=request_params,
                folder=folder)
        except Exception as e:
            # Best effort: report the failure and move on to the next chunk.
            print(e)

        if total >= next_report:
            print(f'Seen {total} {target}, retrived {retrieved}, retrive rate {round(retrieved/total*100,2)}%')
            next_report = (total // report_every + 1) * report_every

In [None]:
# Query parameters for the tweet lookup endpoint. 'ids' is a placeholder that
# is filled in per request.
params = {
    'ids': [],
    'expansions': 'author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',
    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
    'poll.fields': 'duration_minutes,end_datetime,id,options,voting_status',
    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld',
    'user.fields':'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'}

# Query parameters for the user lookup endpoint.
params_usr = {
    'ids': [],
    'expansions': 'pinned_tweet_id',
    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld',
    'user.fields':'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'}

# The bearer token is a credential and must not live in the notebook: it is
# read from the TWITTER_BEARER_TOKEN environment variable instead.
# NOTE(review): the token that was previously committed here should be
# treated as leaked and revoked.
BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not BEARER_TOKEN:
    print('WARNING: TWITTER_BEARER_TOKEN is not set; requests will be sent without a valid bearer token.')

headers = {
    'Authorization': f'Bearer {BEARER_TOKEN}',
}

# Local proxy endpoints; only used when passed explicitly as `proxies`.
win_proxies = {'http': '127.0.0.1:1080',
 'https': '127.0.0.1:1080',
 'ftp': 'ftp://127.0.0.1:1080'}

In [None]:
# Pinned tweets of the users in the train set.
# Build the lookup tables from the default 'train' split with target 'user'.
pinned_tweet2author_train, pinned_author2tweet_train = get_lookup_dict(target='user')
# Split the pinned-tweet ids into request-sized chunks.
chunks_tweet = get_id_chunks(source=pinned_author2tweet_train)
# Download the tweets and save them as JSON files. Arguments are passed by
# keyword: the previous call put `params` in the positional `folder` slot and
# also passed `folder=`, which raises TypeError (multiple values for 'folder').
data_harvester(
    chunks_tweet,
    folder='train_user_pinned',
    params=params,
    headers=headers)

In [None]:
# Pinned tweets of the users in the dev set.
# Build the lookup tables for the 'dev' split with target 'user'.
pinned_tweet2author_dev, pinned_author2tweet_dev = get_lookup_dict(type='dev', target='user')
# Split the pinned-tweet ids into request-sized chunks.
chunks_tweet = get_id_chunks(source=pinned_author2tweet_dev)
# Download the tweets and save them as JSON files. Arguments are passed by
# keyword: the previous call put `params` in the positional `folder` slot and
# also passed `folder=`, which raises TypeError (multiple values for 'folder').
data_harvester(
    chunks_tweet,
    folder='dev_user_pinned',
    params=params,
    headers=headers)

In [None]:
# Download the user records behind the covid tweets.
# Both lookup tables are loaded from disk; only tweet2author feeds the
# chunking step below.
with open('./author2tweet_covid.json') as f:
    author2tweet = json.load(f)
with open('./tweet2author_covid.json') as f:
    tweet2author = json.load(f)

chunks_user = get_id_chunks(source=tweet2author)
data_harvester(
    chunks=chunks_user,
    folder='covid_user',
    params=params_usr,
    headers=headers,
    target='users',
)

In [None]:
# Pinned tweets of the users in the covid set:
# 1. build the lookup tables for type 'covid' / target 'user',
# 2. split the collected ids into chunks,
# 3. request each chunk and save the results as JSON files.
pinned_tweet2author_covid, pinned_author2tweet_covid = get_lookup_dict(
    type='covid',
    target='user',
)
chunks_user_covid = get_id_chunks(source=pinned_author2tweet_covid)
data_harvester(
    chunks=chunks_user_covid,
    folder='covid_user_pinned',
    params=params,
    headers=headers,
)