In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket
from functools import partial
import pyarrow.parquet as pq

# Params

In [2]:
# Number of successfully downloaded timelines buffered before each write to disk
cutoff = 1000
print(f'Save Data After Downloading {cutoff} Timelines')

Save Data After Downloading 500 Timelines


In [4]:
# Candidate countries, kept for reference. Every entry is commented out,
# so this list is empty.
country_codes = [
    # 'US',
    # 'ID',
    # 'BR',
    # 'TR',
    # 'MX',
    # 'AR',
    # 'PH',
    # 'CO',
    # 'MY',
    # 'VE',
    # 'TH',
    # 'PK',
]

# Country, previous batch, and current batch; the batch labels are used
# as sub-directory names when the data paths are built.
country_code = "US"
last_batch = '062020'
this_batch = '072020'

print(f'Country: {country_code}')
print(f'Last batch: {last_batch}')
print(f'This batch: {this_batch}')

Country: MX
Last batch: historical
This batch: 062020


In [3]:
def get_env_var(varname,default):
    """Read an integer setting from the environment, falling back to `default`.

    Parameters
    ----------
    varname : str
        Name of the environment variable to read.
    default : int
        Value returned (and reported as the default) when the variable is unset.

    Returns
    -------
    int
        The parsed value, or `default` when the variable is not set.

    Raises
    ------
    ValueError
        If the variable is set but does not start with an integer.
    """
    import re  # local import: only this function needs it

    raw = os.environ.get(varname)

    if raw is None:
        var = default
        print(varname,':', var,'(Default)')
        return var

    try:
        var = int(raw)
    except ValueError:
        # Slurm can report per-node CPU counts as "20(x2)" or "16,8";
        # keep the leading count instead of crashing on the suffix.
        leading_int = re.match(r'\s*(\d+)', raw)
        if leading_int is None:
            raise
        var = int(leading_int.group(1))

    print(varname,':', var)
    return var

# Slurm job-array settings used to distribute credentials and users across nodes,
# e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100).
# Each one falls back to the given default when the variable is not set.
SLURM_JOB_ID = get_env_var(varname='SLURM_JOB_ID', default=0)
SLURM_ARRAY_TASK_ID = get_env_var(varname='SLURM_ARRAY_TASK_ID', default=0)
SLURM_ARRAY_TASK_COUNT = get_env_var(varname='SLURM_ARRAY_TASK_COUNT', default=1)
SLURM_JOB_CPUS_PER_NODE = get_env_var(
    varname='SLURM_JOB_CPUS_PER_NODE',
    default=mp.cpu_count(),
)

SLURM_JOB_ID : 9977110
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 1


In [18]:
# Data root: a relative path when the hostname contains 'samuel',
# the scratch directory otherwise.
path_to_data = (
    '../../data'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data'
)

# Inputs: users listed under the previous batch, and the API key files
path_to_users = os.path.join(path_to_data, 'timelines', last_batch, 'most_recent_id')
path_to_keys = os.path.join(path_to_data, 'keys', 'twitter')

# Output: timelines for the current batch, one sub-directory per country
path_to_timelines = os.path.join(path_to_data, 'timelines', this_batch, 'API')
os.makedirs(os.path.join(path_to_timelines, country_code), exist_ok=True)

print(path_to_users, path_to_keys, path_to_timelines, sep='\n')

/scratch/spf248/twitter/data/timelines/most_recent_id
/scratch/spf248/twitter/data/keys/twitter
/scratch/spf248/twitter/data/timelines/updates


# Credentials

In [6]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):
    """Select the credential files assigned to this node.

    All `*.json` files under `path_to_keys` are shuffled with a fixed seed,
    split into `SLURM_ARRAY_TASK_COUNT` chunks, and the chunk at index
    `SLURM_ARRAY_TASK_ID` is returned. If that chunk holds more files than
    `SLURM_JOB_CPUS_PER_NODE`, only the first `SLURM_JOB_CPUS_PER_NODE` are kept.

    Returns
    -------
    numpy.ndarray
        Paths of the key files to use on this node.
    """
    # Sort before shuffling: glob returns files in arbitrary order, and a
    # fixed seed only gives every node the same permutation when the input
    # order is the same.
    all_key_files = sorted(glob(os.path.join(path_to_keys,'*.json')))

    # A private generator seeded with 0 draws the same permutation as seeding
    # the global generator, without resetting NumPy's global random state.
    all_key_files = np.random.RandomState(0).permutation(all_key_files)

    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]

    # Check that the node has at least as many CPUs as key files
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files))
    else:
        print('Check environment variables:')
        print('# Credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only keeping', SLURM_JOB_CPUS_PER_NODE, 'credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]

    return key_files

# Credentials assigned to this node, listed one path per line
key_files = get_key_files(
    SLURM_ARRAY_TASK_ID=SLURM_ARRAY_TASK_ID,
    SLURM_ARRAY_TASK_COUNT=SLURM_ARRAY_TASK_COUNT,
    SLURM_JOB_CPUS_PER_NODE=SLURM_JOB_CPUS_PER_NODE,
)
print(*key_files, sep='\n')

Check environment variables:
# Credentials ( 49 ) > # CPU ( 1 )
Only keeping 1 credentials
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json


In [7]:
def get_auth(key_file):
    """Build an authenticated Tweepy client from a JSON key file.

    Parameters
    ----------
    key_file : str
        Path to a JSON file with the keys 'consumer_key', 'consumer_secret',
        'access_token' and 'access_token_secret'.

    Returns
    -------
    tweepy.API
        Client configured to wait when the rate limit is reached.

    Exits the process (via `sys.exit`) when the credentials cannot be verified.
    """
    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # NOTE(review): Tweepy 3.x appears to return False for rejected (401)
    # credentials instead of raising, so a falsy result is a failure too.
    failure = None
    try:
        if not api.verify_credentials():
            failure = 'credentials rejected'
    except Exception as e:
        # Not a bare `except`: KeyboardInterrupt / SystemExit must propagate.
        failure = str(e)

    if failure is not None:
        print(key_file,": error during authentication")
        print('Reason:', failure)
        sys.exit('Exit')

    print(key_file,": Authentication checked")
    return api

# Verify each credential assigned to this node (get_auth exits on failure).
# After the loop, `api` is the client built from the last key file.
for key_file in key_files:
    api = get_auth(key_file=key_file)

print('Credentials Checked!')

/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json : Authentication checked
Credentials Checked!


# User List

In [8]:
start = timer()
print('Select Users...')

# Read every parquet file for this country into one frame, then shuffle
# the rows with a fixed seed.
users = (
    pq.ParquetDataset(glob(os.path.join(path_to_users, country_code, '*.parquet')))
    .read()
    .to_pandas()
    .sample(frac=1, random_state=0)
)

print(f'# Users : {len(users)}')

end = timer()
print(f'Computing Time: {round(end - start)} sec')

Select Users...
# Users : 2619633
Computing Time: 4 sec


In [9]:
start = timer()
print('Split Users Across Nodes...')

print(f'First user: {users.index[0]}')

# Keep only this node's share of the users.
# This rebinds `users`, so running the cell again splits the already-split frame.
users = np.array_split(users, SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]

print(f'# Users for this node: {len(users)}')
print(f'First user for this node: {users.index[0]}')

end = timer()
print(f'Computing Time: {round(end - start)} sec')

Split Users Across Nodes...
First user: 439466
# Users for this node: 2619633
First user for this node: 439466
Computing Time: 0 sec


In [19]:
start = timer()
print('Remove users whose timeline were successfully downloaded...')

def get_success(country_code):
    """Return the ids of users already recorded in the country's success file.

    The file is `<path_to_timelines>/<country_code>/success`; each line is
    tab-separated and its first field is collected.

    Returns
    -------
    set of str
        First field of each line, or an empty set when the file does not exist.
    """
    path_to_success = os.path.join(path_to_timelines, country_code, 'success')

    if not os.path.exists(path_to_success):
        return set()

    with open(path_to_success, 'r', encoding='utf-8') as file:
        return {line.strip('\n').split('\t')[0] for line in file}

success=get_success(country_code)
print('# Downloaded timelines:', len(success))

# `~` is the element-wise NOT for a boolean mask; NumPy rejects unary `-`
# on boolean arrays, so the mask should not rely on it.
users=users[~users.user_id.isin(success)].copy()
print('# Users :', len(users))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Remove users whose timeline were successfully downloaded...
# downloaded timelines: 0
Computing Time: 0 sec


Number of verified users in the US: 21,205,171

# Get Timelines

In [20]:
def get_timeline(user_id,tweet_id,api):
    """Download a user's statuses posted after a given tweet.

    Parameters
    ----------
    user_id : str
        Id of the user whose timeline is requested.
    tweet_id : str
        Passed as `since_id`, so only more recent statuses are requested.
    api : tweepy.API
        Authenticated client used for the requests.

    Returns
    -------
    (pandas.DataFrame, str or None)
        One row per status (its raw JSON payload), and the error message if a
        TweepError interrupted the download, else None. When an error is
        returned, the frame holds only the statuses collected before it.
    """
    timeline = []
    error = None

    # Collect All Statuses in Timeline
    try:
        cursor = tweepy.Cursor(
            api.user_timeline,
            user_id=user_id,
            since_id=tweet_id,
            # 200 is the documented per-request maximum for this endpoint;
            # the cursor keeps requesting pages until none are left.
            count=200,
            tweet_mode="extended",
            include_rts=True).items()

        for status in cursor:
            timeline.append(status._json)

    except tweepy.error.TweepError as e:
        error = str(e)

    return pd.DataFrame(timeline), error

# Smoke test against the live API; the result is a (DataFrame, error) pair
timeline = get_timeline(
    user_id='12',
    tweet_id='1266367509055209473',
    api=api,
)

In [35]:
def _save_timelines(index_key, timelines, downloaded_ids):
    """Write one batch of timelines to disk and record its users as done.

    Parameters
    ----------
    index_key : int
        Index of the worker / credential, embedded in the file name.
    timelines : list of pandas.DataFrame
        One frame per downloaded user.
    downloaded_ids : list of str
        User ids whose timelines are in this batch.

    Returns
    -------
    str
        Path of the file that was written.
    """
    filename = (
        'timelines-'
        + str(SLURM_JOB_ID) + '-'
        + str(SLURM_ARRAY_TASK_ID) + '-'
        + str(index_key) + '-'
        + str(len(downloaded_ids)) + '-'
        + str(uuid.uuid4()) + '.json.bz2'
    )
    output_path = os.path.join(path_to_timelines, country_code, filename)

    # Concatenate once per batch; users without new statuses contribute nothing
    frames = [timeline for timeline in timelines if not timeline.empty]
    if frames:
        batch = pd.concat(frames, sort=False, ignore_index=True)
    else:
        batch = pd.DataFrame()

    # Save as list of dict discarding index
    batch.to_json(
        output_path,
        orient='records',
        force_ascii=False,
        date_format=None,
        double_precision=15)

    # Save User Id and File In Which Its Timeline Was Saved
    with open(os.path.join(path_to_timelines, country_code, 'success'), 'a', encoding='utf-8') as file:
        for downloaded_id in downloaded_ids:
            file.write(downloaded_id+'\t'+filename+'\n')

    return output_path


def download_timelines(index_key):
    """Download and save the timelines of the users assigned to one credential.

    The node's `users` are split into `len(key_files)` blocks; this call
    handles block `index_key` with credential `key_files[index_key]`.
    Downloaded timelines are written every `cutoff` users and once more after
    the last user of the block. Users whose download errors are skipped and
    not recorded in the success file.

    Parameters
    ----------
    index_key : int
        Index into `key_files`, and of the matching block of users.

    Returns
    -------
    int
        Always 0.
    """
    # Create Access For Block of Users
    api = get_auth(key_files[index_key])

    # Select Block of Users
    users_block = np.array_split(users,len(key_files))[index_key][['user_id','tweet_id']].values.tolist()

    # Pending (not yet saved) timelines and the users they belong to
    timelines = []
    downloaded_ids = []
    counter_ids = 0

    for position, (user_id, tweet_id) in enumerate(users_block, start=1):

        # Try Downloading Timeline
        timeline, error = get_timeline(user_id,tweet_id,api)

        if error is None:
            timelines.append(timeline)
            downloaded_ids.append(user_id)

        # Save after <cutoff> timelines or when reaching last user. The check
        # also runs when this download failed, so a failure on the last user
        # cannot strand the pending batch.
        is_last_user = position == len(users_block)
        if downloaded_ids and (len(downloaded_ids) >= cutoff or is_last_user):

            output_path = _save_timelines(index_key, timelines, downloaded_ids)
            counter_ids += len(downloaded_ids)

            print('Process', index_key, 'downloaded', counter_ids, 'timelines with most recent output file:',
            output_path)

            # Reset Data and Downloaded Users
            timelines = []
            downloaded_ids = []

    return 0

In [36]:
print('Extract Timelines...\n')

# One worker per credential. The default pool size is the machine's CPU count,
# which is unrelated to the number of key files on this node.
with mp.Pool(processes=max(1, len(key_files))) as pool:
    pool.map(download_timelines, range(len(key_files)))

Extract Timelines...

/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json : Authentication checked
Process 0 saving 5 timelines with output file: /scratch/spf248/twitter/data/timelines/updates/MX/timelines-9977110-0-0-5-1bc17591-06c8-4ec1-af49-2bd9ee90f522.json.bz2
