In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket

# Params

In [2]:
n_requests=1000
print('Save each',n_requests,'requests')

Save each 1000 requests


In [3]:
if 'samuel' in socket.gethostname().lower():
    path_to_data = '../../data'
else:
    path_to_data = '/scratch/spf248/twitter/data'

path_to_users = os.path.join(path_to_data,'users')
path_to_keys = os.path.join(path_to_data,'keys','twitter')
print(path_to_users)
print(path_to_keys)

../../data/decahose/parsed/users
../../data/keys
../../data/users


In [4]:
def get_env_var(varname,default):
    
    if os.environ.get(varname) != None:
        var = int(os.environ.get(varname))
        print(varname,':', var)
    else:
        var = default
        print(varname,':', var,'(Default)')
    return var

# Choose Number of Nodes To Distribute Credentials: e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
SLURM_JOB_ID            = get_env_var('SLURM_JOB_ID',0)
SLURM_ARRAY_TASK_ID     = get_env_var('SLURM_ARRAY_TASK_ID',0)
SLURM_ARRAY_TASK_COUNT  = get_env_var('SLURM_ARRAY_TASK_COUNT',1)
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE',mp.cpu_count())

SLURM_JOB_ID : 0 (Default)
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 4 (Default)


# Credentials

In [5]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):

    # Randomize set of key files using constant seed
    np.random.seed(0)
    all_key_files = np.random.permutation(glob(os.path.join(path_to_keys,'*.json')))
    
    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
    
    # Check that node has more CPU than key file 
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files)) 
    else:
        print('Check Environment Variables: # credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only Keeping', SLURM_JOB_CPUS_PER_NODE, 'Credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]
        
    return key_files

key_files = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE)
print('\n'.join(key_files))

Check Environment Variables: # credentials ( 49 ) > # CPU ( 4 )
Only Keeping 4 Credentials
../../data/keys/WorldBankGroup6-dunstan.json
../../data/keys/spfraib_sentiments-sarah.json
../../data/keys/WorldBankGroup6-antoine.json
../../data/keys/spfraib_sentiments-sam6.json


In [6]:
def get_auth(key_file):
    
    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    try:
        api.verify_credentials()
#         print(key_file,": Authentication OK")
    except:
        print(key_file,": error during authentication")
        sys.exit('Exit')
    
    return api

for key_file in np.random.permutation(glob(os.path.join(path_to_keys,'*.json'))):
    get_auth(key_file)
print('Credentials Checked!')

Credentials Checked!


# User List

In [7]:
def get_users(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT):
    
    # Import Users
    users_by_account_location = pd.read_json(
    glob(os.path.join(path_to_users,'user-ids-by-account-location','*json'))[0],lines=True)
    
    all_users = list(itertools.chain.from_iterable(users_by_account_location['user_id']))

    # Randomize All Users
    np.random.seed(0)
    all_users=np.random.permutation(all_users)
    
    print('# Users:', len(all_users))
    print('First User Id:', all_users[0])
    
    # Split users by node
    users=np.array_split(all_users,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID].copy()
    print('Node"s # Users:', len(users))
    print('Node"s First User Id:', users[0])
    
    return users
    
start = timer()
print('Import Users...')

users = get_users(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users...
# Users: 1937
First User Id: 322020836
Node"s # Users: 1937
Node"s First User Id: 322020836
Computing Time: 249 sec


In [22]:
print('Import Existing Users Profile')
l = []
for filename in sorted(glob(os.path.join(path_to_users,'users-profile/*.json'))):
    try:
        df = pd.read_json(filename,lines=True,dtype=False)
        l.append(df)
    except:
        print('error importing', filename)
existing_users = pd.concat(l, axis=0, ignore_index=True)['id_str'].drop_duplicates().tolist()
print('# Existing Users:', len(existing_users))

Import Existing Users Profile


In [29]:
users=list(set(users).difference(existing_users))
print('# Remaining Users:', len(users))

# Randomize Changing Order Each Run
np.random.seed()
users=np.random.permutation(users)

# Remaining Users: 1936


# Lookups

In [24]:
# Create a function called "chunks" with two arguments, l and n:
def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        yield l[i:i+n]

In [25]:
# Lookup users by chunks of size 100
def lookup_users(user_ids, api):
    if len(user_ids)>100:
        print('Reduce # Users')
        return []
    try:
        lookups=api.lookup_users(user_ids=list(user_ids),include_entities='true',tweet_mode='extended')
        return [lookup._json for lookup in lookups]
    except tweepy.error.TweepError as e:
        print('Lookup error', e)
        return []
    
# lookups=lookup_users(users[:100], get_auth(key_file))

In [26]:
def lookup_users_by_block(index_key):
    
    # Create Access For Block of Users
    api=get_auth(key_files[index_key])
    
    # Select Block of Users
    users_block=np.array_split(users,len(key_files))[index_key]
    
    # Split further by chunks of size 100 to accomodate Twitter lookup limit
    users_chunks=list(chunks(users_block,100))
    
    lookups=[]
    
    # Initialize Output File ID
    output_id = str(uuid.uuid4())
    
    for i,users_chunk in enumerate(users_chunks):
        
        # Loop users by chunk of size 100
        lookups.extend(lookup_users(users_chunk,api))
        
        # Save Lookups Each 100 Chunks
        if (i and not i%n_requests) or i==len(users_chunks)-1:
            
            print('Process', index_key, 'saving lookups with output id', output_id)
        
            # Save to Json
            filename = \
            'users-'+\
            str(SLURM_JOB_ID)+'-'+\
            str(SLURM_ARRAY_TASK_ID)+'-'+\
            str(index_key)+'-'+\
            output_id+'.json'
            with open(os.path.join(path_to_users,filename),'w') as f:
                json.dump(lookups,f)

            # Reset
            output_id = str(uuid.uuid4())
            lookups=[]

In [27]:
start = timer()
print('Lookup Users...\n')

with mp.Pool() as pool:
    
    pool.map(lookup_users_by_block, range(len(key_files)))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Lookup Users...

Process 3 saving lookups with output id ae576c5e-9437-4b67-8cd4-b58854191396
Process 0 saving lookups with output id fc44a8da-9b13-4d34-b9be-17849ba8f4db
Process 2 saving lookups with output id e18b820f-8043-4b34-aa9d-fdb5f6b46d25
Process 1 saving lookups with output id 5ab92913-952d-47e6-afb4-c5892a140ab2
Process 3 saving lookups with output id 9a30f644-ce27-4cf1-a590-1be722a96a61
Process 0 saving lookups with output id 36f36e4a-8953-48cb-817d-2409923c148c
Process 1 saving lookups with output id 155f7878-ad17-447e-892b-e14084a94144
Process 3 saving lookups with output id 8fc2bbf8-0488-4768-aa81-16db5b7faddd
Process 2 saving lookups with output id e2970c27-5847-43d2-94ce-6c9b06abad9e
Process 0 saving lookups with output id 5edac3db-073c-4009-b78c-2f4a12ef7961
Process 1 saving lookups with output id 119c3e48-fec4-454b-a6f0-f349111340d8
Process 2 saving lookups with output id 237c888a-5c69-4343-ba51-f3ee5acb6bd0
Computing Time: 16 sec
