In [4]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import ujson as json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil

In [5]:
country           = 'Mexico'
cutoff            = 1000 # Number of Users Timelines Per File
path_to_users     = '../data/decahose/parsed/users/'
path_to_locations = '../data/decahose/parsed/locations/'
path_to_keys      = '../data/keys/'
path_to_timelines = '../data/timelines/'+country.lower().replace(' ','-')+'/'
os.makedirs(path_to_timelines, exist_ok=True)

print('Country:', country)
print('Save Data After Downloading',cutoff,'Timelines')

Country: Mexico
Save Data After Downloading 1000 Timelines


In [6]:
def get_env_var(varname,default):
    
    if os.environ.get(varname) != None:
        var = int(os.environ.get(varname))
        print(varname,':', var)
    else:
        var = default
        print(varname,':', var,'(Default)')
    return var

# Choose Number of Nodes To Distribute Credentials: e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
SLURM_JOB_ID            = get_env_var('SLURM_JOB_ID',0)
SLURM_ARRAY_TASK_ID     = get_env_var('SLURM_ARRAY_TASK_ID',0)
SLURM_ARRAY_TASK_COUNT  = get_env_var('SLURM_ARRAY_TASK_COUNT',1)
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE',mp.cpu_count())

SLURM_JOB_ID : 0 (Default)
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 4 (Default)


In [7]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):

    # Randomize set of key files using constant seed
    np.random.seed(0)
    all_key_files = np.random.permutation(glob(path_to_keys+'*.json'))
    
    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
    
    # Check that node has more CPU than key file 
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files)) 
    else:
        print('Check Environment Variables: # credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only Use', SLURM_JOB_CPUS_PER_NODE, 'Credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]
        
    return key_files

key_files = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE)
print('\n'.join(key_files))

Check Environment Variables: # credentials ( 86 ) > # CPU ( 4 )
Only Use 4 Credentials
../data/keys/spfraib_sentiments-sam3.json
../data/keys/spfraib_sentiments-david.json
../data/keys/vinnie_api_keys.json
../data/keys/gogps-carolina2.json


In [8]:
def get_users(country,SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT):
    
    users_by_account_location = pd.read_pickle(path_to_users+'users-by-account-location.pkl.xz')
    account_locations = pd.read_pickle(path_to_locations+'account-locations-identified.pkl')

    all_users = sorted(frozenset(itertools.chain.from_iterable(
    users_by_account_location.reindex(
    account_locations.loc[
    account_locations['country_long']==country,'LOCATION']).dropna().to_list())))
    
    del users_by_account_location
    del account_locations
    
    # Randomize All Users
    np.random.seed(0)
    all_users=np.random.permutation(all_users)
    
    print('# Users:', len(all_users))
    print('First User Id:', all_users[0])
    
    # Split users by node
    users=np.array_split(all_users,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID].copy()
    print('Node"s # Users:', len(users))
    print('Node"s First User Id:', users[0])
    
    return users
    
start = timer()
print('Import Users...')

users = get_users(country,SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users...
# Users: 2651
First User Id: 3366741245
Node"s # Users: 2651
Node"s First User Id: 3366741245
Computing Time: 0 sec


In [9]:
# Users Whose Timeline Were Successfully Downloaded
def get_succeed(path_to_timelines):
    
    if not os.path.exists(path_to_timelines+'succeed.txt'):
        return set()
    else:
        succeed = set()
        with open(path_to_timelines+'succeed.txt', 'r', encoding='utf-8') as file:
            for line in file:
                succeed.add(line.strip('\n').split('\t')[0])
        return set(succeed)

succeed = get_succeed(path_to_timelines)
print('# Downloaded Timelines:', len(succeed))

# Downloaded Timelines: 4


In [10]:
# Users Whose Timeline Were Successfully Downloaded
def get_failed(path_to_timelines):
    
    if not os.path.exists(path_to_timelines+'failed.txt'):
        return set()
    else:
        failed = set()
        with open(path_to_timelines+'failed.txt', 'r', encoding='utf-8') as file:
            for line in file:
                failed.add(line.strip('\n').split('\t')[0])
        return set(failed)

failed = get_failed(path_to_timelines)
print('# Missed Timelines:', len(failed))

# Missed Timelines: 0


In [11]:
def get_auth(key_file):
    
    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    try:
        api.verify_credentials()
#         print(key_file,": Authentication OK")
    except:
        print(key_file,": error during authentication")
        sys.exit('Exit')
    
    return api


for key_file in key_files:
    get_auth(key_file)
print('Credentials Checked!')

Credentials Checked!


In [8]:
def get_user_timeline(user_id,api):
    
    timeline = []
    
    try:
        
        # Collect All Status in Timeline
        statuses = tweepy.Cursor(
        api.user_timeline, 
        user_id=user_id, 
        count=3200,
        tweet_mode="extended", 
        include_rts=True).items()
        
        for status in statuses:
            timeline.append(status._json)
        del statuses
     
    except tweepy.error.TweepError as e:
        
        # Save User and Error Type
        with open(path_to_timelines+'failed.txt', 'a', encoding='utf-8') as file:
            file.write(user_id+'\t'+str(e)+'\n')
        
    return pd.DataFrame(timeline)

# timeline = get_user_timeline('12',get_auth(key_file))

In [11]:
def get_timelines_by_block(index_key):

    # Create Access For Block of Users
    api = get_auth(key_files[index_key])
    
    # Select Block of Users
    users_block = np.array_split(users,len(key_files))[index_key]
    
    # Initialize Output File ID
    output_id = str(uuid.uuid4())
    
    # Initialize Data
    timelines = pd.DataFrame()
    
    # Initialize Downloaded Users
    downloaded_ids = []
    
    for i,user_id in enumerate(users_block):
        
        # Skip Users With Downloaded Timeline
        if user_id in succeed or user_id in failed:
#             print('Skip:', user_id)
            continue
           
        # Try Downloading Timeline
        timeline  = get_user_timeline(user_id,api)
        
        if not timeline.shape[0]:
#             print('Missed:', user_id)
            continue
            
        # Append
        timelines = pd.concat([timelines, timeline],sort=False)
        downloaded_ids.append(user_id)
            
        # Save Every <cutoff> timelines including Last
        if len(downloaded_ids) == cutoff or user_id == users_block[-1]:
            
            print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output id', output_id)
            
            # Check Memory Usage
            pid = os.getpid()
            py = psutil.Process(pid)
            memoryUse = py.memory_info()[0]/2.**30  # memory use in GB...I think
            print('Process', index_key,'memory use:', round(memoryUse,3),'GB', 'after # users:', i)
            
            # Save to Json
            filename = \
            'timelines-'+\
            str(SLURM_JOB_ID)+'-'+\
            str(SLURM_ARRAY_TASK_ID)+'-'+\
            str(index_key)+'-'+\
            str(len(downloaded_ids))+'-'+\
            output_id+'.json.bz2'
            
            # bz2 splittable for spark
            # save as list of dict
            # discard index
            timelines.to_json(
            path_to_timelines+filename,
            orient='records',
            force_ascii=False,
            date_format=None,
            double_precision=15)
            
            # Read Like This
            # pd.read_json(path_to_timelines+filename+'.json.bz2',
            # orient='records',
            # dtype=False,
            # convert_dates=False)
            
            # Save User Id and File In Which Its Timeline Was Saved
            with open(path_to_timelines+'succeed.txt', 'a', encoding='utf-8') as file:
                for downloaded_id in downloaded_ids:
                    file.write(downloaded_id+'\t'+filename+'\n')
            
            # Reset Output File ID
            output_id = str(uuid.uuid4())
    
            # Reset Data and Downloaded Users
            del timelines, downloaded_ids
            timelines = pd.DataFrame()
            downloaded_ids = []
            
    return 0

In [12]:
start = timer()
print('Extract Timelines...\n')

with mp.Pool() as pool:
    
    pool.map(get_timelines_by_block, range(len(key_files)))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Extract Timelines...

Process 0 saving 1 timelines with output id 9ea64359-93c7-4c83-bafc-15a7cdba663c
Process 0 memory use: 0.175 GB after # users: 0
Process 2 saving 1 timelines with output id 5c201455-0b5c-4b24-a7d1-49dfdcd8fcf4
Process 2 memory use: 0.178 GB after # users: 0
Process 1 saving 1 timelines with output id c91b3e58-d004-4530-b820-1da17eb2b23f
Process 1 memory use: 0.202 GB after # users: 0
Process 3 saving 1 timelines with output id 1ef9828e-a3c0-4281-8b04-456aecbfbf0a
Process 3 memory use: 0.202 GB after # users: 0
Computing Time: 15 sec


In [9]:
round(1000/110*84)*60*24

1100160

In [63]:
# with open('text.json.bz2','w') as f:
#     jack.to_json(f,
#     orient='records',
#     force_ascii=False,
#     date_format=None,
#     double_precision=15)

In [64]:
# with open('text.json.bz2','r') as f:
#     jack2 = pd.read_json(f,
#     orient='records',
#     dtype=False,
#     convert_dates=False)