In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket

# Params

In [2]:
# Country whose users are selected; its lower-cased, dash-separated form
# also names the timelines output folder.
country = 'United States'
print('Country:', country)

Country: United States


In [3]:
# Number of downloaded timelines buffered by a worker before they are written to disk.
cutoff = 1000
print('Save Data After Downloading',cutoff,'Timelines')

Save Data After Downloading 1000 Timelines


In [4]:
# Data root: relative path on hosts whose name contains 'samuel', scratch space everywhere else.
path_to_data = '../../data' if 'samuel' in socket.gethostname().lower() else '/scratch/spf248/twitter/data'

# Inputs
path_to_users = os.path.join(path_to_data,'decahose/parsed/users')
path_to_locations = os.path.join(path_to_data,'decahose/parsed/locations')
path_to_keys = os.path.join(path_to_data,'keys/twitter')

# Output folder, one per country (e.g. 'United States' -> 'united-states'); created if missing
path_to_timelines = os.path.join(path_to_data,'timelines',country.lower().replace(' ','-'))
os.makedirs(path_to_timelines, exist_ok=True)

print(path_to_users, path_to_locations, path_to_keys, path_to_timelines, sep='\n')

/scratch/spf248/twitter/data/decahose/parsed/users
/scratch/spf248/twitter/data/decahose/parsed/locations
/scratch/spf248/twitter/data/keys
/scratch/spf248/twitter/data/timelines/united-states


In [5]:
def get_env_var(varname,default):
    """Read an integer setting from the environment, falling back to a default.

    Parameters
    ----------
    varname : str
        Name of the environment variable to read.
    default : int
        Value returned (unchanged) when the variable is not set.

    Returns
    -------
    int
        The parsed value of the variable, or `default` when it is unset.

    Raises
    ------
    ValueError
        If the variable is set but does not start with an integer.
    """
    raw = os.environ.get(varname)

    if raw is None:
        var = default
        print(varname,':', var,'(Default)')
        return var

    try:
        var = int(raw)
    except ValueError:
        # Slurm can report per-node CPU counts in a compound form such as
        # "20(x2)" or "16,8"; keep the leading count instead of crashing.
        # Anything that still fails to parse raises ValueError as before.
        var = int(raw.split('(')[0].split(',')[0])

    print(varname,':', var)
    return var

# Slurm layout used to distribute credentials across nodes,
# e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100).
# Outside Slurm the defaults describe a single task using every local CPU.
SLURM_JOB_ID = get_env_var('SLURM_JOB_ID', 0)
SLURM_ARRAY_TASK_ID = get_env_var('SLURM_ARRAY_TASK_ID', 0)
SLURM_ARRAY_TASK_COUNT = get_env_var('SLURM_ARRAY_TASK_COUNT', 1)
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE', mp.cpu_count())

SLURM_JOB_ID : 6862147
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 1


# Credentials

In [6]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):
    """Select the credential files assigned to this node.

    All `*.json` files under `path_to_keys` are shuffled with a fixed seed,
    split into `SLURM_ARRAY_TASK_COUNT` nearly equal shares, and the share at
    position `SLURM_ARRAY_TASK_ID` is returned, truncated to at most
    `SLURM_JOB_CPUS_PER_NODE` files.

    Parameters
    ----------
    SLURM_ARRAY_TASK_ID : int
        Index of this node's share.
    SLURM_ARRAY_TASK_COUNT : int
        Number of shares the credential files are split into.
    SLURM_JOB_CPUS_PER_NODE : int
        Maximum number of credential files kept for this node.

    Returns
    -------
    numpy.ndarray
        Paths of the credential files for this node.
    """
    # glob() returns paths in arbitrary order. Sort first so that the fixed
    # seed produces the same permutation on every node; otherwise the
    # per-node shares are not guaranteed to be disjoint.
    np.random.seed(0)
    all_key_files = np.random.permutation(sorted(glob(os.path.join(path_to_keys,'*.json'))))
    
    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
    
    # Check that the node has at least as many CPUs as key files
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files)) 
    else:
        print('Check Environment Variables: # credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only Keeping', SLURM_JOB_CPUS_PER_NODE, 'Credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]
        
    return key_files

# Credential files handled by this node, printed one path per line
key_files = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE)
print(*key_files, sep='\n')

Check Environment Variables: # credentials ( 49 ) > # CPU ( 1 )
Only Keeping 1 Credentials
/scratch/spf248/twitter/data/keys/WorldBankGroup6-zohar.json


In [7]:
def get_auth(key_file):
    """Build a tweepy API client from a JSON credential file and verify it.

    Parameters
    ----------
    key_file : str
        Path to a JSON file with the keys 'consumer_key', 'consumer_secret',
        'access_token' and 'access_token_secret'.

    Returns
    -------
    tweepy.API
        Client configured to wait when the rate limit is reached.

    Raises
    ------
    SystemExit
        If the credentials cannot be verified.
    """
    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    failure_reason = None
    try:
        # NOTE(review): some tweepy 3.x releases appear to return False rather
        # than raise when the credentials are rejected - confirm against the
        # installed version. Both outcomes are treated as a failure here.
        if api.verify_credentials() is False:
            failure_reason = 'credentials rejected'
    except Exception as error:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not mistaken for authentication failures.
        failure_reason = error

    if failure_reason is not None:
        print(key_file,": error during authentication")
        print('Reason:', failure_reason)
        sys.exit('Exit')

    return api

# Verify every credential file (in random order); get_auth exits on the first invalid one
all_key_paths = glob(os.path.join(path_to_keys,'*.json'))
for key_file in np.random.permutation(all_key_paths):
    get_auth(key_file)
print('Credentials Checked!')

Credentials Checked!


# User List

In [8]:
print('Import Users By Account Locations')
start = timer()

# Read every JSON-lines file of user ids; files that fail to parse are reported and skipped
user_frames = []
for filename in sorted(glob(os.path.join(path_to_users,'user-ids-by-account-location-verified-json/*.json'))):
    try:
        user_frames.append(pd.read_json(filename,lines=True))
    except Exception as error:
        # Exception rather than a bare except: an interrupt must still stop
        # the import, and the reason for a skipped file is worth showing.
        print('error importing', filename, '-', error)

# One entry per location: the list of user ids (as strings) declared at that location
users_by_account_location=pd.concat(user_frames, axis=0, ignore_index=True)
users_by_account_location=users_by_account_location.set_index('user_location')['user_id']
# SECURITY NOTE(review): eval() executes whatever expression is stored in the
# 'user_id' column. This is acceptable only because the files are produced by
# our own pipeline; if the column always holds plain list literals,
# ast.literal_eval would be the safe replacement - TODO confirm.
users_by_account_location=users_by_account_location.apply(eval).apply(lambda x:[str(y) for y in x])
print('# Locations:', len(users_by_account_location))
print('# Users Total:', users_by_account_location.apply(len).sum())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users By Account Locations
# Locations: 39779


In [15]:
print('Import Locations')
# Table of identified account locations (pickled DataFrame)
locations_file = os.path.join(path_to_locations,'account-locations-identified.pkl')
account_locations = pd.read_pickle(locations_file)
print('# Locations:', account_locations.shape[0])

Import Locations
# Locations: 39779


In [16]:
def get_users(country,SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT):  
    """Return this node's share of the user ids located in `country`.

    The user lists of every location whose 'country_long' equals `country`
    are concatenated, shuffled with a fixed seed, split into
    `SLURM_ARRAY_TASK_COUNT` nearly equal shares, and the share at position
    `SLURM_ARRAY_TASK_ID` is returned.

    Reads the module-level `users_by_account_location` and `account_locations`.

    Parameters
    ----------
    country : str
        Value matched against the 'country_long' column.
    SLURM_ARRAY_TASK_ID : int
        Index of this node's share.
    SLURM_ARRAY_TASK_COUNT : int
        Number of shares the users are split into.

    Returns
    -------
    numpy.ndarray
        Copy of this node's user ids (possibly empty).
    """
    # Locations belonging to the requested country
    country_locations = account_locations.loc[
        account_locations['country_long']==country,'LOCATION']

    # reindex() inserts NaN for locations that have no user list; drop them,
    # otherwise flattening fails with "TypeError: 'float' object is not iterable".
    user_lists = users_by_account_location.reindex(country_locations).dropna()
    all_users = list(itertools.chain.from_iterable(user_lists))
    
    # Randomize all users; the fixed seed gives the same order on every node
    np.random.seed(0)
    all_users=np.random.permutation(all_users)
    
    print('# Country Users:', len(all_users))
    if len(all_users):
        print('First User Id:', all_users[0])
    
    # Split users by node
    users=np.array_split(all_users,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID].copy()
    print('Node"s # Users:', len(users))
    if len(users):
        # Guarded: an empty share would otherwise raise IndexError
        print('Node"s First User Id:', users[0])
    
    return users
    
start = timer()
print('Import Users...')

# Users handled by this node
users = get_users(country,SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT)

# The lookup tables are no longer needed once the users are selected.
# Note: this cell cannot be re-run without re-running the two import cells first.
del users_by_account_location, account_locations

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users...
# Users: 21205171
First User Id: 2730239748
Node"s # Users: 21205171
Node"s First User Id: 2730239748
Computing Time: 9 sec


# Attempted Downloads

In [10]:
# Users Whose Timelines Were Successfully Downloaded
def get_success(path_to_timelines):
    """Return the ids recorded in the 'success' log of `path_to_timelines`.

    Each line of the log is tab-separated and the id is its first field.
    An empty set is returned when the log does not exist.
    """
    log_path = os.path.join(path_to_timelines, 'success')

    if not os.path.exists(log_path):
        return set()

    with open(log_path, 'r', encoding='utf-8') as log:
        return {record.strip('\n').split('\t')[0] for record in log}

# Ids already saved by previous runs; workers skip these users
success = get_success(path_to_timelines)
print('# Downloaded Timelines:', len(success))

# Downloaded Timelines: 0


In [11]:
# Users Whose Timelines Were Not Successfully Downloaded
def get_failure(path_to_timelines):
    """Return the ids recorded in the 'failure' log of `path_to_timelines`.

    Each line of the log is tab-separated and the id is its first field.
    An empty set is returned when the log does not exist.
    """
    log_path = os.path.join(path_to_timelines,'failure')

    if not os.path.exists(log_path):
        return set()

    with open(log_path, 'r', encoding='utf-8') as log:
        return {record.strip('\n').split('\t')[0] for record in log}

# Ids whose download raised an error in previous runs (reported only)
failure = get_failure(path_to_timelines)
print('# Missed Timelines:', len(failure))

# Missed Timelines: 0


# Get Timelines

In [12]:
def get_timeline(user_id,api):
    """Download the statuses of one user's timeline.

    Parameters
    ----------
    user_id : str
        Id of the user whose timeline is requested.
    api : tweepy.API
        Authenticated client used for the requests.

    Returns
    -------
    pandas.DataFrame
        One row per collected status, built from each status' raw JSON.
        Holds whatever was collected before an error occurred, so it is
        empty when the very first request fails.

    Side effects
    ------------
    On a tweepy error, appends "<user_id>\\t<error>" to the 'failure' log in
    `path_to_timelines`.
    """
    timeline = []
    
    try:
        
        # Collect all statuses in the timeline.
        # NOTE(review): Twitter documents a maximum of 200 statuses per
        # request for this endpoint, so count=3200 is presumably capped
        # server-side and the cursor pages through the rest - confirm.
        cursor = tweepy.Cursor(
            api.user_timeline,
            user_id=user_id,
            count=3200,
            tweet_mode="extended",
            include_rts=True).items()
        
        for status in cursor:
            timeline.append(status._json)
     
    except tweepy.error.TweepError as e:
        
        # Keep the log at one record per line: collapse any whitespace
        # (newlines, tabs) inside the error text, and coerce the id to str so
        # a non-string id cannot raise TypeError while reporting a failure.
        error_text = ' '.join(str(e).split())
        with open(os.path.join(path_to_timelines,'failure'), 'a', encoding='utf-8') as file:
            file.write(str(user_id)+'\t'+error_text+'\n')
        
    return pd.DataFrame(timeline)

# timeline = get_timeline('12',get_auth(key_file))

In [13]:
def _save_timelines_batch(index_key, frames, downloaded_ids, n_users_seen):
    """Write one batch of timelines to a compressed JSON file and log its users as successes."""
    # Each output file gets its own unique id
    output_id = str(uuid.uuid4())
    print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output id', output_id)

    # Check memory usage: first field of memory_info(), converted from bytes to GB
    memoryUse = psutil.Process(os.getpid()).memory_info()[0]/2.**30
    print('Process', index_key,'memory use:', round(memoryUse,3),'GB', 'after', n_users_seen,'users')
    print('Process', index_key,'memory % used:', psutil.virtual_memory()[2])
    print('Process', index_key,'CPU%:', psutil.cpu_percent())

    filename = \
    'timelines-'+\
    str(SLURM_JOB_ID)+'-'+\
    str(SLURM_ARRAY_TASK_ID)+'-'+\
    str(index_key)+'-'+\
    str(len(downloaded_ids))+'-'+\
    output_id+'.json.bz2'

    # Concatenate once per batch instead of once per user
    timelines = pd.concat(frames, sort=False)

    # bz2 splittable for spark
    # save as list of dict
    # discard index
    timelines.to_json(
        os.path.join(path_to_timelines,filename),
        orient='records',
        force_ascii=False,
        date_format=None,
        double_precision=15)

    # Save user id and the file in which its timeline was saved
    with open(os.path.join(path_to_timelines,'success'), 'a', encoding='utf-8') as file:
        for downloaded_id in downloaded_ids:
            file.write(str(downloaded_id)+'\t'+filename+'\n')


def get_timelines_by_block(index_key):
    """Download and save the timelines of the users assigned to one credential.

    The module-level `users` array is split into `len(key_files)` blocks;
    the block at position `index_key` is processed with the credential
    `key_files[index_key]`. Users already listed in `success` are skipped, as
    are users whose download returns no status. Timelines are written to disk
    in batches of `cutoff` users, and any remaining partial batch is written
    once the block is exhausted.

    Parameters
    ----------
    index_key : int
        Position of the credential file (and of the matching user block).

    Returns
    -------
    int
        Always 0.
    """
    # Create access for this block of users
    api = get_auth(key_files[index_key])
    
    # Select block of users
    users_block = np.array_split(users,len(key_files))[index_key]
    
    # Timelines and user ids buffered until the next save
    frames = []
    downloaded_ids = []
    
    for i,user_id in enumerate(users_block):
        
        # Skip users with an already downloaded timeline
        if user_id in success:
            continue
           
        # Try downloading the timeline
        timeline = get_timeline(user_id,api)
        
        if not timeline.shape[0]:
            continue
            
        frames.append(timeline)
        downloaded_ids.append(user_id)
            
        # Save every <cutoff> timelines
        if len(downloaded_ids) >= cutoff:
            _save_timelines_batch(index_key, frames, downloaded_ids, i+1)
            frames = []
            downloaded_ids = []

    # Save the last, partial batch. This must happen after the loop: if the
    # last user of the block is skipped or fails, a check made inside the
    # loop is never reached and the buffered timelines would be lost.
    if downloaded_ids:
        _save_timelines_batch(index_key, frames, downloaded_ids, len(users_block))
            
    return 0

In [14]:
start = timer()
print('Extract Timelines...\n')

# One worker per credential file. The default Pool() size is the machine's
# core count, which ignores both the number of tasks and the CPUs actually
# allocated to the job. Pool needs at least one process, hence max(1, ...).
with mp.Pool(processes=max(1, len(key_files))) as pool:
    
    pool.map(get_timelines_by_block, range(len(key_files)))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Extract Timelines...

Process 0 saving 10 timelines with output id bbfa03cb-8da3-43d5-a3dd-5f467f00c7bc
Process 0 memory use: 6.139 GB after # users: 19
Process 0 memory % used: 12.6
Process 0 CPU%: 55.3
Computing Time: 44 sec


In [None]:
# Read Like This
# pd.read_json(os.path.join(path_to_timelines,filename),
# orient='records',
# dtype=False,
# convert_dates=False)           

In [15]:
# with open('text.json.bz2','w') as f:
#     jack.to_json(f,
#     orient='records',
#     force_ascii=False,
#     date_format=None,
#     double_precision=15)

In [16]:
# with open('text.json.bz2','r') as f:
#     jack2 = pd.read_json(f,
#     orient='records',
#     dtype=False,
#     convert_dates=False)