In [36]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket
from functools import partial

# Params

In [48]:
# Number of successfully downloaded timelines a worker buffers before it
# writes them to one output file (compared against in download_timelines).
cutoff = 500
print('Save Data After Downloading',cutoff,'Timelines')

Save Data After Downloading 100 Timelines


In [4]:
def get_env_var(varname,default):
    """Read an integer setting from the environment, with a fallback.

    Parameters
    ----------
    varname : str
        Name of the environment variable to read.
    default : int
        Value returned (unchanged) when the variable is not set.

    Returns
    -------
    int
        The parsed value, or ``default`` when the variable is unset.

    Raises
    ------
    ValueError
        If the variable is set but does not start with an integer.
    """
    # Read the environment once so the test and the conversion see the same value
    raw = os.environ.get(varname)

    if raw is None:
        var = default
        print(varname,':', var,'(Default)')
        return var

    try:
        var = int(raw)
    except ValueError:
        # Slurm can report composite values such as '20(x2)' or '16,8'
        # (e.g. for SLURM_JOB_CPUS_PER_NODE); keep the leading count, which
        # describes the first node of the allocation.
        leading_digits = ''.join(itertools.takewhile(str.isdecimal, raw.strip()))
        if not leading_digits:
            raise ValueError('%s must be an integer, got %r' % (varname, raw))
        var = int(leading_digits)

    print(varname,':', var)
    return var

# Choose Number of Nodes To Distribute Credentials: e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
# Each value is taken from the environment when set; otherwise the default
# passed to get_env_var is used (single task, all local CPUs).
# Job id: only used to tag output file names.
SLURM_JOB_ID            = get_env_var('SLURM_JOB_ID',0)
# Index of this task: selects this node's share of credentials and users.
SLURM_ARRAY_TASK_ID     = get_env_var('SLURM_ARRAY_TASK_ID',0)
# Number of tasks the credentials and users are split across.
SLURM_ARRAY_TASK_COUNT  = get_env_var('SLURM_ARRAY_TASK_COUNT',1)
# Upper bound on the number of credentials kept for this node.
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE',mp.cpu_count())

SLURM_JOB_ID : 7987695
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 20


In [25]:
# Countries whose users' timelines are downloaded, in processing order.
# Candidate codes considered so far: US, ID, BR, TR, MX, AR, PH, CO, MY, VE, TH, PK
# (the former empty placeholder list was overwritten immediately, so a single
# assignment is kept).
country_codes=[
'VE',
'BR',
'US',
]

In [6]:
# Data root: a relative path on hosts whose name contains 'samuel',
# the cluster scratch directory everywhere else.
path_to_data = (
    '../../data'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data'
)

# Input and output locations derived from the data root
path_to_users = os.path.join(path_to_data,'users')
path_to_locations = os.path.join(path_to_data,'locations','profiles')
path_to_keys = os.path.join(path_to_data,'keys','twitter')
path_to_timelines = os.path.join(path_to_data,'timelines','historical','API')

# Only the output directory is created; the input directories must already exist
os.makedirs(path_to_timelines, exist_ok=True)

# One path per line
print(path_to_users, path_to_locations, path_to_keys, path_to_timelines, sep='\n')

/scratch/spf248/twitter/data/users
/scratch/spf248/twitter/data/locations/profiles
/scratch/spf248/twitter/data/keys/twitter
/scratch/spf248/twitter/data/timelines/API


# Credentials

In [7]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):
    """Select the credential files assigned to this node.

    All ``*.json`` files under ``path_to_keys`` are shuffled with a fixed seed,
    split into ``SLURM_ARRAY_TASK_COUNT`` parts, and the part at index
    ``SLURM_ARRAY_TASK_ID`` is returned, truncated to at most
    ``SLURM_JOB_CPUS_PER_NODE`` entries.

    Parameters
    ----------
    SLURM_ARRAY_TASK_ID : int
        Index of the part to return.
    SLURM_ARRAY_TASK_COUNT : int
        Number of parts the file list is split into.
    SLURM_JOB_CPUS_PER_NODE : int
        Maximum number of credential files to keep.

    Returns
    -------
    numpy.ndarray
        Paths of the credential files for this node.
    """
    # glob() gives no ordering guarantee, so sort first: the seeded shuffle
    # below only yields the same split on every node if it starts from the
    # same ordering.
    sorted_key_files = sorted(glob(os.path.join(path_to_keys,'*.json')))

    # Randomize set of key files using constant seed
    # (note: this reseeds numpy's global random state)
    np.random.seed(0)
    all_key_files = np.random.permutation(sorted_key_files)

    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]

    # Check that node has at least as many CPUs as key files
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files))
    else:
        print('Check environment variables:')
        print('# Credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only keeping', SLURM_JOB_CPUS_PER_NODE, 'credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]

    return key_files

# Resolve this node's credential files and list them one per line
key_files = get_key_files(
    SLURM_ARRAY_TASK_ID,
    SLURM_ARRAY_TASK_COUNT,
    SLURM_JOB_CPUS_PER_NODE,
)
print(*key_files, sep='\n')

Check environment variables:
# Credentials ( 49 ) > # CPU ( 20 )
Only keeping 20 credentials
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-zohar.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-trevor.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-henry.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-ananth.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-sam3.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-sarah.json
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-sam6.json
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-trevor.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-dunstan.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-fab.json
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-martin.json
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-sarah.json
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-carolin

In [8]:
def get_auth(key_file):
    """Build an authenticated tweepy API client from a JSON credential file.

    Parameters
    ----------
    key_file : str
        Path to a JSON file providing the keys 'consumer_key',
        'consumer_secret', 'access_token' and 'access_token_secret'.

    Returns
    -------
    tweepy.API
        Client configured to wait when a rate limit is reached.

    Raises
    ------
    SystemExit
        If the credential check fails.
    """
    # Import Key
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    try:
        api.verify_credentials()
    except Exception as e:
        # Catch Exception rather than everything, so that KeyboardInterrupt and
        # SystemExit still propagate, and report why the check failed.
        print(key_file,": error during authentication", repr(e))
        sys.exit('Exit')

    print(key_file,": Authentication checked")
    return api

# for key_file in np.random.permutation(glob(os.path.join(path_to_keys,'*.json'))):
#     get_auth(key_file)
# print('Credentials Checked!')

# User List

In [9]:
print('Import Users By Account Locations')
start = timer()

# Read every JSON-lines file of user ids; a file that fails to load is
# reported and skipped so the remaining files are still imported.
l = []
for filename in sorted(glob(os.path.join(path_to_users,'user-ids-by-account-location-verified/*.json'))):
    try:
        df = pd.read_json(filename,lines=True)
        l.append(df)
    except Exception as e:
        # Exception (not a bare except) keeps Ctrl-C working; show the cause
        print('error importing', filename, repr(e))

# Series indexed by 'user_location' whose values come from the 'user_id' column
users_by_account_location=pd.concat(l, axis=0, ignore_index=True)
users_by_account_location=users_by_account_location.set_index('user_location')['user_id']

# NOTE(review): eval() executes whatever expression is stored in the files.
# Presumably each value is the text of a Python list of ids; if so,
# ast.literal_eval would be the safe equivalent. Confirm the file format
# before switching, and only run this on trusted files.
users_by_account_location=users_by_account_location.apply(eval).apply(lambda x:[str(y) for y in x])
print('# Locations:', len(users_by_account_location))
print('# Users Total:', users_by_account_location.apply(len).sum())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users By Account Locations
# Locations: 39779
# Users Total: 92088032
Computing Time: 170 sec


In [10]:
print('Import Locations')
# Table of account locations; later cells read its 'user_location' and
# 'country_short' columns.
# NOTE(review): unpickling can run arbitrary code, so only load this file
# from a trusted source.
account_locations=pd.read_pickle(os.path.join(path_to_locations,'account-locations.pkl')) 
print('# Locations:', len(account_locations))

Import Locations
# Locations: 39779


In [11]:
start = timer()
print('Select Users...')

# Sorted list of users in selected countries:
# 1. keep the locations whose country is in country_codes,
# 2. attach each location's country,
# 3. expand the per-location id lists to one row per user,
# 4. return a Series of country codes indexed (and sorted) by user id.
users=(
    pd.merge(
        users_by_account_location.reindex(
            account_locations.loc[
                account_locations['country_short'].isin(country_codes),'user_location']
        ).dropna().reset_index(),
        account_locations[['user_location','country_short']])
    # 'columns=' replaces the positional axis argument drop('user_location',1),
    # which recent pandas versions no longer accept
    .drop(columns='user_location')
    .rename(columns={'country_short':'country_code'})
    .explode('user_id')
    .set_index('user_id')['country_code']
    .sort_index()
)

# Randomize users (fixed seed, so the order is reproducible)
users=users.sample(frac=1,random_state=0)

# Free the large inputs; this cell cannot be re-run without re-importing them
del users_by_account_location
del account_locations

print('# Users :', len(users)) 
print(users.reset_index().groupby('country_code').count())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Select Users...
# Users : 27591849
               user_id
country_code          
BR             4863404
US            21205171
VE             1523274
Computing Time: 130 sec


In [14]:
start = timer()
print('Split Users Across Nodes...')

# First user of the full shuffled list, printed before the split for reference
print('First user:', users.index[0])
# Keep only the chunk at index SLURM_ARRAY_TASK_ID out of SLURM_ARRAY_TASK_COUNT.
# NOTE: this rebinds `users`, so running the cell a second time splits the
# already-split chunk again.
users=np.array_split(users,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
print('# Users for this node:', len(users)) 
print('First user for this node:', users.index[0])

end = timer()
print('Computing Time:', round(end - start), 'sec')

Split Users Across Nodes...
# Users : 27591849
Computing Time: 2 sec


Number of verified users in the US: 21,205,171

In [15]:
start = timer()
print('Remove users whose timeline were successfully downloaded...')

def get_success(country_code):
    """Return the ids recorded in a country's 'success' log.

    The log is read from ``<path_to_timelines>/<country_code>/success``; the
    text before the first tab of each line is taken as the id. A missing log
    yields an empty set.
    """
    success_path = os.path.join(path_to_timelines, country_code, 'success')

    # Nothing downloaded yet for this country
    if not os.path.exists(success_path):
        return set()

    with open(success_path, 'r', encoding='utf-8') as file:
        return {line.strip('\n').split('\t')[0] for line in file}

# Collect the ids already downloaded for every selected country.
# update() grows the set in place instead of copying the accumulated set
# on every iteration.
success=set()
for country_code in country_codes:
    tmp=get_success(country_code)
    print(country_code, ':', len(tmp))
    success.update(tmp)
print('# downloaded timelines:', len(success))

# Drop the downloaded users; ids absent from this node's chunk are ignored.
# Rebinding (rather than inplace=True) avoids mutating the object returned
# by the earlier split.
users=users.drop(success,errors='ignore')
print('# remaining users for this node:', len(users))

# Group users by country: one list of ids per country code, ordered like
# country_codes (a country with no remaining user gets a missing value)
users_by_country=(
    users.reset_index()
    .groupby('country_code')['user_id']
    .apply(list)
    .reindex(country_codes)
)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Remove users whose timeline were successfully downloaded...
BR : 3577650
US : 6257600
VE : 719150
# downloaded timelines: 10554400
# remaining users for this node: 17421031
Computing Time: 53 sec


# Get Timelines

In [46]:
def get_timeline(user_id,api):
    """Download the statuses of one user's timeline.

    Parameters
    ----------
    user_id : str
        Id of the user whose timeline is requested.
    api : tweepy.API
        Authenticated client providing ``user_timeline``.

    Returns
    -------
    (pandas.DataFrame, str or None)
        One row per status (built from each status' raw JSON) and the error
        message, or ``None`` when no tweepy error occurred. On error the
        frame holds whatever was collected before the failure.
    """
    timeline = []
    error = None

    # Collect All Statuses in Timeline
    try:
        cursor = tweepy.Cursor(
        api.user_timeline,
        user_id=user_id,
        # 'count' is the page size of a single request and is capped at 200
        # by the API; the cursor keeps paging until the timeline is exhausted.
        count=200,
        tweet_mode="extended",
        include_rts=True).items()

        for status in cursor:
            timeline.append(status._json)

    except tweepy.error.TweepError as e:
        error = str(e)

    return pd.DataFrame(timeline), error

# timeline = get_user_timeline('12',get_auth(key_file))

In [47]:
def _save_timelines(frames, downloaded_ids, index_key, country_code):
    """Write one batch of timelines and log its users as downloaded.

    The batch is stored as a bz2-compressed JSON list of records in the
    country's directory, then one '<user id>TAB<file name>' line per user is
    appended to that directory's 'success' file.
    """
    output_dir = os.path.join(path_to_timelines, country_code)

    # A fresh UUID per batch keeps file names unique across workers and batches
    filename = '-'.join([
        'timelines',
        str(SLURM_JOB_ID),
        str(SLURM_ARRAY_TASK_ID),
        str(index_key),
        str(len(downloaded_ids)),
        str(uuid.uuid4())]) + '.json.bz2'

    print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output file:',
    os.path.join(output_dir, filename))

    # Save as list of dict discarding index
    pd.concat(frames, sort=False).to_json(
    os.path.join(output_dir, filename),
    orient='records',
    force_ascii=False,
    date_format=None,
    double_precision=15)

    # Save User Id and File In Which Its Timeline Was Saved
    # (written only after the data file, so a logged id always has its data)
    with open(os.path.join(output_dir, 'success'), 'a', encoding='utf-8') as file:
        for downloaded_id in downloaded_ids:
            file.write(str(downloaded_id)+'\t'+filename+'\n')


def download_timelines(index_key,country_code):
    """Download and store the timelines of one credential's share of users.

    The users of ``country_code`` are split into ``len(key_files)`` blocks;
    the block at ``index_key`` is downloaded with the credential at the same
    index. Timelines are written in batches of ``cutoff`` users, and whatever
    remains is written once the block is exhausted. Users whose download
    returns an error are skipped and not logged as downloaded.

    Parameters
    ----------
    index_key : int
        Index into ``key_files`` and into the user blocks.
    country_code : str
        Key into ``users_by_country``; also the output sub-directory.

    Returns
    -------
    int
        Always 0.
    """
    # Create Access For Block of Users
    api = get_auth(key_files[index_key])

    # A country without remaining users holds a missing value, not a list
    country_users = users_by_country[country_code]
    if not isinstance(country_users, (list, tuple, np.ndarray)):
        return 0

    # Select Block of Users
    users_block = np.array_split(country_users,len(key_files))[index_key]

    # Make sure the country directory exists before anything is written to it
    os.makedirs(os.path.join(path_to_timelines, country_code), exist_ok=True)

    # Timelines and user ids of the batch currently being accumulated;
    # frames are concatenated once per batch instead of once per user
    frames = []
    downloaded_ids = []

    for user_id in users_block:

        # Try Downloading Timeline
        timeline, error = get_timeline(user_id,api)

        if error is not None:
            continue

        frames.append(timeline)
        downloaded_ids.append(user_id)

        # Save after <cutoff> timelines
        if len(downloaded_ids) == cutoff:
            _save_timelines(frames, downloaded_ids, index_key, country_code)
            frames = []
            downloaded_ids = []

    # Save the last, partial batch. Done after the loop so it is not lost
    # when the final user of the block fails to download.
    if downloaded_ids:
        _save_timelines(frames, downloaded_ids, index_key, country_code)

    return 0

In [49]:
print('Extract Timelines...\n')
# The pool is created without an explicit size, so multiprocessing picks its
# default number of worker processes.
# NOTE(review): the workers read key_files and users_by_country as globals;
# presumably this relies on fork-style process start — confirm on other platforms.
with mp.Pool() as pool:
    for country_code in country_codes:
        print(country_code)
        # One task per credential file; map() returns only when every task of
        # this country is done, so countries are processed one after another.
        pool.map(partial(download_timelines, country_code=country_code), range(len(key_files)))

Extract Timelines...

/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-sarah.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-sana.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-sarah.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-trevor.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-dev.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-mukund.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-henry.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-david.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-trevor.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-dunstan.json : Authentication checked
/scratch/spf248/twitt

Rate limit reached. Sleeping for: 23


/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-dharana.json : Authentication checked


Rate limit reached. Sleeping for: 18
Rate limit reached. Sleeping for: 28


/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-fab.json : Authentication checked


Rate limit reached. Sleeping for: 16
Rate limit reached. Sleeping for: 36
Rate limit reached. Sleeping for: 16
Rate limit reached. Sleeping for: 26


/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-ananth.json : Authentication checked


Rate limit reached. Sleeping for: 13
Rate limit reached. Sleeping for: 43


/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-martin.json : Authentication checked


Rate limit reached. Sleeping for: 233


Process 5 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-5-9-46a4c29b-27db-4938-94d9-6bbab69c0941.json.bz2
Process 3 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-3-9-09f19a79-7d5a-4903-a65f-fd13ba3f349d.json.bz2
Process 4 saving 8 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-4-8-8a987af5-c525-47c2-8690-8b2e0af31990.json.bz2
Process 1 saving 8 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-1-8-19a4f368-2602-45f6-ac72-794236ce44b5.json.bz2
Process 13 saving 10 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-13-10-6c6abdd3-6918-4020-8c2f-26cb40d0bddd.json.bz2


Rate limit reached. Sleeping for: 202


Process 15 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-15-9-6eb49597-1c63-4f4e-816d-b8057980a738.json.bz2
Process 16 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-16-9-886f19d6-15ef-4ec8-ad4a-6bd7e276f8b9.json.bz2
Process 0 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-0-9-2f323a95-a85a-4f43-b5d1-c9e74feb0b9f.json.bz2
Process 19 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-19-9-4405e9ec-0b5f-42e2-b3fa-94ec993743ea.json.bz2
Process 2 saving 8 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-2-8-0921b3f1-870c-4a18-bbfc-37a77e7cbd40.json.bz2
Process 7 saving 10 timelines with output file: /scratch/spf248/twitter/data/timelines/API/VE/timelines-7987695-0-7-10-44a929c2-75d6-4c76-8beb-46face305a68.json.bz2
Process 11 sav

/scratch/spf248/twitter/data/keys/twitter/spfraib_sentiments-trevor.json : Authentication checked
/scratch/spf248/twitter/data/keys/twitter/WorldBankGroup6-carolina2.json : Authentication checked
Process 14 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/US/timelines-7987695-0-14-9-db89f877-5576-4584-b933-ed31a12937a9.json.bz2
Process 12 saving 8 timelines with output file: /scratch/spf248/twitter/data/timelines/API/US/timelines-7987695-0-12-8-59c7324a-105d-44c0-81dc-87f104f2cdc9.json.bz2
Process 8 saving 8 timelines with output file: /scratch/spf248/twitter/data/timelines/API/US/timelines-7987695-0-8-8-e08fef95-e79d-4aa9-8d1c-63fbc7882fc7.json.bz2
Process 9 saving 10 timelines with output file: /scratch/spf248/twitter/data/timelines/API/US/timelines-7987695-0-9-10-f0e345ef-6804-4412-875d-82e5399b785a.json.bz2
Process 10 saving 9 timelines with output file: /scratch/spf248/twitter/data/timelines/API/US/timelines-7987695-0-10-9-59e41204-9c22-44db-b087-02e