In [2]:
import logging
import multiprocessing as mp
import os
import re
import sys
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import tweepy
import ujson as json

In [3]:
# Run configuration. `country` is compared against the 'country' column of the
# users CSV and, lower-cased, names the output folder and the pickle files.
country           = 'Brazil'
version_timeline  = 'v1'   # tag embedded in every timeline pickle file name
version_users     = 'v3'   # tag of the parsed users CSV to read
path_to_users     = './data/decahose/parsed/users/users-'+version_users+'.csv'
path_to_key_files = './data/api-keys/'
path_to_timelines = './data/timelines/'+country.lower()+'/'
start_key         = 0      # index of the first key file to use

# makedirs creates missing parent folders too, and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
os.makedirs(path_to_timelines, exist_ok=True)

# Column order of the timeline DataFrames; must match the row layout built in
# get_timeline.
columns=[
    'TIME',
    'ID',
    'TEXT',
    'GEO',
    'LANG',
    'USER ID',
    'USER LOCATION',
    'USER STATUSES',
]

# Use at most as many keys as there are CPUs (one worker process per key).
# os.listdir returns entries in arbitrary order, so sort before slicing to make
# the start_key window deterministic; only real '*.json' files are key files.
key_files = sorted(
    x for x in os.listdir(path_to_key_files) if x.endswith('.json')
)[start_key:start_key+mp.cpu_count()]
print('# Access Token:', len(key_files),'\n')
print('\n'.join(key_files))

# Access Token: 4 

gogps-anton2.json
gogps-carolina1.json
gogps-carolina2.json
gogps-dev.json


In [3]:
def list_collected_users(path_to_timelines, country):
    """Return the user ids that already have a timeline pickle on disk.

    A file counts when its name contains the lower-cased country and ends with
    '.pkl'. The user id is the first run of digits in the file name; files
    without any digits are skipped instead of raising IndexError.

    The function deliberately has a different name from the `collected_users`
    list it produces: rebinding the function name to its own result made the
    cell fail on re-run ('list' object is not callable).
    """
    country_tag = country.lower()
    user_ids = []
    for filename in os.listdir(path_to_timelines):
        if country_tag not in filename or not filename.endswith('.pkl'):
            continue
        first_digits = re.search(r'\d+', filename)
        if first_digits:
            user_ids.append(first_digits.group())
    return user_ids

collected_users = list_collected_users(path_to_timelines, country)
print('# Collected Users:', len(collected_users))

# Collected Users: 0


In [4]:
# %%time
# Progress banner for the user-import step.
print('Import Users...\n')
def select_users(country, path=None, random_state=None):
    """Return the shuffled user ids of one country as a list of strings.

    Parameters
    ----------
    country : str
        Value matched exactly against the 'country' column of the users CSV.
    path : str, optional
        CSV to read; defaults to the notebook-level `path_to_users`. The first
        CSV column is used as the index.
    random_state : int or numpy RandomState, optional
        Seed forwarded to the shuffle. The default (None) keeps the original
        unseeded behaviour; pass an int for a reproducible order.

    Returns
    -------
    list of str
        The 'user_id' values of the matching rows, in shuffled order.
    """
    if path is None:
        path = path_to_users

    all_users = pd.read_csv(path, index_col=0)
    in_country = all_users['country'] == country
    # sample(frac=1) returns every matching row exactly once, in random order.
    shuffled_ids = all_users.loc[in_country, 'user_id'].sample(frac=1, random_state=random_state)
    return shuffled_ids.values.astype(str).tolist()

# Shuffled user ids (strings) for the configured country.
users = select_users(country)

print('Country:', country)
print('# Users:', len(users))

# Drop users whose timeline is already on disk. The lookup set is built once so
# each membership test is O(1) instead of scanning the whole list per user.
already_collected = set(collected_users)
users = [x for x in users if x not in already_collected]
print('# Remaining Users:', len(users))

Import Users...

Country: Indonesia
# Users: 4
# Remaining Users: 4


In [5]:
def get_keys(path):
    """Read the JSON file at `path` and return its parsed content."""
    with open(path) as key_file:
        credentials = json.load(key_file)
    return credentials
    
# key = get_keys(path_to_key_files+key_files[0])

In [6]:
def get_auth(key):
    """Build a tweepy API client from one credential mapping.

    `key` must provide 'consumer_key', 'consumer_secret', 'access_token' and
    'access_token_secret'. The client is created with wait_on_rate_limit=True,
    which per tweepy's documentation makes calls wait for the rate limit to
    replenish rather than fail.
    """
    # OAuth 1a handshake material: application pair first, then the user pair.
    oauth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
    oauth.set_access_token(key['access_token'], key['access_token_secret'])

    return tweepy.API(oauth, wait_on_rate_limit=True)

# api = get_auth(key)

In [7]:
def get_timeline(api,user):
    """Download one user's timeline and return it as a DataFrame.

    Parameters
    ----------
    api : tweepy.API
        Authenticated client (see get_auth).
    user : str
        Twitter user id, passed to the API as `user_id`.

    Returns
    -------
    pandas.DataFrame
        One row per status, laid out according to the notebook-level `columns`
        list. Retweets are excluded and the untruncated text is requested.

    Notes
    -----
    This is best-effort: when tweepy raises TweepError, the rows gathered so
    far (possibly none) are still returned. The error is logged at DEBUG level
    so it stays silent by default but can be surfaced by configuring logging.
    """
    timeline = []

    try:

        # 200 is the documented per-request maximum for statuses/user_timeline;
        # the Cursor keeps requesting further pages until the API returns none.
        statuses = tweepy.Cursor(
            api.user_timeline,
            user_id=user,
            count=200,
            tweet_mode="extended",
            include_rts=False).items()

        for status in statuses:

            # Raw JSON payload of the status
            data = status._json

            # Keep only the fields named in `columns`, in the same order
            timeline.append([
                data['created_at'],
                data['id_str'],
                data['full_text'],
                data['coordinates'],
                data['lang'],
                data['user']['id_str'],
                data['user']['location'],
                data['user']['statuses_count'],
                # data['user']['description'],
                # data['user']['profile_image_url'],
            ])

    except tweepy.error.TweepError as e:
        logging.getLogger(__name__).debug(
            'Timeline download for user %s stopped early: %s', user, e)

    return pd.DataFrame(timeline, columns=columns)

# timeline = get_timeline(api,users[0])

In [8]:
def get_timeline_by_block(index_block):
    """Download and pickle the timelines of one block of users.

    Block `index_block` uses the key file at the same position in `key_files`
    and covers `users[n_blocks*index_block : n_blocks*(index_block+1)]`, where
    the notebook-level `n_blocks` is the number of users per block. Each
    non-empty timeline is written to
    `<path_to_timelines><user>-<country>-<version_timeline>.pkl`; users whose
    timeline comes back empty produce no file. Always returns 0.
    """
    # Every block authenticates with its own credentials
    credentials = get_keys(path_to_key_files+key_files[index_block])
    client = get_auth(credentials)

    first = n_blocks*index_block
    last = n_blocks*(index_block+1)

    for user in users[first:last]:

        frame = get_timeline(client,user)

        # Nothing to store for users without any returned status
        if frame.shape[0] == 0:
            continue

        target = ''.join([path_to_timelines, user, '-', country.lower(), '-', version_timeline, '.pkl'])
        frame.to_pickle(target)

    return 0

In [9]:
# Number of Process is Min(Number of CPU Available, Number of Keys)
processes = min(len(key_files),mp.cpu_count())
if processes < 1:
    # Without this guard the division below fails with an unhelpful ZeroDivisionError.
    raise RuntimeError('No API key files found in '+path_to_key_files)
print('# Processes:',processes)

# Users per block = ceil(len(users) / processes). The previous
# quotient + remainder formula over-sized the blocks (7 users on 4 processes
# gave blocks of 4, leaving two workers with nothing to do); the ceiling still
# covers every user while spreading them over all workers.
n_blocks = -(-len(users)//processes)
print('# Users by Block:', n_blocks)

# Processes: 4
# Users by Block: 1


In [10]:
# %%time
start = timer()
print('Extract Timelines...\n')

# One worker per block index; each worker loads the key file at its own index
# and processes the matching slice of `users`.
block_indices = range(processes)
with mp.Pool(processes=processes) as pool:
    results = pool.map(get_timeline_by_block, block_indices)

end = timer()
elapsed = end - start
print('Computing Time:', round(elapsed), 'sec')

Extract Timelines...

Computing Time: 35 sec


In [11]:
# Completion marker printed by the last cell of the notebook.
print('Done!')

Done!
