In [1]:
import os
import lzma
import ujson as json
import numpy as np
import pandas as pd
import multiprocessing as mp

In [2]:
######################## Params #############################

# Tag used in the output file name, plus the optional year / month filters
# (a falsy value such as None leaves that filter off).
version, year, month = 'v3', None, '01'

print('Year:', year)
print('Month:', month)

# Local alternative for both directories: './data/decahose/'
path_to_output_files = './data/decahose/users/'
path_to_input_files  = '/net/twitter/gardenhose-data/json/'

#############################################################

# Get Output File
def get_output_file(version,year=None,month=None):
    """Build the name of the CSV file the per-user results are written to.

    The name has the form ``user-id-<version>[-<year>][-<month>].csv``; a
    year or month that is falsy (``None``, ``''``, ``0``) is left out.

    Parameters
    ----------
    version : str
        Tag placed right after the ``user-id-`` prefix.
    year, month : optional
        Suffixes; each one is converted with ``str`` before being appended.

    Returns
    -------
    str
        The file name only (no directory part).
    """
    # Collect the non-empty pieces, then glue them together with dashes.
    pieces = ['user-id-'+version]
    pieces.extend(str(suffix) for suffix in (year, month) if suffix)
    return '-'.join(pieces)+'.csv'

# Select Input Files
def get_input_files(path_to_input_files,year=None,month=None):
    """List the ``.xz`` files of a directory, optionally filtered by year / month.

    A file name is split into tokens on both ``-`` and ``.``; the year filter
    is compared with the token at index 2 and the month filter with the token
    at index 3.

    Parameters
    ----------
    path_to_input_files : str
        Directory to scan (not recursive).
    year, month : optional
        Values to match. They are converted with ``str`` first, so ``2019``
        and ``'2019'`` behave the same; no zero-padding is applied, so the
        month must be spelled the way it appears in the file names.
        A falsy value disables the corresponding filter.

    Returns
    -------
    numpy.ndarray
        The matching file names (without directory) in random order.
    """

    def token_at(file, position):
        # Names with too few tokens cannot match a filter; report None
        # for them instead of raising IndexError.
        tokens = file.replace('.','-').split('-')
        return tokens[position] if position < len(tokens) else None

    input_files = [file for file in os.listdir(path_to_input_files) if file.endswith('.xz')]

    if year:
        # Tokens are strings, so compare against the string form of the filter.
        year = str(year)
        input_files = [file for file in input_files if token_at(file, 2)==year]

    if month:
        month = str(month)
        input_files = [file for file in input_files if token_at(file, 3)==month]

    # Randomize For Parallel Processing
    return np.random.permutation(input_files)

# Resolve the output file name and the list of files to parse from the
# parameters, then report both.
output_file = get_output_file(version,year,month)
input_files = get_input_files(path_to_input_files,year,month)

print('# Input Files:', len(input_files))
print('Output File:', output_file)

Year: None
Month: 12
# Input Files: 4
Output File: user-id-v3-12.csv


In [3]:
def parse_tweets(xz_file):
    """Extract user id and point coordinates from one xz-compressed tweet file.

    The file is read line by line from the directory held in the global
    ``path_to_input_files``; each line is treated as one JSON document.

    Parameters
    ----------
    xz_file : str
        File name relative to ``path_to_input_files``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``latitude``, ``longitude``: one row per tweet
        whose own top-level ``coordinates`` field is set. Lines that cannot
        be decoded or parsed are skipped.
    """

    tweets  = []

    # os.path.join works whether or not the directory ends with a separator.
    with lzma.open(os.path.join(path_to_input_files,xz_file),'rb') as f:

        for line in f:

            # Cheap byte-level pre-filter before paying for JSON parsing. The
            # marker can also sit in a nested object of the line, so the
            # top-level field is checked again after parsing.
            if b'"coordinates":{' not in line:
                continue

            # Decoding / JSON parsing can fail: UnicodeDecodeError is a
            # ValueError, and the JSON parser reports malformed input as
            # ValueError too. Anything else (e.g. KeyboardInterrupt) is
            # deliberately left to propagate.
            try:
                tweet = json.loads(line.decode("utf-8"))
            except ValueError:
                continue

            # Only keep the tweet's own coordinates, if any, together with
            # the id of the user of that tweet.
            coordinates = tweet.get('coordinates',None)
            if not coordinates:
                continue

            # The pair is stored as [longitude, latitude]; swap it into the
            # column order of the result.
            tweets.append([
                tweet['user']['id_str'],
                coordinates['coordinates'][1],
                coordinates['coordinates'][0],
            ])

    return pd.DataFrame(tweets, columns=['user_id','latitude','longitude'])

In [4]:
# %%time
print('Parse Tweets...')
# Parse the input files in parallel: mp.Pool() with no argument uses its
# default number of worker processes, and pool.map calls parse_tweets once per
# entry of input_files, returning the resulting DataFrames as a list in the
# same order as input_files.
with mp.Pool() as pool:
    tweets = pool.map(parse_tweets, input_files)

Parse Tweets...


In [5]:
# Merge the per-file frames into one table. The name `tweets` is rebound from
# a list of frames to a single DataFrame here, so skip the merge when the cell
# is run again; otherwise pd.concat would be handed a DataFrame and raise.
if not isinstance(tweets, pd.DataFrame):
    tweets = pd.concat(tweets)

# One row per user id: the mean latitude and mean longitude over all rows of
# that user.
users  = tweets.groupby('user_id').agg({'latitude':'mean','longitude':'mean',})

print('# Tweets:', tweets.shape[0])
print('# Users:', users.shape[0])
print()

# Tweets: 4552
# Users: 1072



In [6]:
print('Save User Ids...')
# to_csv does not create missing directories; make sure the target exists so
# the write cannot fail on that after all the parsing work has been done.
if path_to_output_files:
    os.makedirs(path_to_output_files, exist_ok=True)
# users is indexed by user_id, so the index is written as the first column.
users.to_csv(os.path.join(path_to_output_files, output_file))
print('Done!')
print()

Save User Ids...
Done!

