In [1]:
import re
import os
import sys
import socket
from glob import glob
import numpy as np
import lzma
import ujson as json
import pandas as pd
from timeit import default_timer as timer

In [2]:
# Worker hosts in partition order: a host's position in this list is its
# 0-based partition index. Commented-out hosts take no partition.
active_hosts = [
    # 'achtung02',
    'achtung03',
    'achtung04',
    'achtung05',
    'achtung06',
    'achtung07',
    'achtung08',
    # 'achtung09',
    'achtung10',
    'achtung11',
    'achtung12',
    'achtung13',
    'achtung14',
    'achtung15',
    'achtung16',
    # 'achtung17',
]
hostname2partition = {host: index for index, host in enumerate(active_hosts)}
print('# Partitions:', len(hostname2partition))

# Number of input files that are merged into each output file.
block_size = 25
print('# Input File Parsed per Output File:', block_size)

hostname = socket.gethostname()
print('Hostname:', hostname)

# A host that is not listed above falls back to partition 0.
if hostname in hostname2partition:
    partition = hostname2partition[hostname]
else:
    partition = 0
print('Index Partition:', partition)

# Partitions: 13
# Input File Parsed per Output File: 25
Hostname: FAC38c9860d5a89
Index Partition: 0


In [3]:
# Create Array of Input Files Until 2018
def get_input_files(partition, year_min=2011, year_max=2018,
                    path_to_input_files=None, n_partitions=None):
    """Return the shuffled input files assigned to one partition.

    Files named ``tweets.json.*.xz`` are listed, kept when the year in their
    file name lies in ``[year_min, year_max]``, shuffled with a fixed seed and
    split into ``n_partitions`` nearly equal chunks.

    Parameters
    ----------
    partition : int
        Index of the chunk to return.
    year_min, year_max : int
        Inclusive range of years to keep.
    path_to_input_files : str, optional
        Directory holding the input files. When omitted it is chosen from the
        current working directory (cluster or local test checkout); any other
        working directory terminates the process via ``sys.exit``.
    n_partitions : int, optional
        Number of chunks. Defaults to ``len(hostname2partition)``.

    Returns
    -------
    numpy.ndarray
        File paths of the requested partition (possibly empty).
    """
    if path_to_input_files is None:
        working_dir = os.getcwd()

        # Tweets Stored on Cluster
        if working_dir == '/home/sfraiberger/py':
            path_to_input_files = '/net/twitter/gardenhose-data/json/'

        # For testing
        elif working_dir == '/Users/samuel.fraiberger/Dropbox/Work/Projects/twitter/ipynb':
            path_to_input_files = '../data/decahose/json/'

        else:
            sys.exit('Incorrect working directory...exiting.')

    if n_partitions is None:
        n_partitions = len(hostname2partition)

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    date_pattern = re.compile(r'(\d+)-\d+-\d+')

    def file_year(path):
        # Year taken from the file name only; None when the name has no date.
        found = date_pattern.search(os.path.basename(path))
        return int(found.group(1)) if found else None

    # glob() returns matches in arbitrary order. Sorting first makes the seeded
    # shuffle below produce the same order everywhere, so that every host
    # derives the same partitioning.
    candidates = sorted(glob(os.path.join(path_to_input_files, 'tweets.json.*.xz')))

    input_files = []
    for candidate in candidates:
        year = file_year(candidate)
        if year is not None and year_min <= year <= year_max:
            input_files.append(candidate)

    # Fixed seed: the shuffle must be reproducible.
    np.random.seed(0)
    input_files = np.random.permutation(input_files)

    print('# Input Files:', len(input_files))
    if len(input_files):
        print('First File:', input_files[0])
        print('Last File:', input_files[-1])
    print()

    partitioned_files = np.array_split(input_files, n_partitions)
    input_files = partitioned_files[partition]

    print('Partition List of Input Files:')
    print('# Files:', len(input_files))
    if len(input_files):
        print('First File:', input_files[0])
        print('Last File:', input_files[-1])

    return input_files

# Resolve this host's share of the input files; `main` iterates over this
# module-level `input_files`.
input_files = get_input_files(partition)

# Input Files: 1
First File: ../data/decahose/json/tweets.json.2016-12-23.xz
Last File: ../data/decahose/json/tweets.json.2016-12-23.xz

Partition List of Input Files:
# Files: 1
First File: ../data/decahose/json/tweets.json.2016-12-23.xz
Last File: ../data/decahose/json/tweets.json.2016-12-23.xz


In [4]:
# Create Path to Output File
def get_output_root(partition):
    """Return the path prefix shared by all output files of ``partition``.

    The output directory is created when it does not exist yet. The returned
    string is the directory followed by the file-name prefix and the
    partition index; callers append their own suffix to it.
    """
    output_dir = '../data/decahose/parsed/users/'
    file_prefix = 'users-from-decahose-partition-'

    os.makedirs(output_dir, exist_ok=True)

    return ''.join((output_dir, file_prefix, str(partition)))

# Display the output path prefix for this partition (calling
# get_output_root also creates the output directory if it is missing).
print('Output Root:')
get_output_root(partition)

Output Root:


'../data/decahose/parsed/users/users-from-decahose-partition-0'

In [5]:
# There Could Be Some Duplicated Rows
def get_users_and_locations(input_file):
    """Extract (user id, user location) pairs from one compressed tweet file.

    Parameters
    ----------
    input_file : str
        Path to an xz-compressed file that is read line by line, each line
        being parsed as one JSON document.

    Returns
    -------
    pandas.DataFrame
        Columns ``'USER ID'`` and ``'USER LOCATION'``, one row per retained
        line. Rows are not de-duplicated. A value is ``None`` when the
        corresponding field is absent from the tweet's ``user`` object.

    Notes
    -----
    Lines that cannot be decoded or parsed, and lines whose JSON document or
    ``user`` field is not an object, are skipped silently.
    """
    tweets = []

    with lzma.open(input_file, 'rb') as f:

        for line in f:

            # Only Select Tweets With Account Location (Could Be in the Replies).
            # Cheap byte-level pre-filter so most lines are never JSON-parsed;
            # the match can come from a nested object, so the top-level user's
            # location may still be missing.
            if b'"location":"' not in line:
                continue

            # Json Parsing Can Fail. Catch Exception rather than using a bare
            # except, so KeyboardInterrupt / SystemExit still stop a long run.
            try:
                tweet = json.loads(line.decode("utf-8"))
            except Exception:
                continue

            if not isinstance(tweet, dict):
                continue

            user = tweet.get('user', {})
            if not isinstance(user, dict):
                continue

            tweets.append([user.get('id_str', None), user.get('location', None)])

    return pd.DataFrame(tweets, columns=['USER ID', 'USER LOCATION'])

In [6]:
def _save_users_block(users, output_file):
    """Write one block to ``output_file`` as bz2-compressed CSV, atomically.

    The data is written to ``output_file + '.tmp'`` first and renamed only
    after the write succeeded, so an interrupted run never leaves a partial
    file under the final name (which the resume check in ``main`` would
    mistake for a finished block).
    """
    tmp_file = output_file + '.tmp'
    try:
        # Compression is given explicitly because the temporary name does not
        # end in '.bz2' and could not be inferred from the extension.
        try:
            users.to_csv(tmp_file, sep=',', lineterminator='\n', compression='bz2')
        except TypeError:
            # Older pandas only knows the previous keyword name.
            users.to_csv(tmp_file, sep=',', line_terminator='\n', compression='bz2')
        os.replace(tmp_file, output_file)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def main():
    """Parse this partition's input files and save users in blocks.

    Iterates over the module-level ``input_files``. Every ``block_size``
    consecutive files (and whatever remains at the end) are written to one
    ``<output root>-block-<k>.csv.bz2`` file. Files whose block output already
    exists are skipped, which allows an interrupted run to resume.

    Returns
    -------
    int
        Always 0.
    """
    # Per-file frames of the block being built; concatenated once on save
    # instead of re-copying a growing frame after every file.
    frames = []
    n_rows = 0

    for i, input_file in enumerate(input_files):

        start = timer()
        print()
        print('File', i)
        print(input_file)

        # Create an output file every <block_size> files
        output_file = get_output_root(partition)+'-block-'+str(i//block_size)+'.csv.bz2'

        if os.path.exists(output_file):
            print('Output file', output_file, 'already exists.')
            continue

        frame = get_users_and_locations(input_file)
        frames.append(frame)
        n_rows += frame.shape[0]
        print('# Users Location:', n_rows)

        # If Next Index is Multiple of <block_size> or Reading Last File
        if not (i+1)%block_size or i==len(input_files)-1:

            print('Save Output File:', output_file)
            _save_users_block(pd.concat(frames), output_file)

            frames = []
            n_rows = 0

        end = timer()
        print('Computing Time:', round(end - start), 'Sec')

    return 0

In [7]:
# Time the whole run. `main()` is only invoked when this code executes as
# `__main__`; the timing and the summary print below run either way.
start = timer()

if __name__ == "__main__":
    main()
    
end = timer()
print()
print('Total Computing Time:', round(end - start), 'Sec')


File 0
../data/decahose/json/tweets.json.2016-12-23.xz
# Users Location: 268413
Save Output File: ../data/decahose/parsed/users/users-from-decahose-partition-0-block-0.csv.bz2
Computing Time: 32 Sec

Total Computing Time: 32 Sec


In [8]:
# Preview the first rows of one saved block of this partition. glob() returns
# matches in arbitrary order, so sort them to always preview the same file.
# Every column is read as a string (dtype=object) and empty fields are kept
# as empty strings rather than NaN (na_filter=False).
pd.read_csv(
    sorted(glob(get_output_root(partition)+'*'+'.csv.bz2'))[0],
    index_col=0,
    sep=',',
    dtype=object,
    na_filter=False,
    lineterminator='\n',
).head()

Unnamed: 0,USER ID,USER LOCATION
0,23192615,REDMATRIX
1,145086089,España
2,1385668339,Puerto Rico [ U.S.A. ]
3,319184943,Africa W. The Lions
4,726907747493617665,"Miami, FL"
