In [1]:
import os
import sys
import lzma
import ujson as json
import numpy as np
import pandas as pd
import multiprocessing as mp
import re
from timeit import default_timer as timer

In [2]:
######################## Params #############################

# Version tags embedded in the output directory / file names.
version_tweets     = 'v6'
version_locations  = 'v5'

# Directory holding the raw '.xz' dumps.
# Alternative input location: './data/decahose/json/'
path_to_input_files  = '/net/twitter/gardenhose-data/json/'
path_to_output_files = './data/decahose/parsed/tweets/tweets-{}/'.format(version_tweets)
path_to_locations    = './data/decahose/parsed/locations/'

# Kept as zero-padded strings because they are compared with file-name fields.
# For a quick run restrict them, e.g. years = ['2017'] and months = ['01'].
years  = [str(year).zfill(2) for year in range(2012,2019)]
months = [str(month).zfill(2) for month in range(1,13)]

#############################################################

In [3]:
# Select Input Files
def get_input_files(path_to_input_files,year=None,month=None):
    """Return the '.xz' files of a directory, optionally filtered by year/month.

    Each file name is split on '.' and '-'; the third field is compared with
    `year` and the fourth with `month`. A name that has too few fields simply
    fails the corresponding filter instead of raising IndexError.

    Parameters
    ----------
    path_to_input_files : str
        Directory to list.
    year, month : str, optional
        Values compared (as strings) with the file-name fields, e.g. '2017'
        and '01'. A falsy value disables the corresponding filter.

    Returns
    -------
    numpy.ndarray
        Matching file names (without the directory part) in random order.
    """

    def name_field(file_name, position):
        # Field at `position` of the split name, or None when the name is too short.
        fields = file_name.replace('.','-').split('-')
        return fields[position] if position < len(fields) else None

    input_files = [file_name for file_name in os.listdir(path_to_input_files)
                   if file_name.endswith('.xz')]

    if year:
        input_files = [file_name for file_name in input_files
                       if name_field(file_name, 2)==year]

    if month:
        input_files = [file_name for file_name in input_files
                       if name_field(file_name, 3)==month]

    # Randomize For Parallel Processing
    return np.random.permutation(input_files)

In [4]:
# Get Output File
def get_output_file(version_tweets,year=None,month=None):
    """Build the output pickle file name and ensure the output directory exists.

    The name is 'tweets-from-decahose', followed by '-year-<year>' and
    '-month-<month>' when those arguments are truthy, followed by
    '-<version_tweets>.pkl'.

    Side effect: creates the directory named by the module-level
    `path_to_output_files`, including any missing parent directories.

    Parameters
    ----------
    version_tweets : str
        Version tag appended to the file name.
    year, month : optional
        Converted with str() and embedded in the name when truthy.

    Returns
    -------
    str
        The file name only (no directory part).
    """

    name_parts = ['tweets-from-decahose']

    if year:
        name_parts.append('year-'+str(year))

    if month:
        name_parts.append('month-'+str(month))

    name_parts.append(version_tweets+'.pkl')

    # makedirs also creates missing parents (os.mkdir would raise for a nested
    # path) and exist_ok avoids the check-then-create race.
    os.makedirs(path_to_output_files, exist_ok=True)

    return '-'.join(name_parts)

In [5]:
# Get Sorted List of Selected Locations
def get_locations(version_locations):
    """Load the pickled locations table and return its sorted unique locations.

    Reads 'locations-<version_locations>.pkl' from the module-level
    `path_to_locations` directory, takes the unique values of the second
    index level (level 1) and returns them sorted with numpy.
    """

    pickle_path = path_to_locations+'locations-'+version_locations+'.pkl'
    table_index = pd.read_pickle(pickle_path).index
    unique_locations = table_index.get_level_values(1).unique()

    return np.sort(unique_locations)

# Module-level array of selected locations; parse_tweets reads it to filter tweets.
locations = get_locations(version_locations)

In [12]:
def parse_tweets(xz_file):
    """Extract the tweets of one '.xz' file whose user location is selected.

    Opens `path_to_input_files + xz_file` with lzma and treats every line as
    one JSON document. A tweet is kept only when its `user.location` is
    non-empty and present in the module-level `locations`. Lines that cannot
    be decoded, or whose layout raises while fields are extracted, are skipped.

    Parameters
    ----------
    xz_file : str
        File name relative to the module-level `path_to_input_files`.

    Returns
    -------
    pandas.DataFrame
        One row per kept tweet, indexed by 'ID', with columns 'TIME', 'TEXT',
        'LANG', 'LAT', 'LON', 'PLACE', 'USER ID' and 'USER LOCATION'.
    """

    columns = [
    'TIME',
    'ID',
    'TEXT',
    'LANG',
    'LAT',
    'LON',
    'PLACE',
    'USER ID',
    'USER LOCATION',
#     'USER UTC OFFSET',
#     'USER TIME ZONE',
#     'USER DESCRIPTION',
#     'USER IMAGE URL',
    ]

    # Build a hash-based lookup once per file: testing membership directly on
    # the locations array would scan the whole array for every line.
    if isinstance(locations, (set, frozenset)):
        selected_locations = locations
    else:
        selected_locations = set(locations)

    tweets = []

    with lzma.open(path_to_input_files+xz_file,'rb') as f:

        for line in f:

            # Cheap byte-level pre-filter before paying for JSON decoding.
            if b',"location":' not in line:
                continue

            # Json Parsing Can Fail
            try:

                # Encoding Seems to Be Automatically Detected.
                tweet = json.loads(line)

                location = tweet.get('user', {}).get('location',None)

                if not location or location not in selected_locations:
                    continue

                # Prefer the untruncated text when the tweet carries one.
                text = tweet.get('text',None)
                extended_text = tweet.get('extended_tweet', {}).get('full_text', None)
                if extended_text:
                    text = extended_text

                # The coordinate pair is read as [longitude, latitude].
                lat = None
                lon = None
                coordinates = tweet.get('coordinates', None)
                if coordinates:
                    lat = coordinates['coordinates'][1]
                    lon = coordinates['coordinates'][0]

                tweets.append([
                pd.to_datetime(tweet.get('created_at',None)),
                tweet.get('id',None),
                text,
                tweet.get('lang',None),
                lat,
                lon,
                tweet.get('place', None),
                tweet.get('user', {}).get('id',None),
                location,
#                 tweet.get('user', {}).get('utc_offset',None),
#                 tweet.get('user', {}).get('time_zone',None),
#                 tweet.get('user', {}).get('description',None),
#                 tweet.get('user', {}).get('profile_image_url',None),
                ])

            # Deliberately best-effort, but `Exception` (unlike a bare except)
            # lets KeyboardInterrupt / SystemExit stop a worker process.
            except Exception:
                continue

    return pd.DataFrame(tweets, columns=columns).set_index('ID')

In [13]:
def main():
    """Parse every (year, month) combination and pickle one DataFrame per month.

    For each pair taken from the module-level `years` and `months`:
    list the matching input files, skip the month when there are none or when
    its output pickle already exists, otherwise parse the files in a
    multiprocessing pool, concatenate the results and save them under
    `path_to_output_files`. Progress and timing are printed.

    Returns
    -------
    int
        Always 0.
    """

    for year in years:

        for month in months:

            start = timer()
            print('Year:', year)
            print('Month:', month)

            input_files = get_input_files(path_to_input_files,year,month)
            print('# Input Files:', len(input_files))

            output_file = get_output_file(version_tweets,year,month)
            print('Output File:', output_file)

            if not len(input_files):

                print('No Input File.')
                print()
                continue

            output_path = path_to_output_files+output_file

            if os.path.exists(output_path):

                print('Output File Already Exists.')
                print()
                continue

            print('Parse Tweets...')
            with mp.Pool() as pool:
                tweets = pd.concat(pool.map(parse_tweets, input_files))
            print('# Tweets:', tweets.shape[0])

            print('Save Tweets...')
            # Write under a temporary name and rename afterwards: an interrupted
            # save must not leave a truncated pickle that later runs would skip
            # as 'Output File Already Exists.'.
            partial_path = output_path+'.tmp'
            tweets.to_pickle(partial_path)
            os.replace(partial_path, output_path)
            # Release the month's frame before the next iteration builds a new one.
            del tweets
            print('Done!')

            end = timer()
            print('Computing Time:', round(end - start), 'sec')
            print()

    return 0

In [14]:
# Run the full year/month sweep; main() returns 0, which the notebook echoes as the cell result.
main()

Year: 2012
Month: 01
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-01-v6.pkl
No Input File.

Year: 2012
Month: 02
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-02-v6.pkl
No Input File.

Year: 2012
Month: 03
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-03-v6.pkl
No Input File.

Year: 2012
Month: 04
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-04-v6.pkl
No Input File.

Year: 2012
Month: 05
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-05-v6.pkl
No Input File.

Year: 2012
Month: 06
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-06-v6.pkl
No Input File.

Year: 2012
Month: 07
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-07-v6.pkl
No Input File.

Year: 2012
Month: 08
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-08-v6.pkl
No Input File.

Year: 2012
Month: 09
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-

0

In [15]:
# tweets = pd.read_pickle(
# './data/decahose/parsed/geolocated-tweets-'+version_tweets+'/tweets-from-decahose-year-2016-month-12-'+version_tweets+'.pkl')