In [1]:
import os
import sys
import lzma
import ujson as json
import numpy as np
import pandas as pd
import multiprocessing as mp
from timeit import default_timer as timer

In [None]:
######################## Params #############################

# Version tag embedded in the output directory and in every output file name.
version_tweets = 'v4'
# Directory holding the xz-compressed input files (one JSON document per line).
path_to_input_files  = './data/decahose/json/'
# path_to_input_files  = '/net/twitter/gardenhose-data/json/'
# Directory receiving the parsed, pickled DataFrames for this version.
path_to_output_files = './data/decahose/parsed/tweets/tweets-'+version_tweets+'/'

# makedirs (unlike mkdir) also creates any missing parent directory, and
# exist_ok=True makes the call a no-op when the directory is already there,
# so the cell can be re-run safely on a fresh checkout or an existing one.
os.makedirs(path_to_output_files, exist_ok=True)

#############################################################

In [2]:
# Create Output File
def get_output_file(version_tweets,year=None,month=None):
    """Build the name of the pickle file for one (year, month) batch.

    Parameters
    ----------
    version_tweets : str
        Version tag appended just before the '.pkl' extension.
    year, month : optional
        When truthy, each is converted with ``str`` and inserted as a
        '-year-<year>' / '-month-<month>' segment; falsy values are omitted.

    Returns
    -------
    str
        File name only (no directory component).
    """
    name_parts = ['tweets-from-decahose']

    if year:
        name_parts.append('-year-'+str(year))

    if month:
        name_parts.append('-month-'+str(month))

    name_parts.append('-'+version_tweets+'.pkl')

    return ''.join(name_parts)

# Select Input Files
def get_input_files(path_to_input_files,year=None,month=None):
    """List the '.xz' files of a directory, optionally filtered by year/month.

    File names are split on both '-' and '.'; the third field is compared
    with ``year`` and the fourth with ``month``. Names that do not have
    enough fields simply never match a filter (they used to raise
    ``IndexError`` and abort the whole listing).

    Parameters
    ----------
    path_to_input_files : str
        Directory to scan (non-recursive).
    year, month : str, optional
        Exact field values to keep, e.g. '2017' and '01'. Falsy values
        disable the corresponding filter.

    Returns
    -------
    numpy.ndarray
        The selected file names in random order (unseeded, so the order
        differs between calls).
    """

    def _name_field(file_name, position):
        # Field at `position` of the '-'/'.'-separated name, or None if absent.
        fields = file_name.replace('.','-').split('-')
        return fields[position] if len(fields) > position else None

    input_files = [name for name in os.listdir(path_to_input_files) if name.endswith('.xz')]

    if year:
        input_files = [name for name in input_files if _name_field(name, 2)==year]

    if month:
        input_files = [name for name in input_files if _name_field(name, 3)==month]

    # Randomize For Parallel Processing
    return np.random.permutation(input_files)

def parse_tweets(xz_file, input_dir=None):
    """Extract the tweets carrying their own coordinates from one xz file.

    The file is read line by line. A line is considered only if it contains
    the byte pattern '"coordinates":{', parses as a JSON object, and has a
    non-empty top-level 'coordinates' entry (a match that exists only in a
    nested object, e.g. a retweeted status, is discarded).

    Lines that cannot be decoded/parsed, or whose coordinates payload does
    not hold at least two values, are skipped instead of aborting the file.
    A 'user' or 'extended_tweet' entry that is present but null is treated
    like a missing one (the derived columns are None).

    Parameters
    ----------
    xz_file : str
        Name of the xz-compressed file, relative to ``input_dir``.
    input_dir : str, optional
        Directory containing ``xz_file``. Defaults to the notebook-level
        ``path_to_input_files`` setting.

    Returns
    -------
    pandas.DataFrame
        One row per selected tweet; LAT is taken from index 1 and LON from
        index 0 of the coordinates pair.
    """
    if input_dir is None:
        # Keep the original one-argument call (used by pool.map) working.
        input_dir = path_to_input_files

    columns=[
    'TIME',
    'ID',
    'TEXT',
    'EXTENDED TEXT',
    'LANG',
    'LAT',
    'LON',
    'USER ID',
    'USER LOCATION',
    'USER UTC OFFSET',
    'USER TIME ZONE',
    'USER DESCRIPTION',
    'USER IMAGE URL',
    ]

    tweets  = []

    with lzma.open(os.path.join(input_dir, xz_file),'rb') as f:

        for line in f:

            # Only Select Tweets With Geocoordinates (Could Be in the Replies)
            # Cheap byte-level pre-filter so most lines are never JSON-parsed.
            if b'"coordinates":{' not in line:
                continue

            # Json Parsing Can Fail
            # UnicodeDecodeError and JSON decode errors are ValueError
            # subclasses; anything else (e.g. KeyboardInterrupt) propagates.
            try:
                tweet = json.loads(line.decode("utf-8"))
            except ValueError:
                continue

            if not isinstance(tweet, dict):
                continue

            # Only Selects If Geocoordinates in the Original Tweet (Not RT etc.)
            coordinates = tweet.get('coordinates',None)
            if not coordinates:
                continue

            # Skip payloads that do not provide a usable coordinate pair.
            try:
                lon, lat = coordinates['coordinates'][:2]
            except (KeyError, TypeError, ValueError):
                continue

            # `or {}` also covers keys that are present with a null value.
            user     = tweet.get('user') or {}
            extended = tweet.get('extended_tweet') or {}

            tweets.append([
            tweet.get('created_at',None),
            tweet.get('id_str',None),
            tweet.get('text',None),
            extended.get('full_text', None),
            tweet.get('lang',None),
            lat,
            lon,
            user.get('id_str',None),
            user.get('location',None),
            user.get('utc_offset',None),
            user.get('time_zone',None),
            user.get('description',None),
            user.get('profile_image_url',None),
            ])

    return pd.DataFrame(tweets, columns=columns)

def main():
    """Parse every (year, month) batch of input files into one pickle each.

    For each month of 2012-2018 the matching input files are parsed in
    parallel with a process pool, concatenated into a single DataFrame and
    saved as an xz-compressed pickle in ``path_to_output_files``. Months
    without input files, or whose output file already exists, are skipped.

    The pickle is first written to a '<name>.tmp' sibling and then moved
    into place, so an interrupted save cannot leave a truncated file that a
    later run would mistake for a finished batch.

    Returns
    -------
    int
        Always 0.
    """

    years  = ["%.2d" % i for i in range(2012,2019)]
    months = ["%.2d" % i for i in range(1,13)]

    # For a quick trial run, restrict the lists above, e.g.
    # years = ['2017'] and months = ['01'].

    for year in years:

        for month in months:

            start = timer()
            print('Year:', year)
            print('Month:', month)

            input_files = get_input_files(path_to_input_files,year,month)
            print('# Input Files:', len(input_files))

            output_file = get_output_file(version_tweets,year,month)
            print('Output File:', output_file)

            output_path = path_to_output_files+output_file

            if not len(input_files):

                print('No Input File.')
                print()
                continue

            if os.path.exists(output_path):

                print('Output File Already Exists.')
                print()
                continue

            print('Parse Tweets...')
            with mp.Pool() as pool:
                tweets = pd.concat(pool.map(parse_tweets, input_files)).reset_index(drop=True)
            print('# Tweets:', tweets.shape[0])

            print('Save Tweets...')
            # Write to a temporary name, then move into place, so the final
            # path only ever holds a completely written pickle.
            tmp_path = output_path+'.tmp'
            try:
                tweets.to_pickle(tmp_path,compression='xz')
                os.replace(tmp_path, output_path)
            finally:
                # Only still present if the save or the move failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            del tweets
            print('Done!')

            end = timer()
            print('Computing Time:', round(end - start), 'sec')
            print()

    return 0

In [3]:
# Wall-clock timing of the whole run (all years and months).
start = timer()

# Guard the entry point so main() runs only when this code is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()
    
end = timer()
# Elapsed time is rounded to whole seconds for display.
print('Total Computing Time:', round(end - start), 'sec')

Year: 2012
Month: 01
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-01-v4.pkl
No Input File.

Year: 2012
Month: 02
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-02-v4.pkl
No Input File.

Year: 2012
Month: 03
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-03-v4.pkl
No Input File.

Year: 2012
Month: 04
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-04-v4.pkl
No Input File.

Year: 2012
Month: 05
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-05-v4.pkl
No Input File.

Year: 2012
Month: 06
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-06-v4.pkl
No Input File.

Year: 2012
Month: 07
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-07-v4.pkl
No Input File.

Year: 2012
Month: 08
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-08-v4.pkl
No Input File.

Year: 2012
Month: 09
# Input Files: 0
Output File: tweets-from-decahose-year-2012-month-