Reference: 
    
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object

In [2]:
import re
import os
import sys
import socket
from glob import glob
import numpy as np
import lzma
import ujson as json
import pandas as pd
from datetime import datetime
from timeit import default_timer as timer

In [3]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable of the notebook lives in this cell.
# ---------------------------------------------------------------------------

# Kind of records written out; it names the output sub-directory.
# Alternative value: 'users'
data_type = 'tweets'

# Name of the extraction function applied to every input file.
# Alternative values: 'get_user_id_and_location',
#                     'get_tweets_with_geocoordinates_or_place'
get_data = 'get_tweets_with_identified_location'

# Extension of the output files (it also selects the csv or json writer).
# Alternative value: '.json.bz2'
ext = '.csv.bz2'

# Number of input files merged into one output file.
block_size = 3

path_to_locations='../../data/locations/profiles'

# Check The Decahose Files Integrity: date window of the input files and the
# exact number of files expected inside that window.
date_min = datetime(2011, 1, 1).date()
date_max = datetime(2018, 12, 31).date()
# Alternative window end: datetime(2019, 6, 30).date()
n_files  = 2815 # 2974 (presumably the count for the alternative window -- confirm)

# Report the active settings, in the same order as before.
for _label, _value in (
    ('Type of Data Collected:', data_type),
    ('Function Used To Collect Data:', get_data),
    ('Output Files Extension:', ext),
    ('# Input File Parsed per Output File:', block_size),
):
    print(_label, _value)

Type of Data Collected: tweets
Function Used To Collect Data: get_tweets_with_identified_location
Output Files Extension: .csv.bz2
# Input File Parsed per Output File: 1


In [5]:
# Load the LOCATION column of account-locations-identified.csv into a frozenset;
# get_tweets_with_identified_location tests user locations for membership in it.
# (The original line had unbalanced parentheses and indexed the path string
# instead of the DataFrame returned by read_csv.)
identified_locations = frozenset(
    pd.read_csv(
        os.path.join(path_to_locations, 'account-locations-identified.csv')
    )['LOCATION']
)
print('# Identified Locations:', len(identified_locations))

# Identified Locations: 39779


In [24]:
# Machines taking part in the run, in partition order. achtung02, achtung10
# and achtung11 are deliberately left out.
partition_hosts = (
    'achtung03',
    'achtung04',
    'achtung05',
    'achtung06',
    'achtung07',
    'achtung08',
    'achtung09',
    'achtung12',
    'achtung13',
    'achtung14',
    'achtung15',
    'achtung16',
    'achtung17',
)

# Each participating host gets the index of its position in the tuple above.
hostname2partition = {host: index for index, host in enumerate(partition_hosts)}
print('# Partitions:', len(hostname2partition))

hostname = socket.gethostname()
print('Hostname:', hostname)

# Hosts that are not listed (e.g. a local test machine) fall back to partition 0.
partition = hostname2partition.get(hostname,0)
print('Index Partition:', partition)

# Partitions: 16
Hostname: FAC38c9860d5a89
Index Partition: 0


In [25]:
def get_date_from_filename(filename):
    """Return the first YYYY-MM-DD date embedded in ``filename``.

    Parameters
    ----------
    filename : str
        File name or path containing a date such as ``2016-12-23``.

    Returns
    -------
    datetime.date
        The parsed date.

    Raises
    ------
    ValueError
        If no ``YYYY-MM-DD`` pattern is present, or if the matched digits do
        not form a valid calendar date.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', filename)

    # Without this guard a date-less name fails with an opaque
    # "'NoneType' object has no attribute 'group'".
    if match is None:
        raise ValueError('No YYYY-MM-DD date found in filename: ' + repr(filename))

    return datetime.strptime(match.group(), '%Y-%m-%d').date()

# get_date_from_filename(input_files[0])

In [26]:
# Create Array of Input Files
def get_input_files(partition,date_min=date_min,date_max=date_max,n_files=n_files):
    """Return the input files assigned to one partition.

    The files matching ``tweets.json.*.xz`` are filtered to the
    [date_min, date_max] window, checked against the expected count, shuffled
    with a fixed seed and split into ``len(hostname2partition)`` chunks; the
    chunk at index ``partition`` is returned.

    Parameters
    ----------
    partition : int
        Index of the chunk to return.
    date_min, date_max : datetime.date
        Inclusive bounds on the date found in each file name.
    n_files : int
        Number of files expected inside the date window.

    Returns
    -------
    numpy.ndarray
        Paths of the files in the selected chunk (possibly empty).

    Notes
    -----
    Calls ``sys.exit`` when no known input directory exists or when the number
    of files in the window differs from ``n_files``.
    """
    # Tweets Stored on Cluster
    if os.path.exists('/net/twitter/gardenhose-data/json/'):
        path_to_input_files  = '/net/twitter/gardenhose-data/json/'
    # For testing
    elif os.path.exists('../../data/decahose/'):
        path_to_input_files  = '../../data/decahose/'
    else:
        sys.exit('Incorrect working directory...exiting.')

    # Keep the files inside the date window; the chained comparison parses
    # each file name once instead of twice.
    input_files = [
        input_file
        for input_file in sorted(glob(path_to_input_files+'tweets.json.*.xz'))
        if date_min <= get_date_from_filename(input_file) <= date_max
    ]

    if len(input_files) != n_files:
        sys.exit('Check input files...exiting.')

    # Fixed seed: the shuffle is the same on every run. This reseeds NumPy's
    # global random state.
    np.random.seed(0)
    input_files = np.random.permutation(input_files)

    print('# Input Files:', len(input_files))
    # len() is used because the truth value of a NumPy array is ambiguous.
    if len(input_files):
        print('First File:', input_files[0])
        print('Last File:', input_files[-1])
    print()

    partitioned_files = np.array_split(input_files, len(hostname2partition))
    input_files = partitioned_files[partition]

    print('Partition List of Input Files:')
    print('# Files:', len(input_files))
    # With fewer files than partitions some chunks are empty; indexing them
    # used to raise IndexError.
    if len(input_files):
        print('First File:', input_files[0])
        print('Last File:', input_files[-1])

    return input_files

# Files this machine has to process: the chunk of the shuffled file list
# selected by `partition`. main() iterates over this module-level variable.
input_files = get_input_files(partition)

# Input Files: 1
First File: ../data/decahose/json/tweets.json.2016-12-23.xz
Last File: ../data/decahose/json/tweets.json.2016-12-23.xz

Partition List of Input Files:
# Files: 1
First File: ../data/decahose/json/tweets.json.2016-12-23.xz
Last File: ../data/decahose/json/tweets.json.2016-12-23.xz


In [27]:
# Create Path to Output File
def get_output_root(partition,data_type,get_data):
    """Build the path prefix shared by all output files of one partition.

    The directory ``../data/decahose/<data_type>/`` is created if missing.
    The file stem is ``get_data`` with every ``get_`` removed and underscores
    turned into dashes, followed by ``-from-decahose-partition-<partition>``.

    Returns
    -------
    str
        Directory plus file stem, without block number or extension.
    """
    output_dir = '../data/decahose/' + data_type + '/'
    os.makedirs(output_dir, exist_ok=True)

    # e.g. 'get_user_id_and_location' -> 'user-id-and-location'
    stem = get_data.replace('get_','').replace('_','-')

    return ''.join([output_dir, stem, '-from-decahose-partition-', str(partition)])

# Show the prefix used for this partition's output files. Calling
# get_output_root also creates the output directory.
print('Output Root:')
output_root_preview = get_output_root(partition,data_type,get_data)
print(output_root_preview)

Output Root:
../data/decahose/parsed/tweets/tweets-with-identified-location-from-decahose-partition-0


In [28]:
def get_tweets_with_identified_location(input_file, locations=None):
    """Collect the tweets whose author's profile location is in a known set.

    Parameters
    ----------
    input_file : str
        Path to an xz-compressed file with one JSON document per line.
    locations : collection of str, optional
        Profile locations to keep. When omitted, the notebook-level
        ``identified_locations`` set is used.

    Returns
    -------
    pandas.DataFrame
        One row per selected tweet with the columns ``created_at``,
        ``id_str``, ``lang``, ``text``, ``place_id``, ``tweet_longitude``,
        ``tweet_latitude``, ``user_id_str`` and ``user_location``.

    Notes
    -----
    Only lines that fail to parse as JSON are skipped. Missing optional keys
    (``truncated``, ``extended_tweet``, ``place``, ``coordinates``) yield
    ``None`` values; previously a blanket ``except`` silently dropped such
    tweets. A truncated tweet without ``extended_tweet.full_text`` falls back
    to its ``text``.
    """
    if locations is None:
        locations = identified_locations

    fields = [
    'created_at',
    'id_str',
    'lang',
    ]

    # Other user fields that can be enabled here: name, screen_name,
    # description, created_at, profile_image_url_https,
    # default_profile_image, time_zone.
    fields_user = [
    'id_str',
    'location',
    ]

    cols = fields+['text','place_id','tweet_longitude','tweet_latitude']+['user_'+x for x in fields_user]

    tweets = []

    with lzma.open(input_file,'rb') as f:

        for line in f:

            # Only Select Tweets With Account Location (Could Be in the Replies)
            # Cheap byte test that avoids parsing lines without the marker.
            if b'"location":"' not in line:
                continue

            # Json Parsing Can Fail
            try:
                tweet = json.loads(line)
            except ValueError:
                continue

            if not isinstance(tweet, dict):
                continue

            user = tweet.get('user')
            if not isinstance(user, dict):
                user = {}

            # Only Select Tweets With Identified Location
            if user.get('location', None) not in locations:
                continue

            # Prefer the full text of truncated tweets when it is available.
            text = tweet.get('text', None)
            if tweet.get('truncated'):
                text = (tweet.get('extended_tweet') or {}).get('full_text', text)

            place_id = (tweet.get('place') or {}).get('id', None)

            # The coordinate pair is stored as [longitude, latitude].
            point = (tweet.get('coordinates') or {}).get('coordinates') or ()
            longitude = point[0] if len(point) > 0 else None
            latitude = point[1] if len(point) > 1 else None

            tweets.append(
                [tweet.get(field, None) for field in fields]
                + [text, place_id, longitude, latitude]
                + [user.get(field, None) for field in fields_user]
            )

    return pd.DataFrame(tweets, columns=cols)

In [29]:
# There Could Be Some Duplicated Rows
def get_user_id_and_location(input_file):
    """Extract the author id and profile location of each candidate tweet.

    Parameters
    ----------
    input_file : str
        Path to an xz-compressed file with one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        Columns ``USER ID`` and ``USER LOCATION``, one row per parsed line
        that contains the ``"location":"`` marker. Rows are not
        de-duplicated. The marker may come from a nested object, in which
        case the top-level user's values (possibly ``None``) are recorded.

    Notes
    -----
    Only lines that cannot be decoded or parsed are skipped; the previous
    blanket ``except`` also hid unrelated errors and keyboard interrupts.
    A missing or null ``user`` yields a ``(None, None)`` row.
    """
    tweets = []

    with lzma.open(input_file,'rb') as f:

        for line in f:

            # Only Select Tweets With Account Location (Could Be in the Replies)
            if b'"location":"' not in line:
                continue

            # Json Parsing Can Fail (UnicodeDecodeError is a ValueError too)
            try:
                tweet = json.loads(line.decode("utf-8"))
            except ValueError:
                continue

            if not isinstance(tweet, dict):
                continue

            user = tweet.get('user')
            if not isinstance(user, dict):
                user = {}

            tweets.append([
            user.get('id_str',None),
            user.get('location',None),
            ])

    return pd.DataFrame(tweets,columns=['USER ID','USER LOCATION'])

In [30]:
def get_tweets_with_geocoordinates_or_place(input_file):
    """Collect the tweets that carry their own coordinates or place.

    Parameters
    ----------
    input_file : str
        Path to an xz-compressed file with one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per selected tweet, built from the full parsed JSON objects
        (columns are the union of their top-level keys).

    Notes
    -----
    Only lines that cannot be decoded or parsed are skipped; the previous
    blanket ``except`` also hid unrelated errors and keyboard interrupts.
    """
    tweets = []

    with lzma.open(input_file,'rb') as f:

        for line in f:

            # Only Select Tweets With Geocoordinates (Could Be in the Replies)
            # Cheap byte test that avoids parsing lines without either marker.
            if b'"coordinates":{' not in line and b'"place":{' not in line:
                continue

            # Json Parsing Can Fail (UnicodeDecodeError is a ValueError too)
            try:
                tweet = json.loads(line.decode("utf-8"))
            except ValueError:
                continue

            if not isinstance(tweet, dict):
                continue

            # Only Collect If Selected Data in the Original Tweet (Not RT etc.)
            if tweet.get('coordinates',None) or tweet.get('place',None):
                tweets.append(tweet)

    return pd.DataFrame(tweets)

In [31]:
def main():
    """Parse this partition's input files and write one output file per block.

    Every ``block_size`` consecutive entries of ``input_files`` are parsed
    with the function named by ``get_data`` and saved together as
    ``<output root>-block-<k><ext>``. Files whose block output already exists
    are skipped, so an interrupted run can be restarted.

    Returns
    -------
    int
        Always 0.

    Notes
    -----
    Calls ``sys.exit`` when ``ext`` is neither a csv nor a json extension, or
    when ``get_data`` does not name a callable defined in this module.
    """
    # Fail before any parsing rather than after a whole block has been read.
    if 'csv' not in ext and 'json' not in ext:
        sys.exit('Extension error... exiting.')

    # Look the extraction function up by name instead of eval()-ing the string.
    extract = globals().get(get_data)
    if not callable(extract):
        sys.exit('Unknown extraction function: ' + str(get_data))

    # Frames of the current block; concatenated once, when the block is saved.
    frames = []
    n_tweets = 0

    for i, input_file in enumerate(input_files):

        start = timer()
        print()
        print('File', i)
        print(input_file)

        # Create an output file every <size> file
        output_file = get_output_root(partition,data_type,get_data)+'-block-'+str(i//block_size)+ext

        if os.path.exists(output_file):
            print('Output file', output_file, 'already exists.')
            continue

        frame = extract(input_file)
        frames.append(frame)
        n_tweets += frame.shape[0]
        print('# Tweets:', n_tweets)

        # Save if Next Index is a Multiple of <size> or Reading Last File
        if not (i+1)%block_size or i==len(input_files)-1:

            print('Save Output File:', output_file)
            _save_block(pd.concat(frames), output_file)

            frames = []
            n_tweets = 0

        end = timer()
        print('Computing Time:', round(end - start), 'Sec')

    return 0


def _save_block(frame, output_file):
    """Write ``frame`` to ``output_file`` so the file only appears once complete."""
    # The block is first written next to its destination under a hidden name
    # that keeps the extension (pandas infers the compression from it). A job
    # killed mid-write therefore never leaves a partial file that the
    # "already exists" check would mistake for a finished block.
    directory, basename = os.path.split(output_file)
    partial_file = os.path.join(directory, '.partial-' + basename)

    try:
        if 'csv' in ext:
            # Read back with: pd.read_csv(output_file, index_col=0, sep=',',
            #   dtype=object, na_filter=False, lineterminator='\n')
            try:
                frame.to_csv(partial_file, sep=',', lineterminator='\n')
            except TypeError:
                # pandas < 1.5 only knows the older keyword spelling.
                frame.to_csv(partial_file, sep=',', line_terminator='\n')

        elif 'json' in ext:
            # Read back with: pd.read_json(output_file, orient='records',
            #   dtype=False, convert_dates=False)
            frame.to_json(
                partial_file,
                orient='records',
                force_ascii=False,
                date_format=None,
                double_precision=15)

        else:
            sys.exit('Extension error... exiting.')

        os.replace(partial_file, output_file)

    finally:
        # After a successful rename nothing is left to remove; after a failure
        # the partial file is discarded.
        if os.path.exists(partial_file):
            os.remove(partial_file)

In [32]:
# Time the complete run. main() is only invoked when this code executes as the
# top-level module (which is the case inside a notebook kernel).
start = timer()

if __name__ == "__main__":
    main()

end = timer()
total_seconds = round(end - start)

print()
print('Total Computing Time:', total_seconds, 'Sec')


File 0
../data/decahose/json/tweets.json.2016-12-23.xz
# Tweets: 102024
Save Output File: ../data/decahose/parsed/tweets/tweets-with-identified-location-from-decahose-partition-0-block-0.csv.bz2
Computing Time: 43 Sec

Total Computing Time: 43 Sec


In [13]:
# data = pd.read_json(
# glob(get_output_root(partition,data_type,get_data)+'*'+ext)[0],
# orient='records',
# dtype=False,
# convert_dates=False)

In [33]:
# data = pd.read_csv(
# glob(get_output_root(partition,data_type,get_data)+'*'+ext)[0], 
# index_col=0, 
# sep=',', 
# dtype=object, 
# na_filter=False,
# lineterminator='\n')