In [16]:
import os
import sys
import socket
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, from_json
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [23]:
print('Running Time:')
# Two epoch timestamps in milliseconds, pasted in by hand
# (presumably the start and end of one run -- confirm where they came from).
run_finished_ms = 1564019038783
run_started_ms = 1564015512745
# Left as the last expression so the notebook displays the resulting timedelta.
datetime.fromtimestamp(run_finished_ms/1000) - datetime.fromtimestamp(run_started_ms/1000)

Running Time:


datetime.timedelta(0, 3526, 38000)

# Config

In [2]:
# Resolve the host once: it decides both how the session is built and
# which data locations are used.
hostname = socket.gethostname()
on_local_machine = (hostname == 'FAC38c9860d5a89')

try:
    spark  # reuse a session that is already bound in this namespace
except NameError:
    session_builder = SparkSession.builder
    if on_local_machine:
        print('Create Local SparkSession')
        # Only the local setup pins the driver host.
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName("get-users-by-account-location").getOrCreate()

print('Hostname:', hostname)
if on_local_machine:
    # Local: paths relative to the notebook
    path_to_tweets, path_to_locations = (
        '../data/decahose/parsed/tweets/',
        '../data/decahose/parsed/locations/',
    )
else:
    # Cluster: absolute paths
    path_to_tweets, path_to_locations = (
        '/user/spf248/twitter/data/decahose/parsed/tweets/',
        '/user/spf248/twitter/data/decahose/parsed/locations/',
    )

Create Local SparkSession
Hostname: FAC38c9860d5a89


# Load Data

In [3]:
print('Import:')
start = timer()

# Glob matching every bzip2-compressed block of every partition.
tweets_glob = path_to_tweets+'tweets-with-geocoordinates-or-place-from-decahose-partition-*-block-*.json.bz2'

# Same four reader options as before, passed in a single options() call.
json_reader = spark.read.options(
    compression='bzip2',
    multiLine='true',
    encoding='UTF-8',
    mode='PERMISSIVE',
)
tweets = json_reader.json(tweets_glob)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import:
Computing Time: 6 sec


In [None]:
print('Repartition')
# Redistribute the rows over 500 partitions before the transformations below.
tweets = tweets.repartition(numPartitions=500)

# Select Fields

In [4]:
def flatten(schema, prefix=None):
    """Return the dotted paths of all leaf fields of a nested schema.

    Args:
        schema: object exposing ``fields``; each field has ``name`` and
            ``dataType`` (e.g. a ``StructType``).
        prefix: dotted path of the enclosing struct, or a falsy value at
            the top level.

    Returns:
        list of str: one ``parent.child`` path per non-struct field, in
        schema order. A field whose type is an array is judged by its
        element type (one level only), so an array of structs is
        descended into.
    """
    paths = []
    for child in schema.fields:
        path = prefix + '.' + child.name if prefix else child.name

        # Look through a single array level to decide whether to recurse.
        inner = child.dataType
        if isinstance(inner, ArrayType):
            inner = inner.elementType

        if not isinstance(inner, StructType):
            paths.append(path)
            continue

        paths.extend(flatten(inner, prefix=path))

    return paths

In [5]:
# Keep only the place column and drop the rows where it is null.
tweets = tweets.select('place').filter(F.col("place").isNotNull())

# Flatten the nested structure: one column per leaf field of `place`.
tweets = tweets.select(*flatten(tweets.schema))

In [6]:
# printSchema() writes the schema tree to stdout itself and returns None, so
# passing it to print() emitted a stray "Schema: None" after the tree.
print("Schema:")
tweets.printSchema()

root
 |-- coordinates: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: double (containsNull = true)
 |-- type: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- place_type: string (nullable = true)
 |-- url: string (nullable = true)

Schema: None


In [7]:
print('GROUPBY ID')

# Columns for which the first value within each id group is kept,
# listed in the order they appear in the result.
first_value_columns = [
    'coordinates',
    'type',
    'country',
    'country_code',
    'full_name',
    'name',
    'place_type',
    'url',
]
aggregations = [F.first(tweets[name]).alias(name) for name in first_value_columns]
# Count of the id column per group, i.e. the number of observations of each place.
aggregations.append(F.count(tweets['id']).alias('n_obs'))

tweets = tweets.groupBy(tweets['id']).agg(*aggregations)

GROUPBY ID


In [7]:
print('Save')
start = timer()

# Write the aggregated places as parquet, replacing any previous output there.
places_output = path_to_locations+'places.parquet'
tweets.write.parquet(places_output, mode="overwrite")

end = timer()
print('Computing Time:', round(end - start), 'sec')

Save
Computing Time: 10 sec


In [None]:
# Hard stop so that the pandas comparison cells below are not executed when
# the notebook is run top to bottom.
# os._exit() skips normal interpreter cleanup, including flushing stdio
# buffers, so the message is flushed explicitly.
print('Done!', flush=True)
# Exit status 0: reaching this point means the run succeeded (a status of 1
# would be reported as a failure to whatever launched the process).
os._exit(0)

# Comparison With Pandas

In [8]:
import pandas as pd

In [9]:
# Collect the Spark result to the driver as a pandas frame, ordered by place id
# with a fresh 0..n-1 index so it can be compared row by row.
ds = (
    tweets
    .toPandas()
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [12]:
# Rebuild the per-place aggregation with pandas from a single raw block, as an
# independent reference for the Spark result.
path = '../data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-from-decahose-partition-0-block-0.json.bz2'

# dtype / date conversion are switched off so values are kept as parsed.
dp = pd.read_json(
    path,
    orient='records',
    dtype=False,
    precise_float=True,
    convert_dates=False)

# One row per non-null `place` entry; the keys of each entry become columns.
dp = pd.DataFrame(list(dp['place'].dropna()))

# Scalar place fields side by side with the expanded `bounding_box` entries
# (the aggregation below takes 'coordinates' and 'type' from the latter).
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
dp = pd.concat([
    dp[[
        # 'attributes', # empty
        'country',
        'country_code',
        'full_name',
        'name',
        'place_type',
        'id',
        'url',
        # 'bounding_box',
    ]],
    pd.DataFrame(list(dp['bounding_box']))], axis=1).copy()

# Each row is one observation; summing this column per id gives the count.
dp['n_obs'] = 1

dp = dp.groupby('id', as_index=False).agg(
    {'country': 'first',
     'country_code': 'first',
     'full_name': 'first',
     'name': 'first',
     'place_type': 'first',
     'url': 'first',
     'coordinates': 'first',
     'type': 'first',
     'n_obs': 'sum',
     })

In [13]:
# Preview of the Spark-derived frame.
ds.head()

Unnamed: 0,id,coordinates,type,country,country_code,full_name,name,place_type,url,n_obs
0,0000321b41466bc8,"[[[-71.381728, 41.914734], [-71.381728, 42.014...",Polygon,United States,US,"North Attleboro, MA",North Attleboro,city,https://api.twitter.com/1.1/geo/id/0000321b414...,2
1,0000940e4aa6e56a,"[[[47.985426, 29.264009], [47.985426, 29.28541...",Polygon,دولة الكويت,KW,"زهراء, دولة الكويت",زهراء,city,https://api.twitter.com/1.1/geo/id/0000940e4aa...,2
2,0000ffd41ff0eaac,"[[[-75.990759, 4.63867], [-75.990759, 4.885687...",Polygon,Colombia,CO,"Pereira, Colombia",Pereira,city,https://api.twitter.com/1.1/geo/id/0000ffd41ff...,1
3,0001ed7e605ce519,"[[[-122.214616, 47.147327], [-122.214616, 47.2...",Polygon,United States,US,"Bonney Lake, WA",Bonney Lake,city,https://api.twitter.com/1.1/geo/id/0001ed7e605...,3
4,00028e77378d6f71,"[[[48.061341, 29.267485], [48.061341, 29.30934...",Polygon,Kuwait,KW,"Salwa, Kuwait",Salwa,city,https://api.twitter.com/1.1/geo/id/00028e77378...,1


In [14]:
# Preview of the pandas-derived frame, for visual comparison with `ds`.
dp.head()

Unnamed: 0,id,country,country_code,full_name,name,place_type,url,coordinates,type,n_obs
0,0000321b41466bc8,United States,US,"North Attleboro, MA",North Attleboro,city,https://api.twitter.com/1.1/geo/id/0000321b414...,"[[[-71.381728, 41.914734], [-71.381728, 42.014...",Polygon,2
1,0000940e4aa6e56a,دولة الكويت,KW,"زهراء, دولة الكويت",زهراء,city,https://api.twitter.com/1.1/geo/id/0000940e4aa...,"[[[47.985426, 29.264009], [47.985426, 29.28541...",Polygon,2
2,0000ffd41ff0eaac,Colombia,CO,"Pereira, Colombia",Pereira,city,https://api.twitter.com/1.1/geo/id/0000ffd41ff...,"[[[-75.990759, 4.63867], [-75.990759, 4.885687...",Polygon,1
3,0001ed7e605ce519,United States,US,"Bonney Lake, WA",Bonney Lake,city,https://api.twitter.com/1.1/geo/id/0001ed7e605...,"[[[-122.214616, 47.147327], [-122.214616, 47.2...",Polygon,3
4,00028e77378d6f71,Kuwait,KW,"Salwa, Kuwait",Salwa,city,https://api.twitter.com/1.1/geo/id/00028e77378...,"[[[48.061341, 29.267485], [48.061341, 29.30934...",Polygon,1


In [15]:
print('Check Differences Per Column:')
# The loop variable is deliberately not named `col`: that name is bound to
# pyspark.sql.functions.col by the import cell, and rebinding it to a string
# here would break any later re-run of the cells that call col(...).
for column in dp.columns:
    # Compare string renderings so list-valued cells (coordinates) can be compared too.
    n_diff = (ds[column].astype(str) != dp[column].astype(str)).sum()
    print('Column', column, '-> value differences:', n_diff)

Check Differences Per Column:
Column id -> value differences: 0
Column country -> value differences: 0
Column country_code -> value differences: 0
Column full_name -> value differences: 0
Column name -> value differences: 0
Column place_type -> value differences: 0
Column url -> value differences: 0
Column coordinates -> value differences: 0
Column type -> value differences: 0
Column n_obs -> value differences: 0
