In [2]:
import os
import sys
from timeit import default_timer as timer
import socket

from pyspark.sql.functions import udf, desc, row_number, col
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

# Config

In [4]:
# Decide once whether we are on the local development machine or on the cluster.
# The local machine is recognised by the substring 'samuel' in its hostname.
hostname = socket.gethostname()
is_local = 'samuel' in hostname.lower()

try:
    spark  # reuse a session that is already defined in the kernel namespace
except NameError:
    builder = SparkSession.builder
    if is_local:
        print('Create Local SparkSession')
        # Local runs pin the driver host to localhost.
        builder = builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = builder.appName("join-users-by-account-location").getOrCreate()

print('Hostname:', hostname)

# Input directory: relative path on the local machine, absolute path on the cluster.
path_to_users = (
    '../data/decahose/parsed/users/user-ids-account-location-partitioned/'
    if is_local
    else '/user/spf248/twitter/data/decahose/parsed/users/user-ids-account-location-partitioned/'
)

Hostname: Samuels-MacBook-Pro.local


# Import Data

In [5]:
print('Import')
# Read the parquet dataset found under path_to_users into a Spark DataFrame.
df = spark.read.format('parquet').load(path_to_users)

Import


AnalysisException: 'Path does not exist: file:/Users/samuelfraiberger/Dropbox/Work/Projects/twitter/data/decahose/parsed/users/user-ids-account-location-partitioned;'

In [7]:
print('Union Sets of Users by Account Location:')

# Build one de-duplicated array of user ids per account location.
# F.sum() is a numeric aggregate: on scalar ids it would add the ids together
# and on array ids it fails analysis, so it cannot be used to union ids.
if isinstance(df.schema["user_id"].dataType, ArrayType):
    # Rows already carry a list of ids: concatenate the lists of each location
    # and drop duplicates (flatten / array_distinct require Spark >= 2.4).
    # This branch also keeps the cell idempotent when it is re-run on its own output.
    user_ids = F.array_distinct(F.flatten(F.collect_list("user_id")))
else:
    # Rows carry a single id: collect the distinct ids of each location.
    user_ids = F.collect_set("user_id")

df = df.groupby("user_location").agg(user_ids.alias('user_id'))

Group Lists of Users by Account Location:


In [None]:
# count() is a Spark action: it executes the pending plan and returns the number of rows in df.
n_locations = df.count()
print('# Locations:', n_locations)

In [8]:
# df.show(50)  # preview only; show() prints the table itself and returns None, so do not wrap it in print()

+--------------------+--------------------+
|       user_location|             user_id|
+--------------------+--------------------+
|  Caracas, Venezuela|[122252122, 41893...|
|    الدوحة, دولة قطر|[8111604633126174...|
|      Arequipa, Peru|[2331191766, 2288...|
| Brisbane, Australia|[305328362, 21836...|
|            England |[634877011, 69852...|
|     Gainesville, FL|[4628365815, 1010...|
|        Dos Hermanas|[174784862, 16563...|
|                Utah|[156336275, 19429...|
| Bayern, Deutschland|[3390705670, 7508...|
|Ferrara, Emilia R...|[748290951030902784]|
|        Virginia USA|[46503682, 289866...|
|    Florencio Varela|[364920202, 23436...|
|           Campinas |[135674103, 37131...|
|    College Park, GA|[2388206209, 8053...|
|       Jonesboro, AR|[370357130, 26916...|
|       St George, UT|[1079062922, 8575...|
|             türkiye|[195472635, 36772...|
|            yokohama|[1699442040, 1255...|
|      Luton, England|[31709525, 989204...|
|       Côte d'Ivoire|[257021832

In [9]:
print('SAVE TO PARQUET')

# The result is written next to the input directory: path_to_users ends with '/',
# so appending '../' steps up one level before naming the output folder.
output_path = path_to_users+'../user-ids-by-account-location'

# Time the write; any existing output at that location is overwritten.
start = timer()
df.write.parquet(output_path, mode="overwrite")
end = timer()

elapsed_sec = round(end - start)
print('DONE IN', elapsed_sec, 'SEC')

SAVE TO PARQUET
DONE IN 41 SEC
