In [2]:
import os
import sys
from timeit import default_timer as timer
import socket
import pandas as pd

from pyspark.sql.functions import udf, desc, row_number, col
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [3]:
# Scratch calculation: difference of two integer constants divided by 1000*3600.
# NOTE(review): presumably two epoch-millisecond timestamps converted to elapsed hours — confirm.
(1576713912537-1576708557006)/(1000*3600)

1.4876475

# Config

In [15]:
# Decide once whether this is the local machine (hostname contains 'samuel');
# every environment-specific choice below keys off this flag.
hostname = socket.gethostname()
on_local_machine = 'samuel' in hostname.lower()

# Reuse a `spark` session already bound in the kernel; only build one when the
# name is undefined.
try:
    spark
except NameError:
    builder = SparkSession.builder
    if on_local_machine:
        print('Create Local SparkSession')
        # Local runs pin the driver host to localhost.
        builder = builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = builder.appName("get-users-by-account-location").getOrCreate()

print('Hostname:', hostname)

# Input locations: relative paths on the local machine, absolute paths otherwise.
path_to_users, path_to_locations = (
    ('../../data/users/user-id-and-location/',
     '../../data/locations/profiles/')
    if on_local_machine else
    ('/user/spf248/twitter/data/users/user-id-and-location/',
     '/user/spf248/twitter/data/locations/profiles/')
)

Hostname: Samuels-MBP


# Import Data

In [22]:
print('Import User Id and Location:')

# All three CSV columns are read as nullable strings; '_c0' is dropped right
# after loading.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('_c0', 'USER ID', 'USER LOCATION')
])

# Read every bzip2-compressed partition/block file matching the glob, then
# discard '_c0' and rename the remaining columns to snake_case.
df = (
    spark.read
    .option('compression', 'bzip2')
    .option('header', 'true')
    .option("multiLine", "true")
    .option("mode", "DROPMALFORMED")
    .option('escape', '"')
    .option("encoding", "UTF-8")
    .schema(schema)
    .csv(path_to_users + 'user-id-and-location-from-decahose-partition-*-block-*.csv.bz2')
    .drop('_c0')
    .toDF('user_id', 'user_location')
)

Import User Id and Location:


In [None]:
# Redistribute df into 1000 partitions before the join and aggregations in later cells.
print('REPARTITION')
df = df.repartition(1000)

In [None]:
# Mark the repartitioned df for caching. cache() is lazy, so nothing is
# materialised until an action runs; its return value is the cell's output.
print('CACHE')
df.cache()

In [4]:
print('Import Identified Locations')

# Only the 'user_location' column of the lookup file is kept; it is used later
# to restrict the user table to identified locations.
locations_csv = path_to_locations + 'account-locations.csv'
identified_locations = (
    spark.read
    .option('header', 'true')
    .option("multiLine", "true")
    .csv(locations_csv)
    .select('user_location')
)

Import Identified Locations


# Process Data

In [5]:
print('Select Rows in Identified Locations')
# Keep only rows whose user_location appears in identified_locations.
# A left-semi join is a pure filter: each df row is emitted at most once, even
# when the lookup table lists the same location several times. An inner join
# would duplicate such rows and inflate the per-location counts computed in the
# next step. The result keeps df's own columns (user_id, user_location); later
# cells reference columns by name, so column order does not matter.
df = df.join(identified_locations, on=['user_location'], how='left_semi')

Select Rows in Identified Locations


In [6]:
print('Select Most Frequent Location Per User')
# Count how often each (user, location) pair occurs.
df = df.groupBy('user_id', 'user_location').count()
# Rank each user's locations by frequency. The secondary key on user_location
# makes ties deterministic: ordering by count alone lets row_number() pick an
# arbitrary winner among equally frequent locations, so results could differ
# from run to run.
window = Window.partitionBy("user_id").orderBy(desc("count"), col("user_location"))
# Keep the top-ranked location per user and drop the helper columns.
df = (
    df.withColumn('order', row_number().over(window))
    .where(col('order') == 1)
    .drop('count', 'order')
)

Select Most Frequent Location Per User


In [7]:
print('Group Lists of Users by Account Location:')
# One output row per location, holding the set of distinct user ids assigned to
# it; the aggregated column is still called 'user_id'.
user_ids_per_location = F.collect_set("user_id").alias('user_id')
df = df.groupBy("user_location").agg(user_ids_per_location)

Group Lists of Users by Account Location:


In [8]:
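# Preview (disabled). show() prints the table itself and returns None, so it
# should not be wrapped in print().
# df.show(50)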

+--------------------+--------------------+
|       user_location|             user_id|
+--------------------+--------------------+
|  Caracas, Venezuela|[122252122, 41893...|
|    الدوحة, دولة قطر|[8111604633126174...|
|      Arequipa, Peru|[2331191766, 2288...|
| Brisbane, Australia|[305328362, 21836...|
|            England |[634877011, 69852...|
|     Gainesville, FL|[4628365815, 1010...|
|        Dos Hermanas|[174784862, 16563...|
|                Utah|[156336275, 19429...|
| Bayern, Deutschland|[3390705670, 7508...|
|Ferrara, Emilia R...|[748290951030902784]|
|        Virginia USA|[46503682, 289866...|
|    Florencio Varela|[364920202, 23436...|
|           Campinas |[135674103, 37131...|
|    College Park, GA|[2388206209, 8053...|
|       Jonesboro, AR|[370357130, 26916...|
|       St George, UT|[1079062922, 8575...|
|             türkiye|[195472635, 36772...|
|            yokohama|[1699442040, 1255...|
|      Luton, England|[31709525, 989204...|
|       Côte d'Ivoire|[257021832

In [9]:
print('SAVE')
# Output goes to a sibling directory of the users input folder, written as JSON;
# any existing output there is overwritten.
output_path = path_to_users + '../user-ids-by-account-location'

start = timer()
df.write.json(output_path, mode="overwrite")
end = timer()

# Report wall-clock duration of the write, rounded to whole seconds.
print('DONE IN', round(end - start), 'SEC')

SAVE TO PARQUET
DONE IN 41 SEC
