In [None]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [None]:
# Work out once whether this is the local workstation or the cluster;
# both the session configuration and the data path depend on it.
host_name = socket.gethostname()
on_local_machine = 'samuel' in host_name.lower()

# Reuse a `spark` session already defined in the kernel, otherwise build one.
try:
    spark
except NameError:
    session_builder = SparkSession.builder
    if on_local_machine:
        print('Create Local SparkSession')
        # Local runs pin the driver host to localhost.
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName(
        "get-verified-users-by-acccount-location").getOrCreate()

print('Hostname:', host_name)

# Relative path on the local machine, absolute path on the cluster.
path_to_data = '../../data/users' if on_local_machine else '/user/spf248/twitter/data/users'

In [None]:
# Read the JSON dataset stored under 'user-ids-by-account-location' and mark it
# for caching, since it is reused by several later cells.
account_location_path = os.path.join(path_to_data, 'user-ids-by-account-location')
users_by_account_location = spark.read.json(account_location_path)
users_by_account_location.cache()

In [None]:
# Read the JSON dataset stored under 'users-profile' and mark it for caching.
users_profile_path = os.path.join(path_to_data, 'users-profile')
users_profile = spark.read.json(users_profile_path)
users_profile.cache()

In [None]:
# Collapse the profiles to one row per location: rename `location` to
# `user_location` (the join key used later) and gather the distinct `id_str`
# values of each location into the array column `verified_id`.
users_profile = (
    users_profile
    .selectExpr('location as user_location', 'id_str as verified_id')
    .groupBy('user_location')
    .agg(F.collect_set('verified_id').alias('verified_id'))
)

In [None]:
# Keep Verified Users At Identified Locations
# (default inner join: locations missing from either side are dropped)
users=users_by_account_location.join(users_profile,on='user_location')

# Keep Users Who Remained At Same Location Between Lookup And Sampling
# The return type must be declared explicitly: `udf` defaults to StringType,
# which would turn the Python list into its string representation. `common_id`
# would then be a string, `n_common` below would count characters instead of
# users, and the exported `user_id` column would no longer be an array.
# NOTE(review): assumes both id columns are arrays of strings - confirm against
# the input schemas.
intersectCols=udf(lambda x, y: list(set(x).intersection(set(y))), ArrayType(StringType()))
users=users.withColumn("common_id", intersectCols(col("user_id"), col("verified_id")))

# Count Users
# `size_` returns the number of elements of an array column; it is also used by
# later cells, so it stays defined here.
size_=udf(lambda xs: len(xs), IntegerType())
users=users.withColumn('n_users',size_('user_id'))
users=users.withColumn('n_verified',size_('verified_id'))
users=users.withColumn('n_common',size_('common_id'))

In [None]:
# Report the row count of the location table before and after the join.
for count_label, counted_frame in (
    ('# Locations:', users_by_account_location),
    ('# Locations with Verified Users:', users),
):
    print(count_label, counted_frame.count())

In [None]:
# Each total is a global aggregate: `groupBy()` with no keys followed by `sum()`
# yields a single row whose first field is the sum of the selected column.
# All four totals are collected straight from the DataFrame; the former
# `.rdd.map(lambda x:x)` identity step only added a needless conversion to a
# Python RDD and did not change the result.
print('# Users:',
      users_by_account_location.withColumn('n_users',size_('user_id'))
      .select('n_users').groupBy().sum().collect()[0][0])
print('# Verified Users:',
      users_profile.withColumn('n_verified',size_('verified_id'))
      .select('n_verified').groupBy().sum().collect()[0][0])
print('# Verified Users at identified locations:',
      users.select('n_verified').groupBy().sum().collect()[0][0])
print('# Users who remained at identified locations:',
      users.select('n_common').groupBy().sum().collect()[0][0])

In [None]:
# Keep only the location, the common ids (exported as `user_id`) and their
# count (exported as `n`).
users=users.selectExpr('user_location','common_id as user_id','n_common as n')

# Build the output path with os.path.join, like the input paths above.
# `path_to_data` has no trailing separator, so plain string concatenation wrote
# to '<...>/usersuser-ids-by-account-location-verified', i.e. next to the data
# directory instead of inside it.
output_path=os.path.join(path_to_data,'user-ids-by-account-location-verified')

# coalesce(1) reduces the frame to a single partition before writing;
# mode("overwrite") replaces any previous export at that path.
users.coalesce(1).write.mode("overwrite").json(output_path)

In [None]:
# Locations: 39779
# Locations with Verified Users: 39779
# Users: 194595170
# Verified Users at identified locations: 107325682
# Users who remained at identified locations: 92088032