In [None]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,when,count,col,count,lit,sum
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame

# Config

In [None]:
# Country whose data this notebook processes. The value is used as a path
# component when reading the timelines and tweets and when writing the counts.
country_code = "US"
print('Country:', country_code)

In [None]:
# Resolve the hostname once; it decides both how the SparkSession is built
# and which data root is used.
hostname = socket.gethostname()
running_locally = 'samuel' in hostname.lower()

# Reuse a session that already exists in the kernel; build one only if the
# name `spark` is not defined yet.
try:
    spark
except NameError:
    session_builder = SparkSession.builder
    if running_locally:
        print('Create Local SparkSession')
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName("extract-timelines").getOrCreate()

# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")

print('Hostname:', hostname)
# Relative data directory on the local machine, absolute user directory otherwise.
path_to_data = '../../data' if running_locally else '/user/spf248/twitter/data'

In [None]:
print('IMPORT USERS PROFILE')
users_profile_path = os.path.join(path_to_data, 'users', 'users-profile')
# Keep only the user id and the tweet count reported in the profile, renaming
# `id_str` to `user_id` so it matches the join key used by the later cells.
users_profile = (
    spark.read.json(users_profile_path)
    .select('id_str', 'statuses_count')
    .toDF('user_id', 'statuses_count')
)
users_profile.cache()

In [None]:
print('IMPORT TIMELINES')
timelines_glob = os.path.join(path_to_data, 'timelines', 'chunks', country_code, '*/*.parquet')
timelines = spark.read.parquet(timelines_glob)
# Hash-partition on the dedup key instead of round-robin: drop_duplicates has
# to co-locate equal tweet_ids anyway, so partitioning by tweet_id up front
# gives it the layout it needs rather than shuffling the full data a second time.
timelines = timelines.repartition(1000, 'tweet_id')
timelines = timelines.drop_duplicates(subset=['tweet_id'])

print('COUNT TWEETS PER USER')
# Alias the aggregate directly so the column name does not rely on the
# positional order of the aggregation output.
counts_timelines = timelines.groupby('user_id').agg(
    F.count('tweet_id').alias('timeline_count')
)
# Cached because it is reused by the tweets join and by the final save.
counts_timelines.cache()
print('# TIMELINE COUNTS:', counts_timelines.count())

In [None]:
print('IMPORT TWEETS')
tweets_path = os.path.join(path_to_data, 'tweets/tweets-with-identified-location', country_code)
tweets = spark.read.parquet(tweets_path).repartition(1000)

print('MERGE DECAHOSE TWEETS AND TIMELINES')
# Inner join on the timeline user ids: only tweets from users that have a
# timeline are kept.
timeline_user_ids = counts_timelines.select('user_id')
tweets = tweets.join(timeline_user_ids, on='user_id')
# Stack both sources by column name, then keep one row per tweet_id so a tweet
# present in both is counted once.
combined = timelines.unionByName(tweets).drop_duplicates(subset=['tweet_id'])

print('COUNT TWEETS PER USER')
counts_combined = combined.groupby('user_id').agg(
    F.count('tweet_id').alias('combined_count')
)
counts_combined.cache()
print('# COMBINED COUNTS:', counts_combined.count())

In [None]:
print('SAVE')
start = timer()

counts_path = os.path.join(path_to_data, 'timelines', 'counts', country_code)
# One row per user present in all three inputs: profile count, timeline count
# and combined (timeline + tweets) count.
df = (
    users_profile
    .join(counts_timelines, on='user_id')
    .join(counts_combined, on='user_id')
)
df.write.mode("overwrite").parquet(counts_path)

end = timer()
elapsed_seconds = round(end - start)
print('DONE IN', elapsed_seconds, 'SEC')