In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime
from glob import glob

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,when,count,col,count,lit,sum
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame

# Config

In [2]:
# Country code for this run; later cells use it as a path component when
# reading the timeline chunks / tweets and when writing the extract.
country_code = "US"
print('Country:', country_code)

Country: US


In [9]:
# Resolve the hostname once: it drives both the SparkSession flavour and the data root.
hostname = socket.gethostname()
on_local_machine = 'samuel' in hostname.lower()

try:
    # Keep using a session that is already defined in this kernel.
    spark
except NameError:
    builder = SparkSession.builder.appName("extract-timelines")
    if on_local_machine:
        print('Create Local SparkSession')
        # Local runs pin the driver host to localhost.
        builder = builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = builder.getOrCreate()

# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")

print('Hostname:', hostname)
# Relative path on the local machine, fixed cluster path otherwise.
path_to_data = '../../data' if on_local_machine else '/user/spf248/twitter/data'

Hostname: Samuels-MacBook-Pro.local


# Process Timelines

In [None]:
print('IMPORT')
# All parquet files one directory level below the country's chunk folder.
timelines_glob = os.path.join(path_to_data, 'timelines', 'chunks', country_code, '*/*.parquet')
timelines = spark.read.parquet(timelines_glob)

print('REPARTITION')
timelines = timelines.repartition(1000)

# timelines.cache()

print("DROP DUPLICATE IDS")
# Keep a single row per tweet_id.
timelines = timelines.drop_duplicates(subset=['tweet_id'])

print("LIST USERS WITH TIMELINES")
# Distinct user ids present in the timelines; reused below for counting and joining.
users = timelines.select("user_id").distinct()

users.cache()

In [None]:
# Number of de-duplicated timeline rows (one per tweet_id after drop_duplicates).
# NOTE(review): timelines is not cached (the cache() call is commented out),
# so this count presumably re-reads the parquet input — confirm if runtime matters.
print('# STATUSES:', timelines.count())

# Number of distinct user_id values found in the timelines.
print('# USERS:', users.count())

def count_not_null(c, nan_as_null=False):
    """Build an aggregate expression counting the usable values of column ``c``.

    A row is counted when ``c`` is not null; with ``nan_as_null=True`` the
    value must additionally not be NaN. The boolean predicate is cast to an
    integer (False -> 0, True -> 1) and summed.

    Parameters
    ----------
    c : str
        Name of the column to inspect.
    nan_as_null : bool, default False
        When True, NaN values are excluded from the count as well.

    Returns
    -------
    pyspark.sql.Column
        Aggregate column aliased to ``c``, suitable for ``DataFrame.agg``.
    """
    is_present = F.col(c).isNotNull()
    if nan_as_null:
        is_present = is_present & ~F.isnan(c)
    # Summing the 0/1 cast of the predicate yields the number of matching rows.
    return F.sum(is_present.cast("integer")).alias(c)

print('COUNT VALUES THAT ARE NON-NULL AND NON-NAN')

# Columns whose non-null / non-NaN counts are reported.
columns_to_check = [
    'tweet_id',
    'text',
    'tweet_lang',
    'user_id',
    'user_location',
    'place_id',
    'tweet_longitude',
    'tweet_latitude',
]

# DataFrame.show() prints the table itself and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" line after the table.
timelines.agg(*[count_not_null(c, True) for c in columns_to_check]).show()

# Join Decahose Tweets

In [None]:
# Load the country's tweets with an identified location.
tweets_dir = os.path.join(path_to_data, 'tweets/tweets-with-identified-location', country_code)
tweets = spark.read.parquet(tweets_dir)

tweets = tweets.repartition(1000)

# tweets.cache()

print('# TWEETS:', tweets.count())

# Join on user_id with the join's default type, restricting tweets to users that have a timeline.
tweets = tweets.join(users, on='user_id')

print('# TWEETS OF PANEL USERS:', tweets.count())

# Stack timelines and tweets by column name, then keep one row per tweet_id.
combined = timelines.unionByName(tweets)
df = combined.drop_duplicates(subset=['tweet_id'])

print('# STATUSES:', df.count())

In [None]:
print('SAVE')
start = timer()

# Write the merged statuses as parquet, replacing any previous extract for this country.
extract_dir = os.path.join(path_to_data, 'timelines', 'extract', country_code)
writer = df.write.mode("overwrite")
writer.parquet(extract_dir)

end = timer()
# Elapsed wall-clock time of the write, rounded to whole seconds.
print('DONE IN', round(end - start), 'SEC')

# Split By Month and Year

In [6]:
# def month_year_iter( start_month, start_year, end_month, end_year ):
#     ym_start= 12*start_year + start_month - 1
#     ym_end= 12*end_year + end_month - 1
#     for ym in range( ym_start, ym_end ):
#         y, m = divmod( ym, 12 )
#         yield y, m+1

In [None]:
# print("EXTRACT YEAR AND MONTH")
# df=df.withColumn('year',year('created_at').cast("string"))
# df=df.withColumn('month',month('created_at').cast("string"))

In [None]:
# print('SAVE')
# start = timer()

# dates=list(month_year_iter(1,2012,1,2020))

# for i in range(len(dates)-1):
    
#     df.filter((df["year"]==dates[i][0])&(df["month"]==dates[i][1])).drop('year','month').coalesce(1).write.mode(
#     "overwrite").parquet(os.path.join(path_to_data,'timelines','extract',country_code,str(dates[i][0])+'-'+str(dates[i][1])))

# end = timer()
# print('DONE IN', round(end - start), 'SEC')

Country: US
Create Cluster SparkSession
Hostname: compute-1-7.local
IMPORT
REPARTITION
DROP DUPLICATE IDS
LIST USERS WITH TIMELINES
STATUSES: 4550792893
USERS: 5770200
COUNT VALUES THAT ARE NON-NULL AND NON-NAN
+----------+----------+----------+----------+-------------+---------+---------------+--------------+
|  tweet_id|      text|tweet_lang|   user_id|user_location| place_id|tweet_longitude|tweet_latitude|
+----------+----------+----------+----------+-------------+---------+---------------+--------------+
|4550792893|4550792891|4550792893|4550792893|   4550792893|159080591|       61041033|      61041033|
+----------+----------+----------+----------+-------------+---------+---------------+--------------+

TWEETS: 3908098415
TWEETS OF PANEL USERS: 481753540
STATUSES: 4885224647
SAVE
DONE IN 13672 SEC
Computing Time: 0.18

Country: AR
Create Cluster SparkSession
Hostname: compute-1-9.local
IMPORT
REPARTITION
DROP DUPLICATE IDS
LIST USERS WITH TIMELINES
STATUSES: 2038466535
USERS: 2353925
COUNT VALUES THAT ARE NON-NULL AND NON-NAN
+----------+----------+----------+----------+-------------+--------+---------------+--------------+
|  tweet_id|      text|tweet_lang|   user_id|user_location|place_id|tweet_longitude|tweet_latitude|
+----------+----------+----------+----------+-------------+--------+---------------+--------------+
|2038466535|2038466534|2038466535|2038466535|   2038466535|49475184|       14739186|      14739186|
+----------+----------+----------+----------+-------------+--------+---------------+--------------+

TWEETS: 468879229
TWEETS OF PANEL USERS: 194885800
STATUSES: 2181644169
SAVE
DONE IN 817 SEC


Country: CO
Create Cluster SparkSession
Hostname: compute-2-5.local
IMPORT
REPARTITION
DROP DUPLICATE IDS
LIST USERS WITH TIMELINES
STATUSES: 845684609
USERS: 1560089
COUNT VALUES THAT ARE NON-NULL AND NON-NAN
+---------+---------+----------+---------+-------------+--------+---------------+--------------+
| tweet_id|     text|tweet_lang|  user_id|user_location|place_id|tweet_longitude|tweet_latitude|
+---------+---------+----------+---------+-------------+--------+---------------+--------------+
|845684608|845684607| 845684608|845684608|    845684608|20974654|       11754234|      11754234|
+---------+---------+----------+---------+-------------+--------+---------------+--------------+

TWEETS: 190026981
TWEETS OF PANEL USERS: 83965996
STATUSES: 898531831
SAVE
DONE IN 489 SEC