In [1]:
import os
import socket
from timeit import default_timer as timer
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Reuse the `spark` session when the kernel already provides one;
# otherwise build (or fetch) a session with an empty application name.
try:
    spark
except NameError:
    session_builder = SparkSession.builder.appName("")
    spark = session_builder.getOrCreate()

In [5]:
# --- Run configuration -------------------------------------------------
country_code = "US"
language_code = "en"
print('Country:', country_code)
print('Language:', language_code)

# One 0/1 indicator column is produced later for each of these labels.
labels = ['loss', 'unemployed', 'search', 'hire', 'offer']

# Root data directory and the locations derived from it.
path_to_data = '/user/spf248/twitter/data'
path_to_keywords = os.path.join(
    path_to_data, 'keywords', 'labor', country_code
)
path_to_predictions = os.path.join(
    path_to_data, 'classification', country_code, 'keywords'
)

Country: US
Language: en


In [None]:
# Handles into the JVM-side Hadoop API exposed through the Spark context.
hadoop = spark.sparkContext._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration()

# Glob pattern for the per-country timeline extracts (snappy-compressed ORC).
timeline_paths = hadoop.fs.Path(
    os.path.join(
        path_to_data, 'timelines', '*', 'extract',
        country_code, '*', '*', '*.snappy.orc',
    )
)

# Expand the glob and remove the 'hdfs://dumbo' substring from each
# matched path, leaving a plain list of path strings.
matched_statuses = fs.get(conf).globStatus(timeline_paths)
timeline_paths = [
    str(status.getPath()).replace('hdfs://dumbo', '')
    for status in matched_statuses
]
print('Files:', len(timeline_paths))

In [None]:
# Load every timeline file, then clean in a single chain:
#   1. keep one row per tweet_id
#   2. keep only tweets whose tweet_lang equals language_code
#   3. drop tweets whose text contains 'RT @' (presumably retweets)
#   4. lowercase the text
#   5. add string-typed year and month columns derived from created_at
df = (
    spark.read.orc(timeline_paths)
    .drop_duplicates(subset=['tweet_id'])
    .where(F.col('tweet_lang') == language_code)
    .where(~F.col('text').contains('RT @'))
    .withColumn('text', F.lower(F.col('text')))
    .withColumn('year', F.year('created_at').cast("string"))
    .withColumn('month', F.month('created_at').cast("string"))
)

# Each count below triggers a Spark job over the chain defined above.
print('Tweets:', df.count())
print('Users:', df.select('user_id').distinct().count())

In [None]:
# For every label, read its keyword file and add a 0/1 column telling
# whether the (lowercased) tweet text matches any of the keywords.
for label in labels:
    keywords_file = os.path.join(path_to_keywords, country_code + '-' + label + '.txt')
    keyword_column = spark.read.csv(keywords_file).toPandas()['_c0']
    targets = sorted(set(keyword_column))
    # Keywords are inserted as-is (not escaped), each wrapped in a group
    # and joined into a single alternation.
    regex = "|".join("(" + target.lower() + ")" for target in targets)
    df = df.withColumn(label, F.col('text').rlike(regex).cast("int"))

# Columns that are not kept in the saved output.
df = df.drop('text', 'tweet_lang', 'place_id', 'tweet_longitude', 'tweet_latitude')

In [None]:
# Write the labelled tweets as ORC, partitioned by year and month,
# overwriting any existing output, and report the wall-clock duration.
start = timer()
(
    df.write
    .partitionBy("year", "month")
    .mode("overwrite")
    .format("orc")
    .save(path_to_predictions)
)
end = timer()
print('DONE IN', round(end - start), 'SEC')