In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime
from glob import glob
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,lower
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

# Config

In [2]:
# Run parameters: country whose timelines are processed and the tweet
# language that is kept.
country_code = "US"
language_code = "en"
print(f'Country: {country_code}')
print(f'Language: {language_code}')

# One keyword file per wordlist is read later from path_to_keywords.
wordlists = [
    'loss',
    'unemployed',
    'search',
    'hire',
    'offer',
]

Country: US
Language: en


In [3]:
# The hostname decides both how the SparkSession is configured and where
# the data lives: a host whose name contains 'samuel' is treated as the
# local machine, anything else as the cluster.
hostname = socket.gethostname()
is_local = 'samuel' in hostname.lower()

# Only build a session when the kernel does not already expose `spark`.
try:
    spark
except NameError:
    builder = SparkSession.builder
    if is_local:
        print('Create Local SparkSession')
        builder = builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = builder.appName("get-tweets-mentions").getOrCreate()

print('Hostname:', hostname)
# Local data root vs. cluster data root.
path_to_data = '../../data' if is_local else '/user/spf248/twitter/data'

# Input timelines, output mentions, and keyword lists for this country.
path_to_timelines = os.path.join(path_to_data, 'timelines', 'extract', country_code)
path_to_mentions = os.path.join(path_to_data, 'mentions')
path_to_keywords = os.path.join(path_to_data, 'keywords', 'labor', country_code)
for path in (path_to_timelines, path_to_mentions, path_to_keywords):
    print(path)

Hostname: Samuels-MacBook-Pro.local
../../data/timelines/extract/US
../../data/mentions
../../data/keywords/labor/US


# Data Processing

In [4]:
# Read the country's timelines from parquet.
print('Import Datasets')
reader = spark.read
df = reader.parquet(path_to_timelines)

# Spread the rows over a fixed number of partitions.
print("REPARTITION")
n_partitions = 1000
df = df.repartition(n_partitions)

# Mark the frame for caching; left as the cell's trailing expression so the
# notebook displays the DataFrame schema.
df.cache()

Import Datasets
REPARTITION


DataFrame[tweet_id: string, created_at: timestamp, text: string, tweet_lang: string, user_id: string, user_location: string, place_id: string, tweet_longitude: double, tweet_latitude: double]

In [5]:
# Keep a single row per tweet_id.
print("REMOVE DUPLICATES")
df = df.dropDuplicates(['tweet_id'])

# Keep only tweets whose tweet_lang equals the configured language.
print("SELECT LANGUAGE")
df = df.filter(col('tweet_lang') == language_code)

# Year and month of created_at, stored as strings.
print("EXTRACT YEAR AND MONTH")
df = (
    df
    .withColumn('year', year(col('created_at')).cast(StringType()))
    .withColumn('month', month(col('created_at')).cast(StringType()))
)

# Drop tweets whose text contains 'RT @'. The match is case-sensitive, so
# this has to run before the text is lowercased below.
print("REMOVE RETWEETS")
df = df.filter(~col('text').contains('RT @'))

print("LOWERCASE")
df = df.withColumn('text', lower(df['text']))

SELECT LANGUAGE
EXTRACT YEAR AND MONTH


# Twitter Series

In [41]:
# Build, for every wordlist, one 0/1 indicator per target sentence plus one
# "any target of this list" indicator, then aggregate them in two steps:
#   1. per (year, month, user_location, user_id): did the user mention it at all?
#   2. per (year, month, user_location): how many users mentioned it?
# Output columns: year, month, user_location, n_users, and n_<name> for each
# name in `cols` (<wordlist>-<i> for the i-th sorted target, and <wordlist>).
cols=[]
indicators=[]
for wordlist in wordlists:

    # NOTE(review): the keyword file is parsed as CSV and only the first
    # column (_c0) is kept, so a target containing a comma would be cut at
    # the comma -- confirm the files hold one comma-free phrase per line.
    targets=sorted(spark.read.csv(os.path.join(path_to_keywords,country_code+'-'+wordlist+'.txt')).toPandas()['_c0'])
    print('# Targets:', len(targets))

    # Indicator equal to one if a tweet contains the target sentence and zero
    # otherwise. Each expression is built once and reused below.
    hits=[df.text.contains(target.lower()).cast("int") for target in targets]

    for i_target,hit in enumerate(hits):
        name=wordlist+'-'+str(i_target)
        indicators.append(hit.alias(name))
        cols.append(name)

    # Indicator equal to one if a tweet contains at least one target sentence.
    # Starting the sum at lit(0) keeps the result a Column even when the list
    # of targets is empty (a bare sum([]) is the Python int 0, and (0>0).cast
    # raises AttributeError).
    indicators.append((sum(hits,lit(0))>0).cast("int").alias(wordlist))
    cols.append(wordlist)

# Add every indicator in a single projection instead of one withColumn call
# per indicator, and keep only the grouping keys next to them.
df=df.select('year','month','user_location','user_id',*indicators)

print("COUNT MENTIONS BY YEAR, MONTH, LOCATION, AND USER")
# Binarize mention multiplicity at the user level: 1 if the user has at least
# one matching tweet in the month. Aggregates are listed explicitly so that
# only the indicator columns are summed.
user_flags=[(F.sum(df[name])>0).cast("int").alias(name) for name in cols]
by_user=df.groupBy('year','month','user_location','user_id')
# agg() rejects an empty expression list; with no indicators, fall back to
# one row per group.
df=by_user.agg(*user_flags) if user_flags else by_user.count().drop('count')

print("COUNT USERS BY YEAR, MONTH, LOCATION")
# Aggregates are aliased directly to their final names, in a fixed order.
df=df.groupBy('year','month','user_location').agg(
    F.count('user_id').alias('n_users'),
    *[F.sum(df[name]).alias('n_'+name) for name in cols])

# Columns already carry their final names (n_users, n_<name>); the message is
# kept so the log lines match earlier runs.
print("RENAME")

Most frequent target loss : I just lost my job
Most frequent target unemployed : I wish I had a job
Most frequent target search : I need a job
Most frequent target hire : Starting my new job tomorrow
Most frequent target offer : #hiring


In [123]:
# Write the aggregated counts for this country as parquet, replacing any
# previous output, and report the elapsed wall-clock time in whole seconds.
print('SAVE')
t_begin = timer()

output_path = os.path.join(path_to_mentions, country_code)
writer = df.write.mode("overwrite")
writer.parquet(output_path)

elapsed = timer() - t_begin
print('DONE IN', round(elapsed), 'SEC')

SAVE
DONE IN 163 SEC


Country: US
Language: en
Hostname: compute-1-14.local
/user/spf248/twitter/data/timelines/extract/US
/user/spf248/twitter/data/mentions
/user/spf248/twitter/data/keywords/labor/US
Import Datasets
REPARTITION
SELECT LANGUAGE
EXTRACT YEAR AND MONTH
COUNT MENTIONS
List: loss -> N_targets: 36
List: unemployed -> N_targets: 16
List: search -> N_targets: 50
List: hire -> N_targets: 29
List: offer -> N_targets: 9
DONE IN 4891 SEC
SAVE
DONE IN 18 SEC
Most frequent mentions:
                        target   count
wordlist                              
hire          I just got hired    1919
loss          I just got fired    1772
offer                  #hiring  796403
search            I need a job   64187
unemployed  I wish I had a job    2584
Most frequent target in loss list : I just got fired
Most frequent target in unemployed list : I wish I had a job
Most frequent target in search list : I need a job
Most frequent target in hire list : I just got hired
Most frequent target in offer list : #hiring
COUNT MENTIONS BY YEAR, MONTH, LOCATION, AND USER
COUNT USERS BY YEAR, MONTH, LOCATION
RENAME
SAVE
DONE IN 444 SEC

Country: CO
Language: es
Create Cluster SparkSession
Hostname: compute-1-14.local
/user/spf248/twitter/data/timelines/extract/CO
/user/spf248/twitter/data/mentions
/user/spf248/twitter/data/keywords/labor/CO
Import Datasets
REPARTITION
REMOVE DUPLICATES
SELECT LANGUAGE
EXTRACT YEAR AND MONTH
COUNT MENTIONS
List: loss -> N_targets: 30
List: unemployed -> N_targets: 14
List: search -> N_targets: 29
List: hire -> N_targets: 39
List: offer -> N_targets: 14
DONE IN 8264 SEC
SAVE
DONE IN 47 SEC
Most frequent mentions:
                            target  count
wordlist                                 
hire        tengo un nuevo trabajo     43
loss          me quede sin trabajo    677
offer                #trabajosihay   7783
search               busco trabajo   3990
unemployed       Estoy sin trabajo   1103
Most frequent target in loss list : me quede sin trabajo
Most frequent target in unemployed list : Estoy sin trabajo
Most frequent target in search list : busco trabajo
Most frequent target in hire list : tengo un nuevo trabajo
Most frequent target in offer list : #trabajosihay
COUNT MENTIONS BY YEAR, MONTH, LOCATION, AND USER
COUNT USERS BY YEAR, MONTH, LOCATION
RENAME
SAVE
DONE IN 136 SEC