In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
from timeit import default_timer as timer
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lower
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [2]:
print('Hostname:', socket.gethostname())
try:
    spark
except NameError:
    if 'samuel' in socket.gethostname().lower():
        print('Create Local SparkSession')
        spark = SparkSession.builder.config(
        "spark.driver.host", "localhost").appName(
        "sample-tweets").getOrCreate()
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(
        "sample-tweets").getOrCreate()

Hostname: Samuels-MacBook-Pro.local


In [3]:
country_code = "US"
language_code = "en"
print('Country:', country_code)
print('Language:', language_code)

# Local
if  'samuel' in socket.gethostname().lower():
    path_to_tweets='../../data/tweets/tweets-with-identified-location/extract/'+country_code+'/'
    path_to_classification=os.path.join('../../data/classification',country_code)
# Cluster
else:
    path_to_tweets='/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-identified-location/extract/'+country_code+'/'
    path_to_classification=os.path.join('/user/spf248/twitter/data/classification',country_code)
    
keywords = {
'en':frozenset(['job','position','work','fired','laid off','quit','unemployed','hired']),
}[language_code]

print("Keywords:", ', '.join(keywords))

Country: US
Language: en
Keywords: work, hired, position, job, quit, unemployed, laid off, fired


In [4]:
print('Import Dataset')
df = spark.read.parquet(path_to_tweets)
df.cache()

Import Dataset


In [5]:
n_tweets = df.count()
print('# Tweets:', n_tweets)
print('%',language_code,'tweets:',round(df.where(df.tweet_lang==language_code).count()/n_tweets,2))

# Tweets: 19540979
% en tweets: 0.63


In [6]:
print(df.groupBy('tweet_lang').count().orderBy(desc('count')).show(10))

+----------+--------+
|tweet_lang|   count|
+----------+--------+
|        en|12228481|
|      null| 5319767|
|       und|  811966|
|        es|  271686|
|        pt|  109743|
|        tl|   82289|
|        fr|   65623|
|        in|   54376|
|        de|   53839|
|        ar|   50776|
+----------+--------+
only showing top 10 rows

None


In [7]:
# Select tweet columns
df = df.select('tweet_id','text','tweet_lang')

# Keep language-specific tweets
df = df.where(df.tweet_lang==language_code).drop('tweet_lang')

# Check keyword presence
df = df.withColumn('text_lowercase', lower(col('text')))
for keyword in sorted(keywords):
    df = df.withColumn(keyword.replace(' ','_'), df.text_lowercase.contains(keyword))
df = df.drop('text_lowercase')

# Create keyword indicator
df = df.withColumn('keyword', sum(df[keyword.replace(' ','_')].cast("int") for keyword in keywords)>0)

# Create sample of tweets matching at least one keyword
df_filtered = df.filter(df.keyword==True)

In [8]:
print('Count Tweets')
n_filtered = df_filtered.count()
n_lang = df.count()
print('# English Tweets:', n_lang)
print('# English Tweets Matching Keyword:', n_filtered, '(',round(n_filtered/n_lang*100,2),'% )')

Count Tweets
# English Tweets: 12228481
# English Tweets Matching Keyword: 458869 ( 3.75 % )


In [9]:
# Take a random sample of tweets of same size
df_random = df.sample(False, n_filtered/n_lang, seed=0)

In [10]:
print('Save')
df_filtered.write.mode("overwrite").parquet(os.path.join(path_to_classification,'filtered'))
df_random.write.mode("overwrite").parquet(os.path.join(path_to_classification,'random'))

Save


In [15]:
(1576858766494-1576857739492)/(3600*1000)

0.28527833333333336

Country: US

Language: en

Keywords: work, hired, position, job, quit, unemployed, laid off, fired

Import Dataset

N Tweets: 3908098415

% en tweets: 0.63

+----------+----------+
|tweet_lang|     count|
+----------+----------+
|        en|2445518357|
|      null|1064221347|
|       und| 162239679|
|        es|  54245897|
|        pt|  21968344|
|        tl|  16450819|
|        fr|  13063923|
|        in|  10866923|
|        de|  10786388|
|        ar|  10121259|
+----------+----------+

N English Tweets: 2445518357

N English Tweets Matching Keyword: 92121093 ( 3.77 % )