In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from pyspark.sql.functions import lit

# Config

In [2]:
# Country being processed: selects the tweets input directory, filters the
# identified locations on `country_short`, and names the output CSV.
country_code = 'US'
# Language being processed: filters tweets on `tweet_lang` and selects the
# keyword file that is read from `path_to_keywords`.
language_code = 'en'

In [15]:
# Resolve the hostname once; a name containing 'samuel' identifies the local
# development machine, anything else is treated as the cluster.
hostname = socket.gethostname()
is_local = 'samuel' in hostname.lower()

# Reuse an existing `spark` session if one is already defined in the kernel;
# otherwise build one.
try:
    spark
except NameError:
    session_builder = SparkSession.builder
    if is_local:
        print('Create Local SparkSession')
        # Only the local session pins the driver host to localhost.
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName("get-tweets-mentions").getOrCreate()

print('Hostname:', hostname)

if is_local:
    # Local: paths relative to the notebook directory
    path_to_tweets   = '../data/decahose/parsed/tweets/tweets-with-identified-location-'+country_code+'/'
    # Alternative input: '../data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/'
    path_to_mentions = '../data/decahose/parsed/tweets/tweets-with-identified-location-mentions/'
    path_to_locations = '../data/decahose/parsed/locations/'
    path_to_keywords = '../data/keywords/labor/lang/'
else:
    # Cluster: absolute paths under the user's data directory
    path_to_tweets   = '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-identified-location-'+country_code+'/'
    # Alternative input: '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/'
    path_to_mentions = '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-identified-location-mentions/'
    path_to_locations = '/user/spf248/twitter/data/decahose/parsed/locations/'
    path_to_keywords = '/user/spf248/twitter/data/keywords/labor/lang/'

Hostname: Samuels-MacBook-Pro.local


# Import Data

In [4]:
# Read the parquet tweets stored under `path_to_tweets` (the directory for the
# configured country) into the working DataFrame `df`.
print('Import Dataset')
df = spark.read.parquet(path_to_tweets)

Import Dataset


In [5]:
print("Cache")
# `df` is reused by several later steps (two counts, a join, the aggregation).
# This call is the cell's last expression, so the notebook displays the
# returned DataFrame's schema repr as the cell output.
df.cache()

Cache


DataFrame[tweet_id: string, created_at: timestamp, text: string, tweet_lang: string, user_id: string, user_location: string, place_id: string, tweet_longitude: double, tweet_latitude: double]

In [6]:
print('Import Mentions')
# The keyword file is named after the language code and has a header row;
# only its `mention` column is used.
keywords_path = path_to_keywords + language_code
keywords_sdf = spark.read.option('header','true').csv(keywords_path)
# Bring the (small) keyword table to the driver as a plain Python list.
mentions = list(keywords_sdf.toPandas()['mention'])
print('# Mentions:',len(mentions))

Import Mentions
# Mentions: 22


In [7]:
print('Import Identified Locations')
# Lookup table from the free-text `user_location` string to a location id,
# restricted to the configured country.
identified_locations = (
    spark.read
    .option('header','true')
    .option("multiLine", "true")
    .csv(path_to_locations+'account-locations-identified.csv')
    .where(col('country_short') == country_code)
    .select(
        col('LOCATION').alias('user_location'),
        # `_c0` is presumably the unnamed first (index) column of the CSV -- TODO confirm
        col('_c0').alias('location_id'),
    )
)

Import Identified Locations


In [8]:
# Pipeline: lowercase the text, keep tweets in the configured language, attach
# year/month and the location id, flag every keyword mention, then count
# tweets and mentions per (year, month, location_id).

print("LOWERCASE")
# Lowercasing makes the substring matching below case-insensitive.
df = df.withColumn('text', F.lower(F.col('text')))

print("SELECT LANGUAGE")
df = df.where(df.tweet_lang == language_code)

print("EXTRACT YEAR AND MONTH")
# Cast to string: presumably so that the parameterless `sum()` at the end,
# which aggregates numeric columns, never tries to sum the grouping keys.
df = df.withColumn('year', year('created_at').cast("string"))
df = df.withColumn('month', month('created_at').cast("string"))

print("# OBS:", df.count())
# OBS: 2380339155  (value recorded in the original notebook, presumably from a full run)

print('Merge Location Id')
# Inner join: tweets whose `user_location` is not an identified location of
# the configured country are discarded.
df = df.join(identified_locations, on=['user_location'], how='inner')

print("# OBS:", df.count())

# Build one 0/1 indicator expression per keyword. The expressions are
# collected first and added with a single `select` below: calling
# `withColumn` once per keyword adds one projection per call and inflates the
# query plan. A dict keyed by column name keeps the "last definition wins,
# first position kept" behaviour that repeated `withColumn` calls would have
# if two keywords mapped to the same column name.
mention_flags = {}
for mention in mentions:
    print(mention)
    field_mention = 'mention_'+mention.replace(' ','_')
    # `text` was lowercased above, so match against the lowercased keyword;
    # otherwise a keyword containing uppercase characters could never match.
    mention_flags[field_mention] = (
        F.col('text').contains(mention.lower()).cast("int").alias(field_mention)
    )

print('Append Constant Column')
# Keep only the grouping keys, the mention indicators and a constant 1 per
# tweet (`n_tweets`); every other tweet column is left out of the projection.
df = df.select(
    'year', 'month', 'location_id',
    *mention_flags.values(),
    lit(1).alias('n_tweets')
)

print("COUNT TWEETS AND MENTIONS BY YEAR MONTH AND USER LOCATION")
# Output columns are named `sum(<column>)` by Spark.
df = df.groupBy('year','month','location_id').sum()

LOWERCASE
SELECT LANGUAGE
EXTRACT YEAR AND MONTH
# OBS: 3583
Merge Location Id
# OBS: 1215
anyone hiring?
i am unemployed
Append Constant Column


In [14]:
print('SAVE TO CSV')
start = timer()

# Collapse the aggregated counts to a single partition and write them, with a
# header row, to a per-country output path, replacing any previous output.
output_path = path_to_mentions + country_code + '.csv'
single_partition = df.coalesce(1)
single_partition.write.mode("overwrite").csv(output_path, header=True)

end = timer()
elapsed_seconds = round(end - start)
print('DONE IN', elapsed_seconds, 'SEC')

SAVE TO CSV
DONE IN 1 SEC


In [10]:
# End-of-notebook completion marker.
print('DONE!')

DONE!
