In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
import warnings
from timeit import default_timer as timer
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from pyspark.sql import Window
from pyspark.ml.feature import RegexTokenizer

ModuleNotFoundError: No module named 'pyspark'

# Config

In [2]:
try:
    spark
except NameError:
    if socket.gethostname() == 'FAC38c9860d5a89':
        print('Create Local SparkSession')
        spark = SparkSession.builder.config(
        "spark.driver.host", "localhost").appName(
        "measure-tweets-sentiment").getOrCreate()
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(
        "measure-tweets-sentiment").getOrCreate()
    
# Local
print('Hostname:', socket.gethostname())
if  'samuel' in socket.gethostname().lower():
    path_to_tweets   = '../data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/'
    path_to_keywords = '../data/keywords/hedonometer/'
    path_to_sentiment= '../data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-sentiment/'
# Cluster
else:
    path_to_tweets   = '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/'
    path_to_keywords = '/user/spf248/twitter/data/keywords/hedonometer/'
    path_to_sentiment= '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-sentiment/'

Hostname: Samuels-MacBook-Pro.local


# Import Data

In [3]:
print('Import:')
start = timer()

df = spark.read.parquet(path_to_tweets)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import:
Computing Time: 4 sec


In [4]:
print("CACHE DATASET")
df.cache()

CACHE DATASET


DataFrame[tweet_id: string, created_at: timestamp, text: string, tweet_lang: string, user_id: string, user_location: string, place_id: string, tweet_longitude: double, tweet_latitude: double]

# Clean Text

In [5]:
print("CLEAN TEXT")

punctuation = frozenset(list(string.punctuation)+['“','¿'])
def clean_text(text):
    
    text = re.sub(r"http\S+", "", text) # REMOVE URL
    text = "".join([char for char in text if char not in punctuation]) # REMOVE PUNCTUATION
    text = re.sub(r"[0-9]+", "", text) # REMOVE DIGITS
    text = re.sub(r"\n"," ", text).strip() # REMOVE EXTRA LINEBREAK
    text = re.sub(r" +"," ", text).strip() # REMOVE EXTRA SPACE
    return text.lower()

clean_text_udf = udf(clean_text,StringType())
df = df.withColumn('text',clean_text_udf('text'))

CLEAN TEXT


# Import Keywords Lists

In [5]:
languagenames = [
'english',
'french',
'german',
'indonesian',
'korean',
'portuguese',
'russian',
'spanish',
]

print('Languages:', ', '.join(languagenames))

languagecode2languagename = {
'en': 'english',
'fr': 'french',
'de': 'german',
'id': 'indonesian',
'ko': 'korean',
'pt': 'portuguese',
'ru': 'russian',
'es': 'spanish'}

languagename2keyword2score = {}

for languagename in languagenames:

    languagename2keyword2score[languagename] = \
    spark.read.option(
    'header','true').option(
    "inferSchema", "true").csv(
    path_to_keywords+languagename+'-twitter.csv').toPandas().set_index('WORD')['SCORE'].to_dict()

# Create Broadcast Variable
languagecode2languagename_bc  = spark.sparkContext.broadcast(languagecode2languagename)
languagename2keyword2score_bc = spark.sparkContext.broadcast(languagename2keyword2score)

Languages: english, french, german, indonesian, korean, portuguese, russian, spanish


# Compute Sentiment

In [6]:
print("TOKENIZE TEXT")

tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern=" ")
df = tokenizer.transform(df)
df = df.drop('text')

TOKENIZE TEXT


In [7]:
print("SCORE TOKENS")

scored_tokens_schema = StructType([
    StructField("n_score", FloatType(), False),
    StructField("avg_score", FloatType(), False)
])

def scored_tokens(tokens,languagecode):
    
    if languagecode in languagecode2languagename_bc.value:
        
        languagename = languagecode2languagename_bc.value[languagecode]
        
        scored_tokens = [
        languagename2keyword2score_bc.value[languagename][token] for token in tokens 
        if token in languagename2keyword2score_bc.value[languagename]]
        
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            return (float(len(scored_tokens)), float(np.nanmean(scored_tokens)))
    
    else:
        
        return (float(np.nan), float(np.nan))

scored_tokens_udf = udf(scored_tokens, scored_tokens_schema)

# Compute Score Tokens
df = df.withColumn('scored_tokens', scored_tokens_udf('tokens','tweet_lang'))
df = df.withColumn('avg_score',F.col('scored_tokens').getItem('avg_score'))
df = df.withColumn('n_score',F.col('scored_tokens').getItem('n_score'))
df = df.drop('scored_tokens')

# Count Tokens
df = df.withColumn('n_tokens',size('tokens'))
df = df.drop('tokens')

SCORE TOKENS


In [9]:
print('SAVE TO PARQUET')
start = timer()

df.write.mode("overwrite").parquet(path_to_sentiment)

end = timer()
print('DONE IN', round(end - start), 'SEC')

Save
Computing Time: 21 sec


In [10]:
print('DONE!')

Done!


# Comparison to Pandas

In [9]:
ds = df.toPandas()

In [10]:
ds = ds.sort_values(by='tweet_id').reset_index(drop=True)

In [11]:
ds.head()

Unnamed: 0,tweet_id,user_id,user_location,tweet_longitude,tweet_latitude,place_id,year,month,day_of_month,day_of_week,avg_score,n_score,n_tokens,tweet_lang
0,812405126879121408,811538749,vimercate (mb),,,c90750b5edc76f6c,2016,12,23,6,,,4,it
1,812405135255158785,434359026,,-118.338292,34.101555,3b77caf94bfc81fe,2016,12,23,6,4.984,5.0,11,en
2,812405139478810624,1864954177,Longe,,,68e019afec7d0ba5,2016,12,23,6,3.21,2.0,4,es
3,812405143673143296,2210256804,Acosando a Battler y a Kuroki,,,9de57b9239869b74,2016,12,23,6,6.4,1.0,2,es
4,812405152044855296,3299672999,"Texas, USA",,,228a068876235841,2016,12,23,6,5.6475,8.0,13,en


In [12]:
import pandas as pd

In [13]:
path = '../data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-from-decahose-partition-0-block-0.json.bz2'

dp = pd.read_json(
path,
orient='records',
dtype=False,
precise_float=True,
convert_dates=False)

dp = dp[[
'coordinates',
'created_at',
'extended_tweet',
'id_str',
'lang',
'text',
'truncated',
'user',
'place']].copy()

dp = dp.drop_duplicates('id_str')

dp['place_id'] = dp['place'].apply(lambda x:x['id'] if type(x)==dict else None)
dp['user_id'] = dp['user'].apply(lambda x:x['id_str'])
dp['user_location'] = dp['user'].apply(lambda x:x['location'])
dp.drop(['user','place'],1,inplace=True)

dp['year']  = pd.to_datetime(dp['created_at']).apply(lambda x:x.year)
dp['month'] = pd.to_datetime(dp['created_at']).apply(lambda x:x.month)
dp['day_of_month'] = pd.to_datetime(dp['created_at']).apply(lambda x:x.day)
dp['day_of_week']  = pd.to_datetime(dp['created_at']).apply(lambda x:x.dayofweek)
dp.drop('created_at',1,inplace=True)

dp['full_text'] = dp['extended_tweet'].apply(lambda x:x['full_text'] if type(x)==dict else None)
dp.loc[dp['truncated']==True,'text'] = dp.loc[dp['truncated']==True,'full_text']
dp.drop(['full_text','truncated','extended_tweet'],1,inplace=True)

dp['tweet_longitude'] = dp['coordinates'].apply(lambda x:x['coordinates'][0] if x else np.nan)
dp['tweet_latitude']  = dp['coordinates'].apply(lambda x:x['coordinates'][1] if x else np.nan)
dp.drop('coordinates',1,inplace=True)
dp.rename(columns={'id_str':'tweet_id','lang':'tweet_lang'},inplace=True)

dp['text'] = dp['text'].apply(clean_text)
dp['tokens'] = dp['text'].apply(lambda x:x.split())
dp.drop('text',1,inplace=True)

In [14]:
def score_tokens(x):
    
    if x['tweet_lang'] in languagecode2languagename:
        
        languagename = languagecode2languagename[x['tweet_lang']]

        scored_tokens = [
        languagename2keyword2score[languagename][token] for token in x['tokens']
        if token in languagename2keyword2score[languagename]]
    
        scores = (float(len(scored_tokens)), float(np.nanmean(scored_tokens)))
    
    else:
        
        scores = (float(np.nan), float(np.nan))
    
    return pd.Series(scores, index = ['n_score','avg_score'])

dp = pd.concat([dp,dp.apply(score_tokens,1)],1)
dp['n_tokens'] = dp['tokens'].apply(len)
dp.drop('tokens',1,inplace=True)

dp = dp.sort_values(by='tweet_id').reset_index(drop=True)

  # This is added back by InteractiveShellApp.init_path()


In [15]:
print('Check Differences Per Column:')
for col in dp.columns:
    idx = pd.concat([dp[col],ds[col]],1).dropna().index
    print('Column', col, '# obs',idx.shape[0],'-> value differences:', (ds.loc[idx,col]!=dp.loc[idx,col]).sum())

Check Differences Per Column:
Column tweet_id # obs 7560 -> value differences: 0
Column tweet_lang # obs 7560 -> value differences: 0
Column place_id # obs 7546 -> value differences: 0
Column user_id # obs 7560 -> value differences: 0
Column user_location # obs 6191 -> value differences: 0
Column year # obs 7560 -> value differences: 0
Column month # obs 7560 -> value differences: 0
Column day_of_month # obs 7560 -> value differences: 0
Column day_of_week # obs 7560 -> value differences: 7560
Column tweet_longitude # obs 1131 -> value differences: 0
Column tweet_latitude # obs 1131 -> value differences: 0
Column n_score # obs 5814 -> value differences: 3
Column avg_score # obs 5670 -> value differences: 5586
Column n_tokens # obs 7560 -> value differences: 8


In [16]:
col = 'avg_score'
pd.concat([dp[col],ds[col]],1).dropna().describe()

Unnamed: 0,avg_score,avg_score.1
count,5670.0,5670.0
mean,5.623397,5.62339
std,0.592129,0.592136
min,1.86,1.86
25%,5.308616,5.308359
50%,5.589615,5.589615
75%,5.881765,5.881765
max,8.46,8.46
