In [29]:
import os
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [30]:
# Reuse the session already present in the notebook namespace (if any);
# only build a new one when no `spark` name has been defined yet.
if "spark" not in globals():
    print('Create Cluster SparkSession')
    spark = SparkSession.builder.appName("").getOrCreate()

In [33]:
# Root directory under which inputs (timelines, keywords) and outputs (reports) are located.
path_to_data = "/user/spf248/twitter/data"
# Country sub-directory used in both the input extract path and the report output path.
country_code = 'PK'

In [None]:
# Directory holding the hedonometer keyword files, located under the data root.
path_to_keywords = os.path.join(
    path_to_data,
    'keywords',
    'hedonometer',
)

In [None]:
# KP = spark.read.parquet(os.path.join(path_to_data,'tweets','KP_Tweets.parquet'))

In [None]:
# Tokeniser stage: reads the "text" column and writes the resulting tokens to
# "words", using the regex \W (non-word characters) as its pattern.
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="words",
)
# Single-stage pipeline wrapping the tokeniser.
pipeline = Pipeline(stages=[regexTokenizer])

In [None]:
# Keywords for which a separate per-topic sentiment report is produced.
# Order is preserved; the grouping by line is purely visual.
topics = [
    'water', 'food', 'sanitation',
    'energy', 'electric', 'petrol', 'shedding',
    'internet', 'blackout', 'climate',
    'healthcare', 'hospitals', 'vaccine', 'doctors', 'medicine',
    'poverty', 'poor',
    'education', 'universities', 'literacy', 'teachers',
    'government', 'corruption',
]

In [37]:
# Read the keyword CSV (header row, inferred column types, multi-line fields,
# comma-delimited), pull it to the driver as pandas, and turn it into a plain
# {WORD: SCORE} dictionary.
keyword2score = (
    spark.read
    .option('header', 'true')
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("delimiter", ",")
    .csv(os.path.join(path_to_keywords, 'english-twitter.csv'))
    .toPandas()
    .set_index('WORD')['SCORE']
    .to_dict()
)
# Broadcast the lookup table so the Python UDFs can read it via `.value`.
keyword2score_bc = spark.sparkContext.broadcast(keyword2score)

def scored_tokens(words):
    """Look up the score of every token found in the broadcast keyword table.

    Args:
        words: Sequence of token strings, or None (e.g. a NULL array cell).

    Returns:
        A list with one score per recognised token, in input order. Tokens
        that are not keys of the table are skipped. An empty list is returned
        when ``words`` is None or empty, so callers can rely on ``len()``.
    """
    if not words:
        return []
    # Resolve the broadcast value once per call instead of twice per token.
    score_of = keyword2score_bc.value
    return [score_of[word] for word in words if word in score_of]
def _array_mean(values):
    """Return the arithmetic mean of ``values`` as a float, or None if there is nothing to average.

    Returning None (SQL NULL) for a missing or empty array avoids numpy's
    "mean of empty slice" warning and keeps NaN out of the column.
    """
    if not values:
        return None
    return float(np.mean(values))


def _array_length(values):
    """Return the element count of ``values`` as a float; a missing array counts as 0.0."""
    if values is None:
        return 0.0
    return float(len(values))


# Column-level wrappers used by the sentiment computation.
scored_tokens_udf = F.udf(scored_tokens, ArrayType(FloatType()))
array_mean_udf = F.udf(_array_mean, FloatType())
array_length_udf = F.udf(_array_length, FloatType())

In [28]:
def _write_daily_sentiment(df, output_path):
    """Average ``mean_score`` per (year, month, day) and overwrite ``output_path`` with the result as parquet."""
    (df.groupby('year', 'month', 'day')
       .agg(F.mean('mean_score').alias('sentiment'))
       .write.mode("overwrite")
       .parquet(output_path))


def compute_sentiment(df,name):
    """Write daily mean-sentiment reports for ``df``: one overall and one per topic.

    The input is expected to carry a ``clean_translation`` text column and a
    ``datetime`` timestamp column. Text is lower-cased, tokenised with the
    module-level ``pipeline``, and each row gets the mean score of its tokens
    that appear in the keyword table. Rows without any scored token are dropped.

    Output is written as parquet (overwrite mode) under
    ``<path_to_data>/reports/<country_code>/sentiment/<name>/``:
    ``all`` for every scored row, plus one sub-directory per entry of
    ``topics`` restricted to rows whose lower-cased text contains that topic
    as a substring.

    Args:
        df: Spark DataFrame with ``clean_translation`` and ``datetime`` columns.
        name: Sub-directory name identifying this report set.

    Returns:
        None. The function is run for its side effect of writing the reports.
    """
    report_dir = os.path.join(path_to_data,'reports',country_code,'sentiment',name)

    df = df.withColumnRenamed('clean_translation','text')
    df = df.withColumnRenamed('datetime','created_at')
    df = df.select(
        F.year('created_at').alias('year'),
        F.month('created_at').alias('month'),
        F.dayofmonth('created_at').alias('day'),
        F.lower(F.col('text')).alias('text'),
    )
    # Rows without text cannot contribute a score; drop them before
    # tokenisation so a NULL never reaches the tokenizer or the Python UDFs.
    df = df.filter(F.col('text').isNotNull())

    df = pipeline.fit(df).transform(df)
    df = df.withColumn('scored_tokens', scored_tokens_udf('words'))
    df = df.withColumn('mean_score', array_mean_udf('scored_tokens'))
    df = df.withColumn('n_tokens', array_length_udf('scored_tokens'))
    df = df.filter(df['n_tokens']>0)

    # Keep only what the reports need and persist it: the frame is consumed
    # once for 'all' and once per topic, and without caching every write
    # would re-read the source and re-run the Python UDFs.
    df = df.select('year','month','day','text','mean_score').persist()
    try:
        _write_daily_sentiment(df, os.path.join(report_dir,'all'))
        for topic in topics:
            # Filter on the substring match directly instead of piling up one
            # throwaway indicator column per topic on the shared frame.
            _write_daily_sentiment(
                df.filter(F.col('text').contains(topic)),
                os.path.join(report_dir,topic),
            )
    finally:
        df.unpersist()

In [31]:
# Produce one report set per yearly partition of the historical extract,
# named after the year.
extract_root = os.path.join(path_to_data,'timelines','historical','extract',country_code)
for year in range(2012,2021):
    print(year)
    year_label = str(year)
    yearly_tweets = spark.read.orc(os.path.join(extract_root,'year='+year_label))
    compute_sentiment(yearly_tweets,year_label)

2012
2013
2014
2015
2016
2017
2018
2019
2020


In [34]:
# Load the 'KP' sentiment reports: the overall one under key 'all' plus one per topic.
# The reports are written by compute_sentiment() as parquet, so they must be
# read back as parquet (reading them as ORC cannot work).
# NOTE(review): these paths only exist after compute_sentiment(<KP tweets>, 'KP')
# has been run; no such call is visible in this notebook — confirm before running.
kp_report_dir = os.path.join(path_to_data,'reports',country_code,'sentiment','KP')
KP = {}
for report in ['all'] + topics:
    KP[report] = spark.read.parquet(os.path.join(kp_report_dir,report))

AnalysisException: 'Path does not exist: file:/user/spf248/twitter/data/reports/PK/sentiment/KP/all;'