In [3]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.types import IntegerType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF

# MongoDB connection string; the target database ("ca2") is part of the URI.
mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2"

# Spark version 3.2.3
# MongoDB version 6.0.5
# Java Version 11

# Maven coordinates of the jars needed to talk to MongoDB.
# Jars dependencies available in maven repository
# https://mvnrepository.com/search?q=mongodb-driver-sync
#
# "spark.jars.packages" is ONE comma-separated setting. Calling .config()
# several times with the same key keeps only the last value, which would
# silently drop the connector itself, so all coordinates are joined here.
mongo_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
    "org.mongodb:mongodb-driver-core:4.9.1",
    "org.mongodb:mongodb-driver-sync:4.9.1",
    "org.mongodb:bson:4.9.1",
])

# create a spark session
spark = (
    SparkSession.builder
    .appName('Tweets')
    .config("spark.mongodb.read.connection.uri", mongo_uri)
    .config("spark.mongodb.write.connection.uri", mongo_uri)
    .config("spark.jars.packages", mongo_packages)
    .getOrCreate()
)


In [None]:
# Load the MongoDB collection "vaccin_tweets" (database "ca2") into the
# Spark dataframe "df".
df = (
    spark.read
    .format("mongodb")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", "vaccin_tweets")
    .load()
)

# Data preparation

In [None]:
# Disabled draft of a text-cleaning step. As written it would add a
# `cleaned_text` column with @mentions, URLs and every character other than
# letters, digits, spaces and tabs removed from `text`.
# NOTE(review): `col`, `to_timestamp` and `regexp_replace` are not imported in
# the import cell shown at the top (only `functions as F` is); use F.col /
# F.to_timestamp / F.regexp_replace, or import them, before re-enabling.
#cleaned_df = df.select(col('_id').alias('id'), to_timestamp('timestamp').alias('datetime'), 'text') \
#    .withColumn('cleaned_text', regexp_replace(col('text'), '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ''))
#cleaned_df.show()

# Sentiment analysis with pretrained model 

- Reference paper: https://aclanthology.org/2020.findings-emnlp.148

- Pretrained model: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest



In [None]:
# One-off setup step, left disabled: uncomment to install tweetnlp if it is
# missing from the kernel's environment.
#!pip install tweetnlp

In [None]:
import tweetnlp
import pandas as pd
# Instantiate tweetnlp's sentiment model once; `model.predict(text)` is what
# the cells below call to label each tweet.
model = tweetnlp.Sentiment()

In [None]:
# Snapshot the raw tweets to Parquet. mode("overwrite") keeps this cell
# re-runnable: with the default save mode Spark raises an error when
# "data.parquet" already exists from a previous run.
df.write.mode("overwrite").parquet("data.parquet")

In [None]:
# Collect the whole Spark dataframe onto the driver as a pandas DataFrame so
# the model can be applied row by row with pandas.
pd_df = df.toPandas()

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used in the next cell.
tqdm.pandas()

In [None]:
# Score every tweet with the pretrained model, showing a tqdm progress bar;
# each cell of the new "sentiment" column holds whatever model.predict returns.
pd_df['sentiment'] = pd_df['text'].progress_apply(model.predict)

In [None]:
# Persist the scored tweets. With to_csv's defaults the pandas index is
# written as well, as the first (unnamed) column of "data.csv".
pd_df.to_csv("data.csv")

In [None]:
## Same using PySpark
def predict_sentiment(text):
    """Return the sentiment label predicted by ``model`` for one tweet.

    Args:
        text: Tweet text, or None for a null value in the column.

    Returns:
        The predicted label as a plain string, or None when ``text`` is None.
    """
    if text is None:
        # Null rows reach a Python UDF as None; pass them through as null
        # instead of handing None to the model.
        return None

    prediction = model.predict(text)
    # The UDF below is declared as StringType, so Spark must receive a plain
    # string. tweetnlp's predict() returns a mapping such as
    # {'label': 'neutral'}; returning that object as-is would not yield a
    # clean label string in the column.
    if isinstance(prediction, dict):
        return prediction.get("label")
    return str(prediction)


# Wrap the function as a Spark UDF and add the predicted label as a new
# "sentiment" column. Nothing is computed until an action (e.g. show()) runs.
sentiment_udf = F.udf(predict_sentiment, StringType())
df_with_sentiment = df.withColumn("sentiment", sentiment_udf(df["text"]))

In [None]:
# Preview the result; this action is what actually runs the sentiment UDF
# on the rows being displayed.
df_with_sentiment.show()

In [4]:
# Load the previously scored tweets from Parquet.
# NOTE(review): "df.parquet.gzip" is not written by any cell shown in this
# notebook (the write cell above produces "data.parquet") — presumably it was
# created outside this notebook; confirm its origin so a fresh run can
# reproduce it.
df_parquet = spark.read.parquet("df.parquet.gzip")

In [9]:
# Preview the first rows of the scored dataset, including the sentiment column.
df_parquet.show()

+-------------------+-----------+--------------------+--------------------+-------------+----------+
|                _id|coordinates|                text|           timestamp| timestamp_ms| sentiment|
+-------------------+-----------+--------------------+--------------------+-------------+----------+
|1344258913114447875|       null|RT @catturd2: 👀\...|2020-12-30 12:27:...|1609331274666| {neutral}|
|1344258934077599745|       null|RT @washingtonpos...|2020-12-30 12:27:...|1609331279664| {neutral}|
|1344259131209904128|       null|RT @ianbremmer: #...|2020-12-30 12:28:...|1609331326664| {neutral}|
|1344259231877390337|       null|RT @CNN: UK Healt...|2020-12-30 12:29:...|1609331350665|{positive}|
|1344259311569149954|       null|RT @88gaz88: @San...|2020-12-30 12:29:...|1609331369665|{negative}|
|1344259315733958656|       null|RT @gmanews: Dril...|2020-12-30 12:29:...|1609331370658|{negative}|
|1344259336739155969|       null|RT @BorisJohnson:...|2020-12-30 12:29:...|1609331375666|{po

In [10]:
# Keep only the columns needed downstream. Selecting them in Spark *before*
# toPandas() avoids collecting the unused columns (_id, coordinates,
# timestamp_ms) to the driver; the resulting pandas frame is the same.
pd_df = df_parquet.select("timestamp", "text", "sentiment").toPandas()

In [11]:
# Export the timestamp/text/sentiment subset; with to_csv's defaults the
# pandas index is written too, as the first (unnamed) column.
pd_df.to_csv("sentiment.csv")

In [15]:
# Latest tweet timestamp present in the exported data.
pd_df.timestamp.max()

Timestamp('2021-01-01 06:58:46.659000')