In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.types import IntegerType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF

# MongoDB connection URI (host, port and the "ca2" database); reused by the
# Spark session configuration and by the DataFrame reader further down.
mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2"

# Spark version 3.2.3
# MongoDB version 6.0.5
# Java Version 11

# Create the Spark session used for every MongoDB read/write in this notebook.
# The connector and driver jars are resolved from the Maven repository:
# https://mvnrepository.com/search?q=mongodb-driver-sync
#
# spark.jars.packages takes ONE comma-separated list of Maven coordinates.
# Calling .config() repeatedly with the same key overwrites the previous value,
# which would leave only the last coordinate (bson) and drop the connector, so
# all coordinates are joined into a single value here.
spark = (
    SparkSession.builder
    .appName('Tweets')
    .config("spark.mongodb.read.connection.uri", mongo_uri)
    .config("spark.mongodb.write.connection.uri", mongo_uri)
    .config(
        "spark.jars.packages",
        ",".join([
            "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
            "org.mongodb:mongodb-driver-core:4.9.1",
            "org.mongodb:mongodb-driver-sync:4.9.1",
            "org.mongodb:bson:4.9.1",
        ]),
    )
    .getOrCreate()
)


In [2]:
# Load the "vaccin_tweets_2" collection of the "ca2" MongoDB database into the
# Spark DataFrame `df` through the MongoDB Spark connector.
df = (
    spark.read
    .format("mongodb")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", "vaccin_tweets_2")
    .load()
)

# Data preparation

In [3]:
#cleaned_df = df.select(col('_id').alias('id'), to_timestamp('timestamp').alias('datetime'), 'text') \
#    .withColumn('cleaned_text', regexp_replace(col('text'), '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ''))
#cleaned_df.show()

# Sentiment analysis with a pretrained model

https://aclanthology.org/2020.findings-emnlp.148

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest



In [4]:
#!pip install tweetnlp

In [5]:
import tweetnlp
import pandas as pd
# Instantiate tweetnlp's sentiment classifier. The log output of this cell shows
# it loading the cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint.
model = tweetnlp.Sentiment()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Collect the entire Spark DataFrame into a pandas DataFrame on the driver so the
# model can be applied row by row with pandas.
pd_df = df.toPandas()

In [9]:
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides `progress_apply`,
# used in the next cell.
tqdm.pandas()

In [None]:
# Score every tweet text with the pretrained model and store the prediction in a
# new 'sentiment' column; tqdm reports progress while the apply runs.
pd_df['sentiment'] = pd_df['text'].progress_apply(model.predict)

 14%|██████████                                                             | 45279/317631 [1:17:52<5:34:08, 13.58it/s]

In [None]:
# Persist the pandas DataFrame to data.csv (path is relative to the working directory).
pd_df.to_csv("data.csv")

In [None]:
## Same prediction, expressed as a PySpark UDF
def predict_sentiment(text):
    """Return the sentiment label predicted for a single tweet.

    Args:
        text: The tweet text, or None when the row has no text.

    Returns:
        The predicted label as a string, or None when `text` is None.
    """
    if text is None:
        # Keep null rows null instead of handing None to the model.
        return None

    prediction = model.predict(text)
    # The UDF below is declared as StringType, so Spark must receive a plain
    # string. NOTE(review): tweetnlp documents predict() as returning a dict
    # such as {'label': 'positive'} - confirm for the installed version. Any
    # non-dict result is passed through unchanged.
    if isinstance(prediction, dict):
        return prediction.get("label")
    return prediction

sentiment_udf = F.udf(predict_sentiment, StringType())
df_with_sentiment = df.withColumn("sentiment", sentiment_udf(df["text"]))

In [None]:
# Print the first rows of the DataFrame that carries the UDF-derived 'sentiment' column.
df_with_sentiment.show()

In [None]:
# Load a Parquet dataset from "df.parquet.gzip" into a Spark DataFrame.
# NOTE(review): no visible cell in this notebook writes that file - confirm
# where it is produced before relying on Restart & Run All.
df_parquet = spark.read.parquet("df.parquet.gzip")

In [None]:
# Keep only the three columns needed downstream. The projection is done in Spark
# *before* toPandas() so that unused columns are never collected into driver
# memory; the resulting pandas DataFrame has the same columns in the same order.
pd_df = df_parquet.select("timestamp", "text", "sentiment").toPandas()

In [None]:
# Write the pandas DataFrame to sentiment.csv (path is relative to the working directory).
pd_df.to_csv("sentiment.csv")

In [None]:
# Largest value in the 'timestamp' column, displayed as the cell's output.
pd_df["timestamp"].max()