In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import warnings
import tweetnlp
import pandas as pd
import pyspark
import numpy as np
from tqdm import tqdm
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql.functions import monotonically_increasing_id
from pymongo import MongoClient
warnings.filterwarnings("ignore")
tqdm.pandas()

In [3]:
mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2"

# Spark version 3.2.3
# MongoDB version 6.0.5
# Java Version 11

# create a spark session
# Jars dependencies available in maven repository
# https://mvnrepository.com/search?q=mongodb-driver-sync
spark = SparkSession.builder \
    .appName('Tweets') \
    .config("spark.mongodb.read.connection.uri", mongo_uri) \
    .config("spark.mongodb.write.connection.uri", mongo_uri) \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1") \
    .config("spark.jars.packages", "org.mongodb:mongodb-driver-core:4.9.1") \
    .config("spark.jars.packages", "org.mongodb:mongodb-driver-sync:4.9.1") \
    .config("spark.jars.packages", "org.mongodb:bson:4.9.1") \
    .getOrCreate()


In [4]:
# read data from mongodb collection "tweets" into a dataframe "df"
df = spark.read \
    .format("mongodb") \
    .option("connection.uri", mongo_uri) \
    .option("database", "ca2") \
    .option("collection", "vaccin_tweets_2") \
    .load()

# Sentiment analysis with pretrained model 

https://aclanthology.org/2020.findings-emnlp.148

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest



In [5]:
#!pip install tweetnlp

In [6]:
model = tweetnlp.Sentiment()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Preprocess tweets to remove user info and any web url as Roberta 
def parse_tweet(tweet):
    new_text = []
    for t in tweet.split(" "):
        t = "@user" if t.startswith("@") and len(t) > 1 else t
        t = "http" if t.startswith("http") else t
        new_text.append(t)
    return " ".join(new_text)

# generate sentiment scores
def sentiment_analyzer(text, model):
    output = model.sentiment(text, return_probability=True)
    return (output['label'], output['probability'][output['label']])

def process_data(df, start_index, end_index, skip_to=0):
    total_count = df_pd['_id'].count()
    num_batches = (total_count // batch_size) + 1

    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        if(start < skip_to):
            continue
        
        print(f"processing from {start} to {end}...")
        # Select the batch of data
        batch_df = df[(df['index'] >= start) & (df['index'] < end)]
        
        # Apply the sentiment function to 'text' column
        batch_df['sentiment'], batch_df['s_probability'] = zip(*batch_df['text'].progress_apply(lambda x: sentiment_analyzer(x,model)))
        
        # Convert the pandas DataFrame to a list of dictionaries
        data_to_insert = batch_df[['_id','sentiment','s_probability']].to_dict('records')

        # Insert data into MongoDB
        collection.insert_many(data_to_insert)
        print(f"processed {end} of {total_count} - {round((end / total_count)):.2%}")


In [15]:
print(f"Loading tweets in memory")
df_pd = df.toPandas()
df_pd = df_pd[['_id','text']]
df_pd['text'] = df_pd['text'].apply(lambda x: parse_tweet(x))

# Add index to your dataframe
df_pd['index'] = df_pd.index


# Start client connection to mongo
client = MongoClient(mongo_uri)
db = client["ca2"]
collection = db["vaccin_tweets_2_sentiment"]

Loading tweets in memory


In [16]:
# set batch process
batch_size = 2000
start_index = 0
end_index = df_pd['index'].max()

In [17]:
print(f"Starting process_data")
process_data(df_pd,start_index,end_index,skip_to=292000)

Starting process_data
processing from 292000 to 294000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:35<00:00, 12.87it/s]


processed 294000 of 317631 - 100.00%
processing from 294000 to 296000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:50<00:00, 11.76it/s]


processed 296000 of 317631 - 100.00%
processing from 296000 to 298000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:47<00:00, 11.97it/s]


processed 298000 of 317631 - 100.00%
processing from 298000 to 300000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:43<00:00, 12.24it/s]


processed 300000 of 317631 - 100.00%
processing from 300000 to 302000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:44<00:00, 12.19it/s]


processed 302000 of 317631 - 100.00%
processing from 302000 to 304000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:44<00:00, 12.13it/s]


processed 304000 of 317631 - 100.00%
processing from 304000 to 306000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:46<00:00, 12.02it/s]


processed 306000 of 317631 - 100.00%
processing from 306000 to 308000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:44<00:00, 12.14it/s]


processed 308000 of 317631 - 100.00%
processing from 308000 to 310000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:39<00:00, 12.53it/s]


processed 310000 of 317631 - 100.00%
processing from 310000 to 312000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:08<00:00, 10.63it/s]


processed 312000 of 317631 - 100.00%
processing from 312000 to 314000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:57<00:00, 11.27it/s]


processed 314000 of 317631 - 100.00%
processing from 314000 to 316000...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:56<00:00, 11.35it/s]


processed 316000 of 317631 - 100.00%
processing from 316000 to 318000...


100%|██████████████████████████████████████████████████████████████████████████████| 1631/1631 [02:27<00:00, 11.08it/s]

processed 318000 of 317631 - 100.00%





In [18]:
print(f"*****End******")
client.close()

*****End******
