In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.functions import *

mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2"

# Spark version 3.2.3
# MongoDB version 6.0.5
# Java Version 11

# Create the Spark session used to read from and write to MongoDB.
# Jar dependencies are resolved from the Maven repository:
# https://mvnrepository.com/search?q=mongodb-driver-sync
#
# `spark.jars.packages` takes ONE comma-separated list of Maven coordinates.
# Calling .config() repeatedly with the same key keeps only the last value,
# so all required packages are joined into a single setting here.
spark = (
    SparkSession.builder
    .appName('Tweets')
    .config("spark.mongodb.read.connection.uri", mongo_uri)
    .config("spark.mongodb.write.connection.uri", mongo_uri)
    .config(
        "spark.jars.packages",
        ",".join([
            "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
            "org.mongodb:mongodb-driver-core:4.9.1",
            "org.mongodb:mongodb-driver-sync:4.9.1",
            "org.mongodb:bson:4.9.1",
        ]),
    )
    .getOrCreate()
)

# Load the MongoDB collection "tweets_202004" (database "ca2") into DataFrame `df`.
df = (
    spark.read
    .format("mongodb")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", "tweets_202004")
    .load()
)

# Show the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- _id: long (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |-- text: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestamp_ms: long (nullable = true)



## Tweets Statistics

### Total tweet count by laboratory

In [2]:
# Hashtags are matched exactly as given, so no lower-casing is applied here.
def contains_hashtags(column, hashtags):
    """Build an OR-combined "contains" condition over several hashtags.

    Args:
        column: Object exposing ``.contains(value)`` whose results can be
            combined with ``|`` (e.g. a Spark ``Column``).
        hashtags: Iterable of hashtag strings to look for.

    Returns:
        ``column.contains(h1) | column.contains(h2) | ...`` combined left to
        right, or ``None`` when ``hashtags`` is empty.
    """
    matches = [column.contains(tag) for tag in hashtags]
    if not matches:
        return None

    combined = matches[0]
    for match in matches[1:]:
        combined = combined | match
    return combined


# Create topic tweets for "Vaccin" (matches vaccine, vaccination, ...)

In [3]:
# Case-insensitive substring match: both the tweet text and the topic are
# lower-cased, so "Vaccin" also matches e.g. "vaccine" and "vaccination".
topic = "Vaccin"
df_vaccine = df.where(
    F.lower(F.col("text")).contains(topic.lower())
)
print(f"Total tweets for {topic} {df_vaccine.count()}")

Total tweets for Vaccin 15162


In [4]:
# Save the topic tweets to MongoDB database "ca2"; the target collection name
# is derived from the lower-cased topic. Mode "overwrite" replaces any
# existing content of that collection.
(
    df_vaccine.write
    .format("mongodb")
    .mode("overwrite")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", f"{topic.lower()}_tweets_2_202004")
    .save()
)