# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Import necessary modules
import logging
from pyspark.sql import SparkSession

# Configure logging so the INFO-level results below are actually emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize SparkSession (getOrCreate returns the active session if one exists).
spark = SparkSession.builder.appName("RDD Example from Array").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when a Spark action raises; otherwise it would be left running.
try:
    # Example data array (Python list)
    data_array = ["Apache Spark", "is", "a unified analytics engine", "for large-scale data processing."]

    # Parallelize the data array to create an RDD (Resilient Distributed Dataset)
    rdd = spark.sparkContext.parallelize(data_array)

    # Count the number of elements (items) in the RDD
    num_elements = rdd.count()
    logger.info("Number of elements in the RDD: %s", num_elements)

    # Count the total number of words across all items in the RDD
    # - flatMap splits each string into words, producing one flat RDD of words
    # - split() with no argument splits on any run of whitespace and drops
    #   empty strings, so repeated spaces, tabs, or empty items are not
    #   miscounted as words (split(" ") would count them)
    # - count then counts the total number of words
    num_words = rdd.flatMap(lambda line: line.split()).count()
    logger.info("Number of words in the RDD: %s", num_words)
finally:
    # Stop the SparkSession to free up resources
    spark.stop()