### Setting up Spark Session / Context

In [None]:
from pyspark.sql import SparkSession
from operator import add

# Start from a builder that targets the standalone cluster master and names the application.
session_builder = (
    SparkSession.builder
    .master("spark://192.168.2.35:7077")
    .appName("Lecture1_Example2_with_spark")
)

# Configuration entries, applied to the builder in the order listed.
session_settings = [
    # Dynamic allocation with shuffle tracking, external shuffle service switched off.
    ("spark.dynamicAllocation.enabled", True),
    ("spark.dynamicAllocation.shuffleTracking.enabled", True),
    ("spark.shuffle.service.enabled", False),
    ("spark.dynamicAllocation.executorIdleTimeout", "30s"),
    ("spark.executor.cores", 2),
    # Fixed ports for the driver and block manager
    # (presumably so they can be opened in a firewall -- confirm).
    ("spark.driver.port", 9999),
    ("spark.blockManager.port", 10005),
]
for setting_name, setting_value in session_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already running session if there is one, otherwise create it.
spark_session = session_builder.getOrCreate()

# RDD API: the SparkContext is the entry point used by the cells below.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

### Loading data from the local file system

In [None]:
# Read a text file from the local filesystem of your driver into an RDD of lines.
# NOTE(review): the path has no scheme, so it is resolved against the configured
# default filesystem, and executors presumably need the same path -- confirm.
local_text_path = "/home/ubuntu/DE-2025/data/others/i_have_a_dream.txt"
lines = spark_context.textFile(local_text_path)

# Show the first line as a quick sanity check that the file was read.
lines.first()

In [None]:
# Show how many partitions the `lines` RDD has.
lines.getNumPartitions()

### Loading the data from HDFS

In [None]:
# The same word-count example, now written with map and reduce from the Spark API,
# with the text file loaded from HDFS instead of the local filesystem.

hdfs_text_path = "hdfs://192.168.2.35:9000/data/others/i_have_a_dream.txt"
lines = spark_context.textFile(hdfs_text_path)
print(lines.first())

# Step 1: turn every line into a list of tokens, splitting on single spaces.
# (An empty line yields [''], and consecutive spaces yield empty-string tokens.)
words = lines.map(lambda text_line: text_line.split(' '))

# Step 2: replace each token list by its length, i.e. the per-line token count.
word_counts = words.map(len)

# Step 3: add up the per-line counts across the whole RDD.
total_words = word_counts.reduce(add)

print(f'total words= {total_words}')

# ... the same number of words?

In [None]:
# Bring the first 10 lines of the RDD back to the driver for inspection.
lines.take(10)

In [None]:
# map() produces exactly one output element per input line,
# so each element of the result is a *list* of tokens.
lines_splitted = lines.map(lambda text_line: text_line.split(' '))

# Print the token list for the first line.
first_line_tokens = lines_splitted.first()
print(first_line_tokens)

In [None]:
# Note: we're in Python, but the Spark API uses Java naming conventions (flatMap, not flat_map)!

# flatMap() splits every line like map() above, but flattens the resulting lists,
# so the RDD holds individual tokens instead of one list per line.
all_words = lines.flatMap(lambda text_line: text_line.split(' '))

# Show the first 20 tokens.
all_words.take(20)

In [None]:
# Keep only the tokens that begin with a lowercase 'd' ...
words_starting_with_d = all_words.filter(lambda token: token.startswith('d'))

# ... and show up to 20 of them.
words_starting_with_d.take(20)

In [None]:
# Stop the SparkContext obtained from the session above;
# this releases the cores for another application!
spark_context.stop()