In [None]:
# Standard library
import shutil

# Where is SPARK installed on the VM? Find and import the SPARK library
import findspark
findspark.init()

import pyspark

In [None]:
# First step: create the SPARK context. Without this, SPARK functionality cannot be used.
# getOrCreate() returns the already-running context if there is one, so this cell can be
# re-run safely; calling the SparkContext constructor a second time while a context is
# alive raises "ValueError: Cannot run multiple SparkContexts at once".
# Note: when an existing context is reused, the application name given here is ignored.
spark_conf = pyspark.SparkConf().setAppName("SPARkBasics")
sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)

In [None]:
# An RDD (Resilient Distributed Dataset) is the basic SPARK collection.
# RDDs can be built from many sources: a file, an in-memory Python object, etc.
# Here the source is a plain Python list holding the odd numbers 1, 3, 5, ..., 21.
a_list = list(range(1, 22, 2))

# SparkContext.parallelize() creates an RDD from an in-memory collection
rdd1 = sc.parallelize(a_list)

# Compare the two types: a built-in Python list versus a pyspark RDD
print(type(a_list), "\n", type(rdd1))

In [None]:
# Now that we have the RDD, we can invoke RDD functions on it.
# Every call below is an *action*: it runs a SPARK job and returns an ordinary
# Python value to the driver (transformations, by contrast, return a new RDD).
print(rdd1.count())         # number of elements in the RDD
print(rdd1.first())         # the first element
print(rdd1.take(3))         # the first 3 elements, as a list
print(rdd1.top(3))          # the 3 largest elements, in descending order
print(rdd1.sum())           # sum of all elements
print(rdd1.mean())          # arithmetic mean
print(rdd1.min())           # smallest element
print(rdd1.max())           # largest element
print(rdd1.sampleStdev())   # sample standard deviation (divides by N-1, not N)

# ... and many more
# Refer: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.html

In [None]:
# An RDD is distributed. 'collect' is an action that gathers every element back into a
# single Python list on the driver (useful for printing, etc.).
# Use it only on small RDDs: the whole dataset has to fit in the driver's memory.
list_from_rdd = rdd1.collect()
print(list_from_rdd)

In [None]:
# Save an RDD into a text file.
# Note: saveAsTextFile() creates a *directory* named 'rdd_files'. Inside that directory
# there will be a number of files named 'part-XXXXX', where XXXXX is 00000, 00001, and so on.
# The number of such 'part' files depends on the number of partitions of the RDD.
#
# saveAsTextFile() fails if the output directory already exists, so running this cell
# (or the whole notebook) a second time would raise an error. Remove old output first.
# NOTE(review): assumes the relative path resolves to the local file system (local-mode
# SPARK) -- TODO confirm. If the default file system is HDFS, the previous output has to be
# removed with the HDFS tools instead.
OUTPUT_DIR = "rdd_files"
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)  # does nothing when the directory is absent
rdd1.saveAsTextFile(OUTPUT_DIR)

In [None]:
# Print some general information about the SPARK context 'sc', one value per line:
#   applicationId        - unique identifier of this SPARK application
#   defaultMinPartitions - default minimum number of partitions for RDDs when not given by the user
#   defaultParallelism   - default level of parallelism when not given by the user
#   version              - version of SPARK this application is running on
for attr_name in ("applicationId", "defaultMinPartitions", "defaultParallelism", "version"):
    print(getattr(sc, attr_name))

In [None]:
# We are done working with SPARK, so release the SPARK context created at the beginning.
# After stop() this 'sc' object can no longer be used; a new context has to be created
# before running any further SPARK operations.
sc.stop()