In [1]:
# Initialization
import os
import sys

# Paths of the local Spark installation and of its bundled Python libraries.
spark_home = "/home/talentum/spark"
pylib_dir = spark_home + "/python/lib"

# Export the Spark locations and the Python interpreter paths in one go.
# Replace both interpreter paths with /usr/bin/python2.7 to use Python 2.
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib_dir,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Put the PySpark and py4j archives at the front of the import path
# (pyspark.zip first, then py4j).
sys.path[:0] = [
    pylib_dir + "/pyspark.zip",
    pylib_dir + "/py4j-0.10.7-src.zip",
]

# NOTE: list the packages you need in PYSPARK_SUBMIT_ARGS.
# Alternatives that are kept for reference:
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [2]:
# Entry point for Spark 2.x: a SparkSession with Hive support enabled.
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to the chain before getOrCreate().
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext of the session; the RDD examples create their RDDs through it.
sc = spark.sparkContext

$$\Large\text{Code}$$

**PairRDD**
- Related methods
    - `reduceByKey(func)`
    - `groupByKey()`
    - `sortByKey()`
    - `join()`
- Some more actions
    - reduce()
    

$$\text{Creation}$$

In [17]:
# Method 1: parallelize a list whose elements are already (key, value) tuples
my_tuple = [
    ('Sam', 23),
    ('Mary', 34),
    ('Peter', 25),
]
paraRDD_tuple = sc.parallelize(my_tuple)

# Bring the pairs back to the driver and show them
collected_pairs = paraRDD_tuple.collect()
print(collected_pairs)

[('Sam', 23), ('Mary', 34), ('Peter', 25)]


In [21]:
# Method 2: start from a regular RDD of "name number" strings and map each
# string to a (name, number) tuple
def _to_name_number_pair(record):
    """Split a 'name number' string on spaces into a (name, int(number)) tuple."""
    fields = record.split(' ')
    return (fields[0], int(fields[1]))


my_list = ['Sam 23', 'Mary 34', 'Peter 25']
regularRDD = sc.parallelize(my_list)
pairRDD_RDD = regularRDD.map(_to_name_number_pair)

# Show the source strings first, then the derived pairs
print(regularRDD.collect())
print(pairRDD_RDD.collect())

['Sam 23', 'Mary 34', 'Peter 25']
[('Sam', 23), ('Mary', 34), ('Peter', 25)]


$$\text{reduceByKey(func)}$$

In [30]:
# (key, value) pairs; the key 'Mary' appears twice, so its values get merged
my_tuple = [
    ('Sam', 23),
    ('Mary', 34),
    ('Peter', 25),
    ('Mary', 2),
]

# Distribute the pairs as an RDD
paraRDD_tuple = sc.parallelize(my_tuple)

# Combine the values that share a key by adding them together
reduced_paraRDD_tuple = paraRDD_tuple.reduceByKey(
    lambda total, value: total + value
)

# Fetch the reduced pairs and display them
reduced_pairs = reduced_paraRDD_tuple.collect()
print(reduced_pairs)

[('Sam', 23), ('Mary', 36), ('Peter', 25)]


$$\text{groupByKey()}$$

In [28]:
# (key, airport code) pairs; the key 'US' appears twice
airports = [('US', 'JFK'), ('UK', 'LHR'), ('FR', 'CGD'), ('US', 'SFO')]

# Create an RDD from the list
regularRDD = sc.parallelize(airports)

# Group the values by key: one (key, iterable of values) element per distinct key
pairRDD_group = regularRDD.groupByKey()

# Collect the grouped pairs to the driver
pairRDD_group_collected = pairRDD_group.collect()

# Print each key with its grouped values; the values arrive as an iterable,
# so turn them into a list for display
for cont, air in pairRDD_group_collected:
    print(cont, list(air))

# Print the types at each nesting level of the collected result:
# the result itself, its first element, and that element's key and values
print(type(pairRDD_group_collected))
print(type(pairRDD_group_collected[0]))
print(type(pairRDD_group_collected[0][0]))
print(type(pairRDD_group_collected[0][1]))

# count() is an action, so call it once and reuse the result for both prints
# instead of running it a second time
group_count = pairRDD_group.count()
print(type(group_count))
print(group_count)

# Print the type returned by take() on the RDD
print(type(pairRDD_group.take(2)))

US ['JFK', 'SFO']
UK ['LHR']
FR ['CGD']
<class 'list'>
<class 'tuple'>
<class 'str'>
<class 'pyspark.resultiterable.ResultIterable'>
<class 'int'>
3
<class 'list'>


$$\text{sortByKey()}$$

In [32]:
# Unordered (key, value) pairs to be sorted by key
my_tuple = [
    ('Sam', 23),
    ('Mary', 34),
    ('Peter', 25),
    ('John', 30),
]

# Distribute the pairs as an RDD
paraRDD_tuple = sc.parallelize(my_tuple)

# Ascending key order (spelled out here; it is also the default)
sorted_RDD = paraRDD_tuple.sortByKey(ascending=True)
ascending_pairs = sorted_RDD.collect()
print(ascending_pairs)

# Descending key order
sorted_desc_RDD = paraRDD_tuple.sortByKey(ascending=False)
descending_pairs = sorted_desc_RDD.collect()
print(descending_pairs)

[('John', 30), ('Mary', 34), ('Peter', 25), ('Sam', 23)]
[('Sam', 23), ('Peter', 25), ('Mary', 34), ('John', 30)]


$$\text{join()}$$

In [31]:
# Left-hand pairs: name -> number
name_number_pairs = [('Sam', 23), ('Mary', 34), ('Peter', 25)]
rdd1 = sc.parallelize(name_number_pairs)

# Right-hand pairs: name -> profession
name_profession_pairs = [('Sam', 'Engineer'), ('Mary', 'Doctor'), ('John', 'Lawyer')]
rdd2 = sc.parallelize(name_profession_pairs)

# Inner join: only keys that are present in both RDDs
joined_RDD = rdd1.join(rdd2)
inner_rows = joined_RDD.collect()
print(inner_rows)

# Left outer join: every key of rdd1, with None where rdd2 has no match
left_joined_RDD = rdd1.leftOuterJoin(rdd2)
left_rows = left_joined_RDD.collect()
print(left_rows)

# Right outer join: every key of rdd2, with None where rdd1 has no match
right_joined_RDD = rdd1.rightOuterJoin(rdd2)
right_rows = right_joined_RDD.collect()
print(right_rows)
 

[('Mary', (34, 'Doctor')), ('Sam', (23, 'Engineer'))]
[('Mary', (34, 'Doctor')), ('Sam', (23, 'Engineer')), ('Peter', (25, None))]
[('Mary', (34, 'Doctor')), ('Sam', (23, 'Engineer')), ('John', (None, 'Lawyer'))]
