In [1]:
from pyspark.sql import SparkSession

### Create SparkSession object

In [2]:
# `master` selects where the application runs. On a real cluster it would be
# set to the cluster manager's master URL, which is usually YARN or Mesos,
# depending on the cluster type.
#
# "local[N]" runs Spark on this machine with N worker threads. In local mode
# that thread count also serves as the default number of partitions for
# `parallelize` when no partition count is given, which is why RDDs created
# below without an explicit count report a single partition.
#
# Ideally N equals the number of CPU cores ("local[*]" uses all of them).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("sid_spark")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 17:37:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Create RDD from Python list

In [3]:
# Distribute a plain Python list as an RDD. No partition count is given,
# so Spark falls back to the default parallelism of the session.
numbers = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(numbers)

In [7]:
# `count()` is an action: it triggers the job and returns the number of elements.
element_count = rdd.count()
print(element_count)

5


In [5]:
# Report how many partitions the RDD was split into.
default_partition_count = rdd.getNumPartitions()
print("Number of partitions in RDD:", default_partition_count)

Number of partitions in RDD: 1


### Manually set the number of partitions in an RDD

In [6]:
# The second argument to `parallelize` sets the number of partitions
# explicitly instead of relying on the session's default parallelism.
requested_partitions = 5
rdd_partitioned = spark.sparkContext.parallelize([1,2,3,4,5], requested_partitions)

# Confirm that the RDD really has the requested number of partitions.
actual_partitions = rdd_partitioned.getNumPartitions()
print("Number of partitions in RDD:", actual_partitions)

Number of partitions in RDD: 5


### Create empty RDD

In [8]:
# Empty RDD with no partitions and no elements.
# In PySpark `emptyRDD` is a method and must be called; without the
# parentheses `rdd_empty` would be the bound method itself, not an RDD
# (the parenthesis-free form only works in the Scala API).
rdd_empty = spark.sparkContext.emptyRDD()

In [9]:
# Empty RDD (no elements) that is nevertheless split into 10 partitions,
# obtained by parallelizing an empty list with an explicit partition count.
rdd_empty_partitioned = spark.sparkContext.parallelize([], numSlices=10)

### Repartition and Coalesce

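`repartition` and `coalesce` both change the number of partitions in an RDD to the number specified.
`repartition` always performs a full shuffle of the data, whereas `coalesce` (by default) avoids a full shuffle and moves only the minimum amount of data needed to merge existing partitions, so it is intended for reducing the partition count.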

For example, in an RDD with 4 partitions:

`repartition(2)`: shuffles all 4 partitions and creates 2 new partitions

`coalesce(2)`: merges the 4 partitions into 2 without a full shuffle; data only needs to move out of the `4-2=2` partitions that are removed, leaving 2 resultant partitions

In [10]:
# `repartition` redistributes all the data with a full shuffle into 4 partitions.
repart_rdd = rdd_partitioned.repartition(4)
# `coalesce` merges existing partitions down to 2, without a shuffle by default.
coal_rdd = rdd_partitioned.coalesce(2)

# Show the resulting partition counts: repartitioned RDD first, coalesced second.
for resized_rdd in (repart_rdd, coal_rdd):
    print("Number of partitions:", resized_rdd.getNumPartitions())

Number of partitions: 4
Number of partitions: 2
