In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.types import StructType, StructField, StringType
import os
import time

# Build (or reuse) a SparkSession running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Pyspark SQL")
    .getOrCreate()
)

In [2]:
# Take the SparkContext from the session; the RDD calls below go through it.
sc = spark.sparkContext

In [3]:
# Sample data: the integers 1 through 9.
l = list(range(1, 10))
l

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [4]:
# Distribute the local list as an RDD split into 3 partitions
l_RDD = sc.parallelize(l, numSlices=3)
type(l_RDD)

pyspark.rdd.RDD

In [5]:
# Show the number of partitions in the RDD
l_RDD.getNumPartitions()

3

In [6]:
def get_partition_sum(partition):
    """Yield the sum of all elements in a single partition.

    Written as a generator so it can be handed to ``RDD.mapPartitions``,
    which calls the function with an iterator over one partition's
    elements and expects an iterable of results back.

    Args:
        partition: Iterable of numeric elements.

    Yields:
        The total of the elements; 0 when the partition is empty.
    """
    # The built-in sum() replaces the hand-written accumulator loop.
    yield sum(partition)

In [7]:
# Pass each partition of the RDD to get_partition_sum;
# the resulting RDD holds one sum per partition
part_sum = l_RDD.mapPartitions(get_partition_sum)

In [8]:
# Show the RDD's contents grouped by partition (one list per partition)
l_RDD.glom().collect()

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [9]:
# Collect the per-partition sums back to the driver
part_sum.collect()

[6, 15, 24]

In [10]:
# repartition() can either increase or reduce the number of partitions.
# It always performs a full shuffle of the data across the cluster.
inc_par_RDD = l_RDD.repartition(numPartitions=10)

In [11]:
# Check the partition count after repartitioning
inc_par_RDD.getNumPartitions()

10

In [12]:
# Without a shuffle (the default), coalesce() can only reduce the number
# of partitions; asking for more than the RDD already has leaves the
# count unchanged. Skipping the shuffle is what makes it cheaper than
# repartition() when shrinking.
coal_RDD = l_RDD.coalesce(numPartitions=10, shuffle=False)

In [13]:
# Check the partition count after the coalesce call
coal_RDD.getNumPartitions()

3

In [14]:
# Shut down the SparkSession now that the notebook is finished
spark.stop()