In [None]:
# Pivot and Un-Pivot Data Frame

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build the session for this notebook (getOrCreate returns an existing one if present)
spark = (
    SparkSession.builder
    .appName("Pivot & Un-Pivot")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary
spark

In [3]:
# Example data set: one row per (student, subject) with the marks obtained.
# Not every student has every subject, so the pivoted result will contain nulls.

_cols = ["NAME", "SUBJECT", "MARKS"]

_data = [
    # Ramesh
    ["Ramesh", "PHY", 90],
    ["Ramesh", "MATH", 95],
    ["Ramesh", "CHEM", 100],
    # Sangeeta
    ["Sangeeta", "PHY", 90],
    ["Sangeeta", "MATH", 100],
    ["Sangeeta", "CHEM", 83],
    # Mohan
    ["Mohan", "BIO", 90],
    ["Mohan", "MATH", 70],
    ["Mohan", "CHEM", 76],
    # Imran
    ["Imran", "PHY", 96],
    ["Imran", "MATH", 87],
    ["Imran", "CHEM", 79],
    ["Imran", "BIO", 82],
]

# Turn the rows into a DataFrame and print it without truncating cell values
df = spark.createDataFrame(data=_data, schema=_cols)
df.show(truncate=False)

+--------+-------+-----+
|NAME    |SUBJECT|MARKS|
+--------+-------+-----+
|Ramesh  |PHY    |90   |
|Ramesh  |MATH   |95   |
|Ramesh  |CHEM   |100  |
|Sangeeta|PHY    |90   |
|Sangeeta|MATH   |100  |
|Sangeeta|CHEM   |83   |
|Mohan   |BIO    |90   |
|Mohan   |MATH   |70   |
|Mohan   |CHEM   |76   |
|Imran   |PHY    |96   |
|Imran   |MATH   |87   |
|Imran   |CHEM   |79   |
|Imran   |BIO    |82   |
+--------+-------+-----+



In [46]:
# Let's create a simple Python decorator - {get_time} - to measure execution time
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, immediately, and print how long the call took.

    Meant to be used as a decorator on a zero-argument function: the timing
    line is printed as soon as the decorated ``def`` statement is executed.

    Args:
        func: Callable taking no arguments. Its return value is discarded.

    Returns:
        ``func`` itself, unchanged, so the decorated name stays bound to a
        callable (previously the decorator returned ``None``, which silently
        rebound the decorated name to ``None``).

    Raises:
        Whatever ``func`` raises; nothing is printed in that case.
    """
    def inner_get_time() -> str:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump when the system
        # clock is adjusted.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Execution time: {(end_time - start_time) * 1000} ms"

    print(inner_get_time())
    return func

### Method 1 - Without specifying column names

In [83]:
# Pivot data without specifying the column names(values) and checking the execution time
from pyspark.sql.functions import sum

@get_time
def x():
    """Build the pivoted DataFrame without listing the pivot values.

    NOTE: no action (show/collect) is called on the result here. Per the
    PySpark docs, pivot() without explicit values first has to compute the
    distinct values of the pivot column, which is presumably what dominates
    this timing - confirm in the Spark UI.
    """
    grouped_by_name = df.groupBy("NAME")
    grouped_by_name.pivot("SUBJECT").agg(sum("MARKS"))

Execution time: 315.8917427062988 ms


In [84]:
# Let's check the data and schema of the pivot built without explicit values
pivot_df_1 = (
    df.groupBy("NAME")
    .pivot("SUBJECT")
    .agg(sum("MARKS"))
)
pivot_df_1.printSchema()
pivot_df_1.show(truncate=False)

root
 |-- NAME: string (nullable = true)
 |-- BIO: long (nullable = true)
 |-- CHEM: long (nullable = true)
 |-- MATH: long (nullable = true)
 |-- PHY: long (nullable = true)

+--------+----+----+----+----+
|NAME    |BIO |CHEM|MATH|PHY |
+--------+----+----+----+----+
|Mohan   |90  |76  |70  |null|
|Ramesh  |null|100 |95  |90  |
|Imran   |82  |79  |87  |96  |
|Sangeeta|null|83  |100 |90  |
+--------+----+----+----+----+



### Method 2 - Specifying column names

In [92]:
# Get the time for extracting the distinct list
@get_time
def x():
    """Collect the distinct SUBJECT values as a Python list (result discarded)."""
    distinct_subjects = df.select("SUBJECT").distinct()
    # Each row has a single column, so position 0 is the subject value
    distinct_subjects.rdd.map(lambda row: row[0]).collect()

Execution time: 463.40203285217285 ms


In [93]:
# Collect the distinct subjects into a plain Python list
_subjects = (
    df.select("SUBJECT")
    .distinct()
    .rdd
    .map(lambda row: row[0])  # single-column rows: position 0 is the subject
    .collect()
)
_subjects

['PHY', 'MATH', 'CHEM', 'BIO']

In [94]:
# Pivot data specifying the column names(values) and checking the execution time
from pyspark.sql.functions import sum

@get_time
def x():
    """Build the pivoted DataFrame with the pivot values passed explicitly.

    NOTE: no action (show/collect) is called on the result here, so this
    timing does not include executing the aggregation itself.
    """
    grouped_by_name = df.groupBy("NAME")
    grouped_by_name.pivot("SUBJECT", _subjects).agg(sum("MARKS"))

Execution time: 22.889137268066406 ms


In [95]:
# Let's check the data and schema of the pivot built with explicit values;
# the output columns follow the order of the values in _subjects
pivot_df_2 = (
    df.groupBy("NAME")
    .pivot("SUBJECT", _subjects)
    .agg(sum("MARKS"))
)
pivot_df_2.printSchema()
pivot_df_2.show(truncate=False)

root
 |-- NAME: string (nullable = true)
 |-- PHY: long (nullable = true)
 |-- MATH: long (nullable = true)
 |-- CHEM: long (nullable = true)
 |-- BIO: long (nullable = true)

+--------+----+----+----+----+
|NAME    |PHY |MATH|CHEM|BIO |
+--------+----+----+----+----+
|Mohan   |null|70  |76  |90  |
|Ramesh  |90  |95  |100 |null|
|Imran   |96  |87  |79  |82  |
|Sangeeta|90  |100 |83  |null|
+--------+----+----+----+----+

