<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/17_spark_dataframe_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark DataFrame Methods**
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html

In [None]:
# Install Java and PySpark
!apt-get update -qq
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark -q

# Set Java home
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

import pyspark
print(pyspark.__version__)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
3.5.1


In [None]:
# Keep NumPy on the 1.x line.
# NOTE(review): presumably pinned because this PySpark release predates NumPy 2 - confirm.
!pip install "numpy<2.0"



In [None]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame API: reuse the running session if there is one,
# otherwise create a new one under the application name 'spark-dataframe'.
spark = (
    SparkSession.builder
    .appName('spark-dataframe')
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import col, lit
from datetime import datetime, date

# Sample employee rows used by every example below.
# Tuple order matches the schema defined further down:
# (employee_id, name, department, salary, age, is_active, hire_date, city)
data = [
    # Complete records (no NULLs)
    (1, "Alice", "Engineering", 75000.50, 28, True, date(2020, 1, 15), "New York"),
    (2, "Bob", "Marketing", 65000.75, 32, False, date(2019, 3, 20), "San Francisco"),
    (3, "Charlie", "Engineering", 82000.25, 35, True, date(2018, 7, 10), "New York"),
    (4, "Diana", "Sales", 58000.00, 29, True, date(2021, 5, 5), "Chicago"),
    (5, "Eve", "HR", 62000.80, 31, False, date(2020, 11, 30), "Boston"),

    # Records with one or more NULL (None) values, spread over different columns
    (6, "Frank", None, 71000.40, 40, True, date(2017, 8, 25), None),
    (7, None, "Engineering", 68000.60, 27, False, date(2022, 2, 14), "Seattle"),
    (8, "Grace", "Marketing", None, 33, True, date(2019, 9, 8), "Austin"),
    (9, "Henry", "Sales", 59000.90, None, False, date(2021, 12, 1), "Denver"),
    (10, "Ivy", "HR", 63000.30, 36, None, None, "Portland"),

    # Edge cases: empty strings and zeros (row 11), very large salary/age (row 12)
    (11, "", "Engineering", 0.0, 0, False, date(2023, 1, 1), ""),
    (12, "Jack", "Sales", 1000000.99, 99, True, date(2015, 12, 31), "Miami"),
    (13, "Karen", "Marketing", 45000.00, 22, True, date(2023, 6, 15), "Atlanta"),
    (14, "Leo", "Engineering", 95000.00, 45, False, date(2016, 4, 18), "New York"),
    (15, "Mona", None, 52000.50, 26, True, date(2022, 8, 9), "Chicago")
]

# Explicit schema: one StructField per column; the third argument (True) marks the column as nullable
schema = StructType([
    StructField("employee_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("salary", DoubleType(), True),
    StructField("age", IntegerType(), True),
    StructField("is_active", BooleanType(), True),
    StructField("hire_date", DateType(), True),
    StructField("city", StringType(), True)
])

# Build the dataframe, then print its schema and its rows (untruncated) as a sanity check
dataframe = spark.createDataFrame(data, schema)
dataframe.printSchema()
dataframe.show(truncate = False)

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- city: string (nullable = true)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|

In [None]:
# show() : an action that prints the first n rows to the console as a text table (it does not return a dataframe)
# truncate = False prints full cell values; vertical = False keeps the one-line-per-row layout

dataframe.show(n = 10, truncate = False, vertical=False)

+-----------+-------+-----------+--------+----+---------+----------+-------------+
|employee_id|name   |department |salary  |age |is_active|hire_date |city         |
+-----------+-------+-----------+--------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5 |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75|32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25|35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8 |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4 |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6 |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL    |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9 |NULL|false    |2021-12-01|Denver       |
|10 

In [None]:
# first() : returns the first row as a single pyspark Row object (not a list)

print(type(dataframe.first()))

<class 'pyspark.sql.types.Row'>


In [None]:
# head(n) : returns the first n rows as a Python list of pyspark Row objects
# (head() called without an argument returns a single Row instead of a list)

print(type(dataframe.head(2)),'\n')
dataframe.head(2)

<class 'list'> 



[Row(employee_id=1, name='Alice', department='Engineering', salary=75000.5, age=28, is_active=True, hire_date=datetime.date(2020, 1, 15), city='New York'),
 Row(employee_id=2, name='Bob', department='Marketing', salary=65000.75, age=32, is_active=False, hire_date=datetime.date(2019, 3, 20), city='San Francisco')]

In [None]:
# take(n) : returns the first n rows as a Python list of pyspark Row objects

print(type(dataframe.take(2)),'\n')
dataframe.take(2)

<class 'list'> 



[Row(employee_id=1, name='Alice', department='Engineering', salary=75000.5, age=28, is_active=True, hire_date=datetime.date(2020, 1, 15), city='New York'),
 Row(employee_id=2, name='Bob', department='Marketing', salary=65000.75, age=32, is_active=False, hire_date=datetime.date(2019, 3, 20), city='San Francisco')]

In [None]:
# collect

# collect() : returns a list of pyspark Row objects, BUT with all the rows of the dataframe (use it on small data only)

dataframe.collect()

[Row(employee_id=1, name='Alice', department='Engineering', salary=75000.5, age=28, is_active=True, hire_date=datetime.date(2020, 1, 15), city='New York'),
 Row(employee_id=2, name='Bob', department='Marketing', salary=65000.75, age=32, is_active=False, hire_date=datetime.date(2019, 3, 20), city='San Francisco'),
 Row(employee_id=3, name='Charlie', department='Engineering', salary=82000.25, age=35, is_active=True, hire_date=datetime.date(2018, 7, 10), city='New York'),
 Row(employee_id=4, name='Diana', department='Sales', salary=58000.0, age=29, is_active=True, hire_date=datetime.date(2021, 5, 5), city='Chicago'),
 Row(employee_id=5, name='Eve', department='HR', salary=62000.8, age=31, is_active=False, hire_date=datetime.date(2020, 11, 30), city='Boston'),
 Row(employee_id=6, name='Frank', department=None, salary=71000.4, age=40, is_active=True, hire_date=datetime.date(2017, 8, 25), city=None),
 Row(employee_id=7, name=None, department='Engineering', salary=68000.6, age=27, is_active=F

| Method    | Returns                    | Usage Example    | Notes                          |
|-----------|----------------------------|------------------|--------------------------------|
| `first()` | First row as Row object    | `df.first()`     | Same as `head(1)[0]`           |
| `head()`  | First row (or n rows)      | `df.head()` or `df.head(5)` | Default returns first row only |
| `take(n)` | First n rows as list       | `df.take(3)`     | Returns list of Row objects    |
| `collect()`| All rows as list          | `df.collect()`   | ⚠️ Brings ALL data to driver  |

# **🚨 IMPORTANT: All of these methods are ACTIONS - they trigger computation and bring data from executors to the driver!**

In [None]:
# isEmpty() : returns True when the dataframe has no rows, otherwise False

dataframe.isEmpty()

False

In [None]:
# count() : an action that returns the number of rows in the dataframe

dataframe.count()

15

In [None]:
# printSchema() : prints the schema to the console as a tree (column name, data type, nullability)

dataframe.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- city: string (nullable = true)



In [None]:
# schema

# it is not a method, it is an attribute of the dataframe object; it holds the StructType that describes the columns

dataframe.schema

StructType([StructField('employee_id', IntegerType(), True), StructField('name', StringType(), True), StructField('department', StringType(), True), StructField('salary', DoubleType(), True), StructField('age', IntegerType(), True), StructField('is_active', BooleanType(), True), StructField('hire_date', DateType(), True), StructField('city', StringType(), True)])

In [None]:
# dtypes

# it is not a method, it is an attribute of the dataframe object; it holds a list of (column name, type name) tuples

dataframe.dtypes

[('employee_id', 'int'),
 ('name', 'string'),
 ('department', 'string'),
 ('salary', 'double'),
 ('age', 'int'),
 ('is_active', 'boolean'),
 ('hire_date', 'date'),
 ('city', 'string')]

In [None]:
# columns

# it is not a method, it is an attribute of the dataframe object; it holds the list of column names

dataframe.columns

['employee_id',
 'name',
 'department',
 'salary',
 'age',
 'is_active',
 'hire_date',
 'city']

In [None]:
# describe() : returns a NEW dataframe with basic statistics for the numeric and string columns
# Evaluating it bare, as below, only displays the schema of that result; chain .show() to see the statistics themselves.

dataframe.describe()

DataFrame[summary: string, employee_id: string, name: string, department: string, salary: string, age: string, city: string]

In [None]:
# summary() : returns a NEW dataframe with summary statistics for the numeric and string columns
# Evaluating it bare, as below, only displays the schema of that result; chain .show() to see the statistics themselves.

dataframe.summary()

DataFrame[summary: string, employee_id: string, name: string, department: string, salary: string, age: string, city: string]

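###### Note : `describe()` and `summary()` report similar statistics, but they are not identical. `describe()` gives count, mean, stddev, min and max, while `summary()` by default also includes the 25%, 50% and 75% percentiles and lets you choose the statistics, e.g. `df.summary('count', 'min', 'max')`.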

In [None]:
# select() : projects columns and returns a new dataframe
# '*' keeps every column; passing column names keeps only those columns

dataframe.select('*').show(n = 2)
dataframe.select('employee_id','name').show(n = 2)

+-----------+-----+-----------+--------+---+---------+----------+-------------+
|employee_id| name| department|  salary|age|is_active| hire_date|         city|
+-----------+-----+-----------+--------+---+---------+----------+-------------+
|          1|Alice|Engineering| 75000.5| 28|     true|2020-01-15|     New York|
|          2|  Bob|  Marketing|65000.75| 32|    false|2019-03-20|San Francisco|
+-----------+-----+-----------+--------+---+---------+----------+-------------+
only showing top 2 rows

+-----------+-----+
|employee_id| name|
+-----------+-----+
|          1|Alice|
|          2|  Bob|
+-----------+-----+
only showing top 2 rows



In [None]:
# selectExpr() : like select(), but every argument is a SQL expression string (functions, aliases, operators)
# '||' is the SQL string-concatenation operator

dataframe.selectExpr('employee_id','upper(name) as upperName').show(2)
dataframe.selectExpr('employee_id','''name||'-'|| department as name_and_department''').show(2, truncate = False)

+-----------+---------+
|employee_id|upperName|
+-----------+---------+
|          1|    ALICE|
|          2|      BOB|
+-----------+---------+
only showing top 2 rows

+-----------+-------------------+
|employee_id|name_and_department|
+-----------+-------------------+
|1          |Alice-Engineering  |
|2          |Bob-Marketing      |
+-----------+-------------------+
only showing top 2 rows



In [None]:
# colRegex() : returns a Column that stands for every column whose name matches a
# regex pattern; it is used together with select() to fetch columns dynamically.
# The pattern is written between backticks. [ac] is a character class meaning
# "a or c" - no comma is needed (a comma inside [...] would be matched literally).

colsToSelect = dataframe.colRegex("`[ac].*`")
print('\n',colsToSelect,'\n')

dataframe.select(colsToSelect).show(2, truncate = False)


 Column<'unresolvedregex()'> 

+---+-------------+
|age|city         |
+---+-------------+
|28 |New York     |
|32 |San Francisco|
+---+-------------+
only showing top 2 rows



In [None]:
# withColumn() : returns a new dataframe with one column added (or replaced, when the name already exists)
# The first argument is the name of the column, so the expression needs no SQL alias of its own.

from pyspark.sql.functions import expr

dataframe.withColumn('name_and_department', expr('''name ||'-'|| department''')).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+-------------+--------------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |name_and_departmenmt|
+-----------+-------+-----------+--------+---+---------+----------+-------------+--------------------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |Alice-Engineering   |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|Bob-Marketing       |
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |Charlie-Engineering |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |Diana-Sales         |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |Eve-HR              |
+-----------+-------+-----------+--------+---+---------+----------+-------------+--------------------+
only showing top 5 rows



In [None]:
# withColumns() : adds (or replaces) several columns in one call; takes a dict of {column name: Column expression}
# Rows where is_active is NULL match no WHEN branch, so they fall through to ELSE and get 0.

dataframe.withColumns({'upperName':expr('upper(name)'),
                       'is_active_binary': expr('case when is_active = True then 1 else 0 end')})\
                       .show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+-------------+---------+----------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |upperName|is_active_binary|
+-----------+-------+-----------+--------+---+---------+----------+-------------+---------+----------------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |ALICE    |1               |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|BOB      |0               |
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |CHARLIE  |1               |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |DIANA    |1               |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |EVE      |0               |
+-----------+-------+-----------+--------+---+---------+----------+-------------+---------+----------------+
only showing top 5 

In [None]:
# withColumnRenamed(existing, new) : returns a new dataframe with a single column renamed

dataframe.withColumnRenamed('name','employee').show(5, truncate = False)

+-----------+--------+-----------+--------+---+---------+----------+-------------+
|employee_id|employee|department |salary  |age|is_active|hire_date |city         |
+-----------+--------+-----------+--------+---+---------+----------+-------------+
|1          |Alice   |Engineering|75000.5 |28 |true     |2020-01-15|New York     |
|2          |Bob     |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|
|3          |Charlie |Engineering|82000.25|35 |true     |2018-07-10|New York     |
|4          |Diana   |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |
|5          |Eve     |HR         |62000.8 |31 |false    |2020-11-30|Boston       |
+-----------+--------+-----------+--------+---+---------+----------+-------------+
only showing top 5 rows



In [None]:
# withColumnsRenamed() : renames several columns in one call; takes a dict of {existing name: new name}

dataframe.withColumnsRenamed({'name':'employee',
                              'hire_date':'joiningDate'}).show(5, truncate = False)

+-----------+--------+-----------+--------+---+---------+-----------+-------------+
|employee_id|employee|department |salary  |age|is_active|joiningDate|city         |
+-----------+--------+-----------+--------+---+---------+-----------+-------------+
|1          |Alice   |Engineering|75000.5 |28 |true     |2020-01-15 |New York     |
|2          |Bob     |Marketing  |65000.75|32 |false    |2019-03-20 |San Francisco|
|3          |Charlie |Engineering|82000.25|35 |true     |2018-07-10 |New York     |
|4          |Diana   |Sales      |58000.0 |29 |true     |2021-05-05 |Chicago      |
|5          |Eve     |HR         |62000.8 |31 |false    |2020-11-30 |Boston       |
+-----------+--------+-----------+--------+---+---------+-----------+-------------+
only showing top 5 rows



In [None]:
# drop() : returns a new dataframe without the given column(s); accepts one or many column names

dataframe.drop('salary').show(5, truncate = False)

dataframe.drop('salary','age').show(5, truncate = False)

+-----------+-------+-----------+---+---------+----------+-------------+
|employee_id|name   |department |age|is_active|hire_date |city         |
+-----------+-------+-----------+---+---------+----------+-------------+
|1          |Alice  |Engineering|28 |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |32 |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|35 |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |29 |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |31 |false    |2020-11-30|Boston       |
+-----------+-------+-----------+---+---------+----------+-------------+
only showing top 5 rows

+-----------+-------+-----------+---------+----------+-------------+
|employee_id|name   |department |is_active|hire_date |city         |
+-----------+-------+-----------+---------+----------+-------------+
|1          |Alice  |Engineering|true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |false    

In [None]:
# filter() : keeps only the rows for which the condition evaluates to true
# eqNullSafe is the null-safe equality test: it yields false (not NULL) when only one side is NULL.
# The plain SQL '=' in the second form yields NULL for a NULL department; filter drops such rows too,
# so against a non-null literal both forms select the same rows.

from pyspark.sql.functions import lit

dataframe.filter(col('department').eqNullSafe(lit('Engineering'))).show(5, truncate = False)

dataframe.filter(expr('''department = 'Engineering' ''')).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
|11         |       |Engineering|0.0     |0  |false    |2023-01-01|        |
|14         |Leo    |Engineering|95000.0 |45 |false    |2016-04-18|New York|
+-----------+-------+-----------+--------+---+---------+----------+--------+

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York

In [None]:
# where() : same call shapes as filter() above - a Column condition or a SQL expression
# eqNullSafe is the null-safe equality test; the plain SQL '=' yields NULL when department is NULL.

from pyspark.sql.functions import lit

dataframe.where(col('department').eqNullSafe(lit('Engineering'))).show(5, truncate = False)

dataframe.where(expr('''department = 'Engineering' ''')).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
|11         |       |Engineering|0.0     |0  |false    |2023-01-01|        |
|14         |Leo    |Engineering|95000.0 |45 |false    |2016-04-18|New York|
+-----------+-------+-----------+--------+---+---------+----------+--------+

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York

#### Note : where is simply an alias for filter

In [None]:
# sort() : returns a new dataframe ordered by the given columns / sort expressions
# asc_nulls_last sorts ascending and places NULL values after all non-null values

from pyspark.sql.functions import asc_nulls_last

dataframe.sort(asc_nulls_last('department')).show(5, truncate = False)
dataframe.sort(asc_nulls_last('department'),asc_nulls_last('name')).show(5, truncate = False)

# dataframe.sort(expr('''department asc nulls last, name asc nulls last''')).show(5, truncate = False) -- THIS DOES NOT WORK
# NOTE(review): presumably because expr() parses a single SQL expression rather than an ORDER BY clause,
# so neither 'asc nulls last' nor a comma-separated list of columns is accepted - confirm.

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|11         |       |Engineering|0.0     |0  |false    |2023-01-01|        |
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|14         |Leo    |Engineering|95000.0 |45 |false    |2016-04-18|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
+-----------+-------+-----------+--------+---+---------+----------+--------+
only showing top 5 rows

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|11         |       |Engineering|0.0     |0  |false

In [None]:
# orderBy() : used here with exactly the same arguments as sort() above
# asc_nulls_last sorts ascending and places NULL values after all non-null values

from pyspark.sql.functions import asc_nulls_last

dataframe.orderBy(asc_nulls_last('department')).show(5, truncate = False)
dataframe.orderBy(asc_nulls_last('department'),asc_nulls_last('name')).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|11         |       |Engineering|0.0     |0  |false    |2023-01-01|        |
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|14         |Leo    |Engineering|95000.0 |45 |false    |2016-04-18|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
+-----------+-------+-----------+--------+---+---------+----------+--------+
only showing top 5 rows

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|11         |       |Engineering|0.0     |0  |false

In [None]:
# sortWithinPartitions() : sorts the rows inside each partition only; no order is established across partitions
# With a plain column name the sort is ascending and NULLs come first (see the first output);
# asc_nulls_last moves the NULLs to the end instead.

dataframe.sortWithinPartitions('department').show(5, truncate = False)
dataframe.sortWithinPartitions(asc_nulls_last('department')).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|6          |Frank  |NULL       |71000.4 |40 |true     |2017-08-25|NULL    |
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston  |
+-----------+-------+-----------+--------+---+---------+----------+--------+
only showing top 5 rows

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
|1          |Alice  |Engineering|750

| Aspect          | `sort()`                           | `orderBy()`                       | `sortWithinPartitions()`          |
|-----------------|------------------------------------|-----------------------------------|-----------------------------------|
| **Purpose**     | Global sorting across all data     | Alias for `sort()` - same function| Sort data within each partition   |
| **Scope**       | **Global** - entire dataset        | **Global** - entire dataset       | **Local** - per partition         |
| **Performance** | Slower (full shuffle)              | Same as `sort()`                  | Faster (no shuffle)               |
| **Use Case**    | Need total order across partitions | Same as `sort()`                  | Order matters within partitions   |

In [None]:
# limit(n) : a transformation that returns a new dataframe with at most n rows; it is show() that prints it to the console

dataframe.limit(10).show(truncate = False)

+-----------+-------+-----------+--------+----+---------+----------+-------------+
|employee_id|name   |department |salary  |age |is_active|hire_date |city         |
+-----------+-------+-----------+--------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5 |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75|32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25|35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8 |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4 |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6 |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL    |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9 |NULL|false    |2021-12-01|Denver       |
|10 

In [None]:
# offset(n) : returns a new dataframe that skips the first n rows

dataframe.offset(10).show(truncate = False)

## a fancier chain to look at: skip 10 rows, keep the next 3, drop the rows whose name is an
## empty string (a NULL name would be kept by the null-safe test), then keep two columns

dataframe.offset(10).limit(3).filter(~col('name').eqNullSafe('')).select('employee_id','name').show(truncate = False)

+-----------+-----+-----------+----------+---+---------+----------+--------+
|employee_id|name |department |salary    |age|is_active|hire_date |city    |
+-----------+-----+-----------+----------+---+---------+----------+--------+
|11         |     |Engineering|0.0       |0  |false    |2023-01-01|        |
|12         |Jack |Sales      |1000000.99|99 |true     |2015-12-31|Miami   |
|13         |Karen|Marketing  |45000.0   |22 |true     |2023-06-15|Atlanta |
|14         |Leo  |Engineering|95000.0   |45 |false    |2016-04-18|New York|
|15         |Mona |NULL       |52000.5   |26 |true     |2022-08-09|Chicago |
+-----------+-----+-----------+----------+---+---------+----------+--------+

+-----------+-----+
|employee_id|name |
+-----------+-----+
|12         |Jack |
|13         |Karen|
+-----------+-----+



In [None]:
# distinct() : returns a new dataframe with duplicate rows removed (rows are compared across all columns)

dataframe.distinct().show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|
+-----------+-------+-----------+--------+---+---------+----------+-------------+
only showing top 5 rows



In [None]:
# dropDuplicates() : without arguments it removes rows that are duplicated across all columns;
# with a list of columns it keeps one row per distinct combination of those columns

dataframe.dropDuplicates().show(5, truncate = False)

dataframe.dropDuplicates(['department','city']).show(5, truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|
+-----------+-------+-----------+--------+---+---------+----------+-------------+
only showing top 5 rows

+-----------+-----+-----------+-------+---+---------+----------+--------+
|employee_id|name |department |salary |age|is_active|hire_date |city    |
+-----------+-----+-----------+-------+---+---------+----------+--------+
|6          |Fr

| Aspect | `distinct()` | `dropDuplicates()` |
|--------|--------------|-------------------|
| **What it does** | Returns distinct **rows** across all columns | Returns distinct rows, can specify columns |
| **Column scope** | **All columns** only | **All columns** OR **specific columns** |
| **Usage** | `df.distinct()` | `df.dropDuplicates()` or `df.dropDuplicates(['col1','col2'])` |
| **Performance** | Same (both cause shuffles) | Same (both cause shuffles) |
| **Result** | Unique rows based on all columns | Unique rows based on specified columns |

In [None]:
# na
# it is not a method itself; it is in fact an accessor to the null-handling sub-methods (drop, fill, replace)

dataframe.na.drop(how = 'all').show(8, truncate=False) # drops a row only when every column is NULL
dataframe.na.fill('Missing').show(8, truncate=False) # a string fill value is applied to the string columns only

dataframe.na.replace('',None).show(truncate = False) # turns empty strings into NULL

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4 |40 |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL    |33 |true     |2019-09-08|Austin       |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
only showing top

| Method | Purpose |
|--------|---------|
| `.na.fill()` | Replace **nulls** with values |
| `.na.drop()` | Remove **null** rows |
| `.na.replace()` | Replace **non-null values** with other values |

So `.na.replace()` is for **changing existing data values**, not for handling missing data!

In [None]:
from pyspark.sql.types import NullType

# fillna() : a string fill value replaces the NULLs of string columns only (numeric NULLs stay NULL in the output)
dataframe.fillna('missing').show(truncate = False)
# convert empty strings to NULL first, so that they get filled as well
dataframe.na.replace('',None).fillna('missing').show(truncate = False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |missing    |71000.4   |40  |true     |2017-08-25|missing      |
|7          |missing|Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

Note : `DataFrame.na.fill()` and `DataFrame.fillna()` are the same method in terms of functionality

In [None]:
# dropna() : removes rows that contain NULL values

dataframe.dropna().show(5,truncate = False) # default how = 'any': drop a row if any column is NULL
dataframe.dropna(how = 'all').show(5,truncate = False) # drop a row only if every column is NULL
dataframe.dropna(subset = ['name']).show(5,truncate = False) # consider only the 'name' column

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75|32 |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0 |29 |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8 |31 |false    |2020-11-30|Boston       |
+-----------+-------+-----------+--------+---+---------+----------+-------------+
only showing top 5 rows

+-----------+-------+-----------+--------+---+---------+----------+-------------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city         |
+-----------+-------+-----------+--------+---+---------+----------+------

Note : `DataFrame.na.drop` and `DataFrame.dropna` are the same methods in terms of the functionality

In [None]:
# groupBy

# Note : count exists both as a method and as a function.
# count() method (on the grouped data) : number of rows per group
# count function (used inside the agg method) : counts a column or an expression, considering only the non-null values
# (that is why Engineering shows 5 rows but only 4 names in the outputs: one name is NULL)

from pyspark.sql.functions import count

dataframe.groupBy('department').count().show() ## not possible to give alias directly

dataframe.groupBy('department').count().withColumnRenamed('count','name_count').show()

dataframe.groupBy('department').agg(count(col('name')).alias('name_count')).show()

+-----------+-----+
| department|count|
+-----------+-----+
|      Sales|    3|
|Engineering|    5|
|         HR|    2|
|       NULL|    2|
|  Marketing|    3|
+-----------+-----+

+-----------+----------+
| department|name_count|
+-----------+----------+
|      Sales|         3|
|Engineering|         5|
|         HR|         2|
|       NULL|         2|
|  Marketing|         3|
+-----------+----------+

+-----------+----------+
| department|name_count|
+-----------+----------+
|      Sales|         3|
|Engineering|         4|
|         HR|         2|
|       NULL|         2|
|  Marketing|         3|
+-----------+----------+



In [None]:
# agg : aggregate over the whole DataFrame (no grouping)

# NOTE: importing `sum` here shadows Python's built-in sum() for the rest of the notebook
from pyspark.sql.functions import expr,sum, avg

# Variant 1: SQL expression strings
dataframe.agg(
    expr('sum(salary) as total_salary'),
    expr('avg(salary) as avg_salary'),
).show(truncate=False)

# Variant 2: column functions with aliases
dataframe.agg(
    sum('salary').alias('total_salary'),
    avg('salary').alias('avg_salary'),
).show(truncate=False)

# Variant 3: select() with the same aggregate columns produces the same result
dataframe.select(
    sum('salary').alias('total_salary'),
    avg('salary').alias('avg_salary'),
).show(truncate=False)

+------------------+------------------+
|total_salary      |avg_salary        |
+------------------+------------------+
|1795005.9899999998|128214.71357142855|
+------------------+------------------+

+------------------+------------------+
|total_salary      |avg_salary        |
+------------------+------------------+
|1795005.9899999998|128214.71357142855|
+------------------+------------------+

+------------------+------------------+
|total_salary      |avg_salary        |
+------------------+------------------+
|1795005.9899999998|128214.71357142855|
+------------------+------------------+



In [None]:
# groupBy + agg : the same aggregations, computed per department

# NOTE: importing `sum` here shadows Python's built-in sum() for the rest of the notebook
from pyspark.sql.functions import expr,sum, avg

# Variant 1: SQL expression strings
(
    dataframe
    .groupBy('department')
    .agg(
        expr('sum(salary) as total_salary'),
        expr('avg(salary) as avg_salary'),
    )
    .show(truncate=False)
)

# Variant 2: column functions with aliases
(
    dataframe
    .groupBy('department')
    .agg(
        sum('salary').alias('total_salary'),
        avg('salary').alias('avg_salary'),
    )
    .show(truncate=False)
)

+-----------+------------+-----------------+
|department |total_salary|avg_salary       |
+-----------+------------+-----------------+
|Sales      |1117001.89  |372333.9633333333|
|Engineering|320001.35   |64000.27         |
|HR         |125001.1    |62500.55         |
|NULL       |123000.9    |61500.45         |
|Marketing  |110000.75   |55000.375        |
+-----------+------------+-----------------+

+-----------+------------+-----------------+
|department |total_salary|avg_salary       |
+-----------+------------+-----------------+
|Sales      |1117001.89  |372333.9633333333|
|Engineering|320001.35   |64000.27         |
|HR         |125001.1    |62500.55         |
|NULL       |123000.9    |61500.45         |
|Marketing  |110000.75   |55000.375        |
+-----------+------------+-----------------+



**Note:**
* For aggregation operations on the entire DataFrame (without grouping), `agg()` and `select()` can be used interchangeably and produce identical results
* After `groupBy()`, aggregations go through `agg()` or the `GroupedData` shortcut methods (`count()`, `sum()`, `avg()`, ...) - the `select()` method is not available on `GroupedData` objects
* `groupBy()` returns a `GroupedData` object which has different methods available than a regular `DataFrame`

In [None]:
# rollup

from pyspark.sql.functions import asc_nulls_last, desc_nulls_last

# Name count per department plus one extra grand-total row (shown with department = NULL)
(
    dataframe
    .rollup('department')
    .agg(count('name').alias('rollupCount'))
    .sort(asc_nulls_last('department'))
    .show(truncate=False)
)

+-----------+-----------+
|department |rollupCount|
+-----------+-----------+
|Engineering|4          |
|HR         |2          |
|Marketing  |3          |
|Sales      |3          |
|NULL       |2          |
|NULL       |14         |
+-----------+-----------+



In [None]:
# rollup + grouping + grouping_id

from pyspark.sql.functions import grouping, grouping_id, desc

# grouping()/grouping_id() tell the grand-total row apart from the group
# whose department value is genuinely NULL
(
    dataframe
    .rollup('department')
    .agg(
        count('name').alias('rollupCount'),
        grouping_id().alias('grouping_id'),
        grouping('department').alias('department_grouping'),
    )
    .sort(asc_nulls_last('department'))
    .show(truncate=False)
)


+-----------+-----------+-----------+-------------------+
|department |rollupCount|grouping_id|department_grouping|
+-----------+-----------+-----------+-------------------+
|Engineering|4          |0          |0                  |
|HR         |2          |0          |0                  |
|Marketing  |3          |0          |0                  |
|Sales      |3          |0          |0                  |
|NULL       |2          |0          |0                  |
|NULL       |14         |1          |1                  |
+-----------+-----------+-----------+-------------------+



##### **Spark DataFrame** : `groupBy()` vs `rollup()`

| Feature | `groupBy()` | `rollup()` |
|---------|-------------|------------|
| **Basic Purpose** | Groups data by specified columns for aggregation | Creates hierarchical subtotals and grand total |
| **Output Levels** | Single level of aggregation | Multiple levels (detailed → subtotals → grand total) |
| **Number of Result Rows** | One row per unique combination of grouping columns | Multiple rows per combination (n+1 levels where n=number of columns) |
| **NULL Handling** | NULLs are treated as distinct groups | NULLs represent aggregated levels |
| **Performance** | Faster for simple groupings | More expensive due to multiple aggregation levels |
| **Use Cases** | Simple aggregations, distinct counts | Financial reports, hierarchical data, business intelligence |
| **Syntax** | `df.groupBy("col1", "col2").agg(...)` | `df.rollup("col1", "col2").agg(...)` |
| **Grouping ID** | Not applicable (always 0) | Essential for identifying aggregation levels |

`grouping(col)`

* Returns 1 if the column is aggregated (NULL in result)
* Returns 0 if the column is present in the current grouping level
* Used to identify which specific columns are aggregated

`grouping_id()`

* Returns a bitmask (integer) representing the aggregation level
* Each bit corresponds to a column in rollup/cube: the last grouping column is the lowest bit, the first grouping column is the highest bit
* 0 = column present, 1 = column aggregated
* More efficient than multiple grouping() calls

`Simple Rule:`

* Use grouping(col) to check individual columns
* Use grouping_id() to identify the complete aggregation level
* Essential for working with rollup() and cube() operations

In [None]:
# cube (shown next to rollup on the same two columns for comparison)

# Both calls share the same aggregations and ordering, so only the grouping method differs
grouping_aggs = [
    count('name').alias('count_names'),
    grouping_id().alias('grouping_id'),
    grouping('department').alias('grouping_department'),
    grouping('city').alias('grouping_id_city'),
]
grouping_order = [asc_nulls_last('department'), asc_nulls_last('city')]

print('--rollup--\n')

(
    dataframe
    .rollup('department', 'city')
    .agg(*grouping_aggs)
    .orderBy(*grouping_order)
    .show(truncate=False)
)

print('--cube--\n')

(
    dataframe
    .cube('department', 'city')
    .agg(*grouping_aggs)
    .orderBy(*grouping_order)
    .show(truncate=False)
)

--rollup--

+-----------+-------------+-----------+-----------+-------------------+----------------+
|department |city         |count_names|grouping_id|grouping_department|grouping_id_city|
+-----------+-------------+-----------+-----------+-------------------+----------------+
|Engineering|             |1          |0          |0                  |0               |
|Engineering|New York     |3          |0          |0                  |0               |
|Engineering|Seattle      |0          |0          |0                  |0               |
|Engineering|NULL         |4          |1          |0                  |1               |
|HR         |Boston       |1          |0          |0                  |0               |
|HR         |Portland     |1          |0          |0                  |0               |
|HR         |NULL         |2          |1          |0                  |1               |
|Marketing  |Atlanta      |1          |0          |0                  |0               |
|Marketin

#### Spark Grouping Methods

| Feature | `groupBy()` | `rollup()` | `cube()` |
|---------|-------------|------------|----------|
| **Basic Purpose** | Simple grouping and aggregation | Hierarchical subtotals (drill-down) | All possible dimension combinations |
| **Output Levels** | Single level - only specified combinations | Multiple hierarchical levels | All possible 2ⁿ combinations |
| **Number of Results** | One row per unique group combination | n+1 levels per hierarchy | 2ⁿ combinations (exponential) |
| **NULL Handling** | NULLs treated as regular values | NULLs represent aggregated levels | NULLs represent aggregated levels |
| **Performance** | Fastest - minimal overhead | Moderate - multiple aggregation levels | Slowest - exponential combinations |
| **Use Cases** | Basic reports, simple analytics | Financial reports, organizational hierarchies | Business intelligence, cross-analysis |
| **Syntax** | `df.groupBy("A","B").agg(...)` | `df.rollup("A","B").agg(...)` | `df.cube("A","B").agg(...)` |
| **Grouping ID** | Not needed (always detailed) | Essential for level identification | Essential for combination identification |
| **Hierarchy** | Flat structure | Parent-child relationships | All dimensions independent |
| **Mathematical Formula** | C(n, k) combinations | n+1 levels | 2ⁿ combinations |

#### SQL Grouping Methods Availability Across Databases

| Database | `GROUP BY` | `ROLLUP` | `CUBE` | Syntax Notes |
|----------|------------|----------|--------|--------------|
| **SQL Server** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **PostgreSQL** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **MySQL** | ✅ | ✅ | ❌ | `GROUP BY a,b WITH ROLLUP` (limited) |
| **Oracle** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Redshift** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Snowflake** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **BigQuery** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **SQLite** | ✅ | ❌ | ❌ | Basic GROUP BY only |
| **DB2** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Teradata** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Spark SQL** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Trino** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |
| **Databricks** | ✅ | ✅ | ✅ | `GROUP BY ROLLUP(a,b)`, `GROUP BY CUBE(a,b)` |

## Additional Notes:

### **Spark SQL:**
- ✅ Full ANSI SQL support for grouping operations
- ✅ Also available via DataFrame API: `df.rollup()`, `df.cube()`
- ✅ Supports `GROUPING SETS`, `GROUPING()`, `GROUPING_ID()`

### **Trino (formerly PrestoSQL):**
- ✅ Full ANSI SQL compliance
- ✅ Advanced grouping operations
- ✅ Excellent performance for analytical queries

### **Databricks:**
- ✅ Built on Spark SQL - full feature parity
- ✅ Optimized for large-scale analytics
- ✅ Both SQL and DataFrame API support
- ✅ Enhanced performance on Databricks runtime

## Complete Big Data Ecosystem Support:

| Platform | SQL Syntax | DataFrame API | Performance | Use Case |
|----------|------------|---------------|-------------|----------|
| **Spark SQL** | ✅ | ✅ | 🟡 Good | General big data processing |
| **Trino** | ✅ | ❌ | 🟢 Excellent | Interactive analytics |
| **Databricks** | ✅ | ✅ | 🟢 Excellent | Enterprise data analytics |

In [None]:
# groupBy + pivot

# Row counts per department, one column per is_active value (NULL gets its own column)
(
    dataframe
    .groupBy('department')
    .pivot('is_active')
    .count()
    .na.fill(0)
    .show(truncate=False)
)

# Rows with a NULL is_active are filtered out first, so no `null` column is produced
(
    dataframe
    .filter(expr('is_active IS NOT NULL'))
    .groupBy('department')
    .pivot('is_active')
    .count()
    .na.fill(0)
    .show(truncate=False)
)

# Counting `name` instead of rows leaves NULL names out of the totals;
# the result is kept because the unpivot / melt cells reuse it
pivotedDataframe = (
    dataframe
    .filter(expr('is_active IS NOT NULL'))
    .groupBy('department')
    .pivot('is_active')
    .agg(count(col('name')))
    .na.fill(0)
)

pivotedDataframe.show(truncate=False)

+-----------+----+-----+----+
|department |null|false|true|
+-----------+----+-----+----+
|Sales      |0   |1    |2   |
|Engineering|0   |3    |2   |
|HR         |1   |1    |0   |
|NULL       |0   |0    |2   |
|Marketing  |0   |1    |2   |
+-----------+----+-----+----+

+-----------+-----+----+
|department |false|true|
+-----------+-----+----+
|Sales      |1    |2   |
|Engineering|3    |2   |
|HR         |1    |0   |
|NULL       |0    |2   |
|Marketing  |1    |2   |
+-----------+-----+----+

+-----------+-----+----+
|department |false|true|
+-----------+-----+----+
|Sales      |1    |2   |
|Engineering|2    |2   |
|HR         |1    |0   |
|NULL       |0    |2   |
|Marketing  |1    |2   |
+-----------+-----+----+



Note :
* `Pivot` can only be used along with the `groupBy object`,
* **NOT** with `rollup` or `cube object`

In [None]:
# unpivot

from pyspark.sql.functions import col,lit

# Turn the `true` / `false` count columns back into (identifier, values) rows per department
pivotedDataframe.unpivot(
    ids='department',
    values=['true', 'false'],
    variableColumnName='identifier',
    valueColumnName='values',
).show(truncate=False)

+-----------+----------+------+
|department |identifier|values|
+-----------+----------+------+
|Sales      |true      |2     |
|Sales      |false     |1     |
|Engineering|true      |2     |
|Engineering|false     |2     |
|HR         |true      |0     |
|HR         |false     |1     |
|NULL       |true      |2     |
|NULL       |false     |0     |
|Marketing  |true      |2     |
|Marketing  |false     |1     |
+-----------+----------+------+



In [None]:
# melt

from pyspark.sql.functions import col,lit

# melt() takes the same arguments as unpivot() and produces the same long-format result:
# department is kept as the id column, and the `true` / `false` columns become
# (identifier, values) rows.
pivotedDataframe.melt(
    ids='department',
    values=['true', 'false'],
    variableColumnName='identifier',
    valueColumnName='values',
).show(truncate=False)

+-----------+----------+------+
|department |identifier|values|
+-----------+----------+------+
|Sales      |true      |2     |
|Sales      |false     |1     |
|Engineering|true      |2     |
|Engineering|false     |2     |
|HR         |true      |0     |
|HR         |false     |1     |
|NULL       |true      |2     |
|NULL       |false     |0     |
|Marketing  |true      |2     |
|Marketing  |false     |1     |
+-----------+----------+------+



Note : melt and unpivot methods are functionally the same


In [None]:
# crosstab : pairwise frequency table (first column's values as rows, second column's values as columns)

# department x is_active
dataframe.crosstab(col1='department', col2='is_active').show(truncate=False)

# department x city
dataframe.crosstab(col1='department', col2='city').show(truncate=False)

+--------------------+-----+----+----+
|department_is_active|false|null|true|
+--------------------+-----+----+----+
|Sales               |1    |0   |2   |
|Engineering         |3    |0   |2   |
|HR                  |1    |1   |0   |
|Marketing           |1    |0   |2   |
|null                |0    |0   |2   |
+--------------------+-----+----+----+

+---------------+---+-------+------+------+-------+------+-----+--------+--------+-------------+-------+----+
|department_city|   |Atlanta|Austin|Boston|Chicago|Denver|Miami|New York|Portland|San Francisco|Seattle|null|
+---------------+---+-------+------+------+-------+------+-----+--------+--------+-------------+-------+----+
|Sales          |0  |0      |0     |0     |1      |1     |1    |0       |0       |0            |0      |0   |
|Engineering    |1  |0      |0     |0     |0      |0     |0    |3       |0       |0            |1      |0   |
|HR             |0  |0      |0     |1     |0      |0     |0    |0       |1       |0            |0 

Note : `crosstab --> groupBy.pivot.count()` , or let's say a frequency table

In [None]:
# join : sample data for the join examples

# Employees reference their department through dept_id
employees_data = [
    (1, "Alice", "IT", 101),
    (2, "Bob", "HR", 102),
    (3, "Charlie", "IT", 101),
    (4, "Diana", "Finance", 103),
]
employees_df = spark.createDataFrame(
    employees_data,
    schema=["emp_id", "name", "dept", "dept_id"],
)

# Departments; 104 (Marketing) has no employee in employees_data
departments_data = [
    (101, "IT", "New York"),
    (102, "HR", "Chicago"),
    (103, "Finance", "Boston"),
    (104, "Marketing", "Seattle"),
]
departments_df = spark.createDataFrame(
    departments_data,
    schema=["dept_id", "dept_name", "location"],
)

employees_df.show()

departments_df.show()

+------+-------+-------+-------+
|emp_id|   name|   dept|dept_id|
+------+-------+-------+-------+
|     1|  Alice|     IT|    101|
|     2|    Bob|     HR|    102|
|     3|Charlie|     IT|    101|
|     4|  Diana|Finance|    103|
+------+-------+-------+-------+

+-------+---------+--------+
|dept_id|dept_name|location|
+-------+---------+--------+
|    101|       IT|New York|
|    102|       HR| Chicago|
|    103|  Finance|  Boston|
|    104|Marketing| Seattle|
+-------+---------+--------+



In [None]:
# join

# The join condition is an expression over both frames, so the joined result carries
# two `dept_id` columns. The right-hand copy is removed by passing the Column reference
# to drop(); a string such as 'departments_df.dept_id' matches no column name and is
# silently ignored.

# inner join: only employees whose dept_id exists in departments_df
(
    employees_df
    .join(departments_df, employees_df.dept_id == departments_df.dept_id, 'inner')
    .drop(departments_df.dept_id)
    .show(truncate=False)
)

# left join: every employee is kept, matched to its department where one exists
(
    employees_df
    .join(departments_df, employees_df.dept_id == departments_df.dept_id, 'left')
    .drop(departments_df.dept_id)
    .show(truncate=False)
)

+------+-------+-------+-------+-------+---------+--------+
|emp_id|name   |dept   |dept_id|dept_id|dept_name|location|
+------+-------+-------+-------+-------+---------+--------+
|1     |Alice  |IT     |101    |101    |IT       |New York|
|3     |Charlie|IT     |101    |101    |IT       |New York|
|2     |Bob    |HR     |102    |102    |HR       |Chicago |
|4     |Diana  |Finance|103    |103    |Finance  |Boston  |
+------+-------+-------+-------+-------+---------+--------+

+------+-------+-------+-------+-------+---------+--------+
|emp_id|name   |dept   |dept_id|dept_id|dept_name|location|
+------+-------+-------+-------+-------+---------+--------+
|1     |Alice  |IT     |101    |101    |IT       |New York|
|2     |Bob    |HR     |102    |102    |HR       |Chicago |
|4     |Diana  |Finance|103    |103    |Finance  |Boston  |
|3     |Charlie|IT     |101    |101    |IT       |New York|
+------+-------+-------+-------+-------+---------+--------+



In [None]:
# crossJoin : cartesian product, every employee row paired with every department row

employees_df.crossJoin(other=departments_df).show(truncate=False)

+------+-------+-------+-------+-------+---------+--------+
|emp_id|name   |dept   |dept_id|dept_id|dept_name|location|
+------+-------+-------+-------+-------+---------+--------+
|1     |Alice  |IT     |101    |101    |IT       |New York|
|1     |Alice  |IT     |101    |102    |HR       |Chicago |
|2     |Bob    |HR     |102    |101    |IT       |New York|
|2     |Bob    |HR     |102    |102    |HR       |Chicago |
|1     |Alice  |IT     |101    |103    |Finance  |Boston  |
|1     |Alice  |IT     |101    |104    |Marketing|Seattle |
|2     |Bob    |HR     |102    |103    |Finance  |Boston  |
|2     |Bob    |HR     |102    |104    |Marketing|Seattle |
|3     |Charlie|IT     |101    |101    |IT       |New York|
|3     |Charlie|IT     |101    |102    |HR       |Chicago |
|4     |Diana  |Finance|103    |101    |IT       |New York|
|4     |Diana  |Finance|103    |102    |HR       |Chicago |
|3     |Charlie|IT     |101    |103    |Finance  |Boston  |
|3     |Charlie|IT     |101    |104    |

In [None]:
# union : sample data for the set-operation examples
# (the two frames share some rows, and each contains the Charlie row twice)

dataframe1_data = [
    (1, "Alice", "IT", 5000, "New York"),
    (2, "Bob", "HR", 4000, "Chicago"),
    (3, "Charlie", "IT", 4500, "New York"),
    (3, "Charlie", "IT", 4500, "New York"),
    (4, "Diana", "Finance", 6000, "Boston"),
    (5, "Eve", "Marketing", 5500, "Seattle"),
]
dataframe1 = spark.createDataFrame(
    dataframe1_data,
    schema=["id", "name", "department", "salary", "city"],
)

dataframe2_data = [
    (3, "Charlie", "IT", 4500, "New York"),   # Duplicate
    (6, "Frank", "IT", 5200, "Austin"),       # New
    (7, "Grace", "HR", 4200, "Chicago"),      # New
    (4, "Diana", "Finance", 6000, "Boston"),  # Duplicate
    (8, "Henry", "Sales", 4800, "Miami"),     # New
    (3, "Charlie", "IT", 4500, "New York"),   # Duplicate
]
dataframe2 = spark.createDataFrame(
    dataframe2_data,
    schema=["id", "name", "department", "salary", "city"],
)

In [None]:
# union : appends the rows of dataframe2 to dataframe1, duplicates included

dataframe1.union(other=dataframe2).show(truncate=False)

+---+-------+----------+------+--------+
|id |name   |department|salary|city    |
+---+-------+----------+------+--------+
|1  |Alice  |IT        |5000  |New York|
|2  |Bob    |HR        |4000  |Chicago |
|3  |Charlie|IT        |4500  |New York|
|3  |Charlie|IT        |4500  |New York|
|4  |Diana  |Finance   |6000  |Boston  |
|5  |Eve    |Marketing |5500  |Seattle |
|3  |Charlie|IT        |4500  |New York|
|6  |Frank  |IT        |5200  |Austin  |
|7  |Grace  |HR        |4200  |Chicago |
|4  |Diana  |Finance   |6000  |Boston  |
|8  |Henry  |Sales     |4800  |Miami   |
|3  |Charlie|IT        |4500  |New York|
+---+-------+----------+------+--------+



In [None]:
# unionAll : gives the same rows as union() above, duplicates included
dataframe1.unionAll(other=dataframe2).show(truncate=False)

+---+-------+----------+------+--------+
|id |name   |department|salary|city    |
+---+-------+----------+------+--------+
|1  |Alice  |IT        |5000  |New York|
|2  |Bob    |HR        |4000  |Chicago |
|3  |Charlie|IT        |4500  |New York|
|3  |Charlie|IT        |4500  |New York|
|4  |Diana  |Finance   |6000  |Boston  |
|5  |Eve    |Marketing |5500  |Seattle |
|3  |Charlie|IT        |4500  |New York|
|6  |Frank  |IT        |5200  |Austin  |
|7  |Grace  |HR        |4200  |Chicago |
|4  |Diana  |Finance   |6000  |Boston  |
|8  |Henry  |Sales     |4800  |Miami   |
|3  |Charlie|IT        |4500  |New York|
+---+-------+----------+------+--------+



**Note:**
- In **Spark DataFrame API**, `union()` and `unionAll()` are identical - both keep duplicates
- In **Spark SQL**, `UNION` removes duplicates while `UNION ALL` keeps duplicates
- This is a known inconsistency between DataFrame API and SQL in Spark

In [None]:
# unionByName

# Same three columns in a different order on each side; unionByName lines them up
# by column name instead of by position.
dataframe1_reordered = dataframe1.select('name','department','id')
# Built from dataframe2 so that two different frames are actually unioned
dataframe2_reordered = dataframe2.select('id','name','department')

dataframe1_reordered.unionByName(dataframe2_reordered).show(truncate= False)

+-------+----------+---+
|name   |department|id |
+-------+----------+---+
|Alice  |IT        |1  |
|Bob    |HR        |2  |
|Charlie|IT        |3  |
|Charlie|IT        |3  |
|Diana  |Finance   |4  |
|Eve    |Marketing |5  |
|Alice  |IT        |1  |
|Bob    |HR        |2  |
|Charlie|IT        |3  |
|Charlie|IT        |3  |
|Diana  |Finance   |4  |
|Eve    |Marketing |5  |
+-------+----------+---+



Note
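* `union()` = "Strict mode" - columns are matched by position, so both DataFrames need the same columns in the same order
* `unionByName()` = "Flexible mode" - columns are matched by name, so the column order can differ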
* `unionByName()` with allowMissingColumns=True = "Forgiving mode" - handles schema evolution

Use unionByName() when working with data from multiple sources or when schemas might change over time!

In [None]:
# intersect : distinct rows that are present in both DataFrames

dataframe1.intersect(other=dataframe2).show(truncate=False)

+---+-------+----------+------+--------+
|id |name   |department|salary|city    |
+---+-------+----------+------+--------+
|4  |Diana  |Finance   |6000  |Boston  |
|3  |Charlie|IT        |4500  |New York|
+---+-------+----------+------+--------+



In [None]:
# intersectAll : rows present in both DataFrames, with duplicates preserved

dataframe1.intersectAll(other=dataframe2).show(truncate=False)

+---+-------+----------+------+--------+
|id |name   |department|salary|city    |
+---+-------+----------+------+--------+
|3  |Charlie|IT        |4500  |New York|
|3  |Charlie|IT        |4500  |New York|
|4  |Diana  |Finance   |6000  |Boston  |
+---+-------+----------+------+--------+



### Intersect vs IntersectAll

##### `intersect()`
- **Shows only unique common records**
- Removes all duplicates  
- **Answers**: "Which records are common?"

##### `intersectAll()`
- **Shows all common records with duplicates**
- Keeps duplicate counts
- **Answers**: "How many times are records common?"

##### Simple Examples:

##### Example 1: `[A, A, B] ∩ [A, B, B]`
- **`intersect()`** = `[A, B]` (only unique)
- **`intersectAll()`** = `[A, B]` (min counts: A=1, B=1)

##### Example 2: `[A, A, B] ∩ [A, A, B]`  
- **`intersect()`** = `[A, B]`
- **`intersectAll()`** = `[A, A, B]` (min counts: A=2, B=1)

##### When to Use:
- **Use `intersect()`** for checking existence
- **Use `intersectAll()`** for counting occurrences

In [None]:
# exceptAll : rows of the left DataFrame that are not in the right one, in both directions

# dataframe1 minus dataframe2
dataframe1.exceptAll(other=dataframe2).show(truncate=False)

# dataframe2 minus dataframe1
dataframe2.exceptAll(other=dataframe1).show(truncate=False)

+---+-----+----------+------+--------+
|id |name |department|salary|city    |
+---+-----+----------+------+--------+
|1  |Alice|IT        |5000  |New York|
|2  |Bob  |HR        |4000  |Chicago |
|5  |Eve  |Marketing |5500  |Seattle |
+---+-----+----------+------+--------+

+---+-----+----------+------+-------+
|id |name |department|salary|city   |
+---+-----+----------+------+-------+
|6  |Frank|IT        |5200  |Austin |
|7  |Grace|HR        |4200  |Chicago|
|8  |Henry|Sales     |4800  |Miami  |
+---+-----+----------+------+-------+



In [None]:
# subtract : rows of the left DataFrame that do not appear in the right one, in both directions

# dataframe1 minus dataframe2
dataframe1.subtract(other=dataframe2).show(truncate=False)

# dataframe2 minus dataframe1
dataframe2.subtract(other=dataframe1).show(truncate=False)

+---+-----+----------+------+--------+
|id |name |department|salary|city    |
+---+-----+----------+------+--------+
|5  |Eve  |Marketing |5500  |Seattle |
|1  |Alice|IT        |5000  |New York|
|2  |Bob  |HR        |4000  |Chicago |
+---+-----+----------+------+--------+

+---+-----+----------+------+-------+
|id |name |department|salary|city   |
+---+-----+----------+------+-------+
|6  |Frank|IT        |5200  |Austin |
|7  |Grace|HR        |4200  |Chicago|
|8  |Henry|Sales     |4800  |Miami  |
+---+-----+----------+------+-------+



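Note : `exceptAll` and `subtract` return the same rows here only because the duplicate counts in the two DataFrames happen to line up. They are not the same in general: `subtract` behaves like SQL `EXCEPT DISTINCT` (the result is de-duplicated), while `exceptAll` behaves like SQL `EXCEPT ALL` (duplicates are preserved and removed one-for-one)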

In [None]:
# sample : random subset of roughly the given fraction of rows

# No seed: the sampled rows may change from run to run
dataframe.sample(fraction=0.1).show(truncate=False)

# Fixed seed: the sample is repeatable
dataframe.sample(fraction=0.1, seed=50).show(truncate=False)

+-----------+----+-----------+-------+---+---------+----------+-------+
|employee_id|name|department |salary |age|is_active|hire_date |city   |
+-----------+----+-----------+-------+---+---------+----------+-------+
|7          |NULL|Engineering|68000.6|27 |false    |2022-02-14|Seattle|
|15         |Mona|NULL       |52000.5|26 |true     |2022-08-09|Chicago|
+-----------+----+-----------+-------+---+---------+----------+-------+

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|8          |Grace  |Marketing  |NULL    |33 |true     |2019-09-08|Austin  |
+-----------+-------+-----------+--------+---+---------+----------+--------+



In [None]:
# sampleBy : stratified sampling by a specified column

# One sampling fraction per department; departments not listed in `fractions`
# do not appear in the result
(
    dataframe
    .sampleBy(col='department', fractions={'Engineering': 0.8, 'Marketing': 0.1}, seed=50)
    .show(truncate=False)
)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|1          |Alice  |Engineering|75000.5 |28 |true     |2020-01-15|New York|
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|7          |NULL   |Engineering|68000.6 |27 |false    |2022-02-14|Seattle |
|8          |Grace  |Marketing  |NULL    |33 |true     |2019-09-08|Austin  |
|11         |       |Engineering|0.0     |0  |false    |2023-01-01|        |
|14         |Leo    |Engineering|95000.0 |45 |false    |2016-04-18|New York|
+-----------+-------+-----------+--------+---+---------+----------+--------+



In [None]:
# randomSplit : randomly splits the dataframe by the given weights and returns a list of dataframes

# A fixed seed keeps the split (and the counts printed below) the same across re-runs
splits = dataframe.randomSplit(weights=[0.2, 0.6, 0.2], seed=50)

# The result is a plain Python list with one DataFrame per weight
print(splits,'\n')

print([type(x) for x in splits],'\n')

# Rows that landed in the first split
splits[0].show()

# Row count of each split
print([x.count() for x in splits])


[DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string], DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string], DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]] 

[<class 'pyspark.sql.dataframe.DataFrame'>, <class 'pyspark.sql.dataframe.DataFrame'>, <class 'pyspark.sql.dataframe.DataFrame'>] 

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|   name| department|  salary|age|is_active| hire_date|    city|
+-----------+-------+-----------+--------+---+---------+----------+--------+
|          3|Charlie|Engineering|82000.25| 35|     true|2018-07-10|New York|
|          4|  Diana|      Sales| 58000.0| 29|     true|2021-05-05| Chicago|
|         15|   Mona|       NULL| 52000.5| 26|     true|2022

#### Spark Sampling Methods Comparison

| Feature | `sample()` | `sampleBy()` | `randomSplit()` |
|---------|------------|--------------|-----------------|
| **Purpose** | Random sampling from entire dataset | Stratified sampling by groups | Split dataset into multiple parts |
| **Sampling Type** | Simple random sampling | Stratified random sampling | Multiple random splits |
| **Control Level** | Dataset level | Group/column level | Dataset level |
| **Output** | Single DataFrame | Single DataFrame | List of DataFrames |
| **Fractions Parameter** | Single fraction for all data | Dict of fractions per group | List of weights for splits |
| **Usage** | `df.sample(0.3)` | `df.sampleBy("dept", {'DataEngineering':0.1, 'Analytics':0.2})` | `df.randomSplit([0.6, 0.4])` |
| **Group Proportionality** | ❌ Not maintained | ✅ Maintained | ❌ Not maintained |
| **Use Case** | Quick random subset | Representative samples by category | Train/validation/test splits |

In [None]:
# repartition : spark_partition_id() is added to show which partition each row lands in

from pyspark.sql.functions import spark_partition_id

# with numPartitions
(
    dataframe
    .repartition(3)
    .select('*', spark_partition_id().alias('spark_partition_id'))
    .show(truncate=False)
)

# with partitionColumns
(
    dataframe
    .repartition('department')
    .select('*', spark_partition_id().alias('spark_partition_id'))
    .show(truncate=False)
)

# with numPartitions, partitionColumns
(
    dataframe
    .repartition(3, 'department')
    .select('*', spark_partition_id().alias('spark_partition_id'))
    .show(truncate=False)
)

+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |spark_partition_id|
+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |0                 |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |0                 |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |0                 |
|15         |Mona   |NULL       |52000.5   |26  |true     |2022-08-09|Chicago      |0                 |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |0                 |
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |1                 |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-

---
#### repartition() Methods
---
| Aspect | `repartition(N)` | `repartition("col")` | `repartition(N, "col")` |
|--------|------------------|---------------------|------------------------|
| **Conceptual Purpose** | Balance data across N partitions | Group same column values together | Group values + control partition count |
| **Data Distribution** | Random shuffle | Hash-based by column value | Hash-based by column value |
| **Partition Count** | Exactly N partitions | Default (200) partitions | Exactly N partitions |
| **Data Locality** | No guarantee - random | Same values in same partition | Same values in same partition |
| **Shuffle Operation** | Full shuffle (expensive) | Full shuffle (expensive) | Full shuffle (expensive) |
| **Performance Impact** | High (shuffles all data) | High (shuffles all data) | High (shuffles all data) |
---
#### Technical Details
---

| Aspect | `repartition(N)` | `repartition("col")` | `repartition(N, "col")` |
|--------|------------------|---------------------|------------------------|
| **Internal Logic** | `RoundRobinPartitioning(N)` | `HashPartitioning(col, spark.sql.shuffle.partitions)` | `HashPartitioning(col, N)` |
| **Data Skew Risk** | Low (random distribution) | High (if column values uneven) | High (if N < distinct values) |
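| **Query Optimization** | Limited benefit | A later `groupBy`/join on the same column may reuse the partitioning and skip a shuffle | A later `groupBy`/join on the same column may reuse the partitioning and skip a shuffle |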
| **Memory Usage** | Balanced across executors | Depends on value distribution | Depends on value distribution |
| **Default N** | User-specified | 200 (spark.sql.shuffle.partitions) | User-specified |
---
#### Use Case Scenarios
---

| Aspect | `repartition(N)` | `repartition("col")` | `repartition(N, "col")` |
|--------|------------------|---------------------|------------------------|
| **Best For** | General load balancing | Filtering/joining on specific column | Optimized storage + query performance |
| **Example Scenario** | Before expensive operations | Before `filter(col=value)` or `join(on=col)` | Before writing partitioned data to disk |
| **When to Avoid** | Small datasets, frequent operations | High-cardinality columns, small datasets | When N < distinct values causing skew |
---
#### Performance Characteristics
---

| Aspect | `repartition(N)` | `repartition("col")` | `repartition(N, "col")` |
|--------|------------------|---------------------|------------------------|
| **Shuffle Cost** | High | High | High |
| **Subsequent Operation Speed** | Moderate improvement | Significant improvement for column-based ops | Best for targeted operations |
| **Storage Efficiency** | Poor | Good for partitioned storage | Excellent for partitioned storage |
| **Risk of Data Skew** | Low | Medium | High if poor N choice |

In [None]:
# repartitionByRange: split the rows into 3 partitions by ranges of 'department'
# values and tag every row with the id of the partition it landed in.

dataframe.repartitionByRange(3,'department')\
         .select('*', spark_partition_id().alias('spark_partition_id')).show(truncate = False)  ## with numPartitions, partitionColumns

+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |spark_partition_id|
+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |0                 |
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |0                 |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |0                 |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |0                 |
|11         |       |Engineering|0.0       |0   |false    |2023-01-01|             |0                 |
|14         |Leo    |Engineering|95000.0   |45  |false    |2016-04-18|New York     |0                 |
|15         |Mona   |NULL       |52000.5   |26  |true     |2022-

#### `repartition()` vs `repartitionByRange()`

| Aspect | `repartition()` | `repartitionByRange()` |
|--------|-----------------|------------------------|
| **Partitioning Method** | Hash partitioning | Range partitioning |
| **Data Distribution** | Based on hash codes of column values | Based on actual value ranges |
| **Ordering** | No ordering across or within partitions | Partitions hold ordered, non-overlapping value ranges; rows inside a partition are not sorted |
| **Use Case** | General data distribution, joins, aggregations | Sorting, window operations, range queries |
| **Performance** | Faster for equal distribution | Slower (requires sampling and sorting) |

In [None]:
# coalesce: request at most 3 partitions and tag every row with its partition id.
# NOTE(review): coalesce only reduces the partition count, so if `dataframe`
# already has fewer than 3 partitions the count stays as it is — confirm with
# dataframe.rdd.getNumPartitions().

dataframe.coalesce(3)\
         .select('*', spark_partition_id().alias('spark_partition_id')).show(truncate = False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |spark_partition_id|
+-----------+-------+-----------+----------+----+---------+----------+-------------+------------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |0                 |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|0                 |
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |0                 |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |0                 |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |0                 |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |0                 |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-

#### repartition() vs coalesce()

#### Core Difference
- **`repartition()`**: Full shuffle, increases/decreases partitions, perfect data balance
- **`coalesce()`**: No shuffle, decreases partitions only, minimal data movement

#### When repartition() is Essential (Not Optional)

#### 1. **Preventing Data Skew**
- Critical for joins on skewed keys (90% users in one country)
- Avoids single executor processing majority of data
- Essential for stable join operations

#### 2. **Memory Management & OOM Prevention**
- Large partitions can crash executors with OutOfMemory errors
- Ensures predictable memory usage across cluster
- Required for production job stability

#### 3. **Optimizing Cluster Utilization**
- Enables maximum parallel processing
- Prevents underutilized CPU cores
- Essential for cost-effective cloud computing

#### 4. **Performance-Critical Operations**
- Before complex aggregations and window functions
- For time-sensitive production pipelines
- When job completion SLAs must be met

#### When to Use Each

| Scenario | Use | Reason |
|----------|-----|--------|
| **Increasing partitions** | `repartition()` | `coalesce()` cannot increase |
| **Data skew present** | `repartition()` | Essential for job stability |
| **Memory concerns** | `repartition()` | Prevents OOM errors |
| **Before expensive joins** | `repartition()` | Ensures balanced execution |
| **Reducing partitions after filter** | `coalesce()` | No shuffle needed |
| **Writing to storage** | `coalesce()` | Avoids small files efficiently |
| **Simple partition reduction** | `coalesce()` | Fast, minimal overhead |

#### Performance & Impact

| Aspect | `repartition()` | `coalesce()` |
|--------|-----------------|--------------|
| **Execution Speed** | Slow (shuffle) | Fast (no shuffle) |
| **Network I/O** | High | None/Low |
| **Data Balance** | Perfect | Partial |
| **Job Reliability** | High | Risk of skew/OOM |
| **Memory Safety** | Excellent | Potentially risky |

#### Critical Decision Points

#### Use `repartition()` when:
- Data distribution is uneven (skew)
- Memory errors occur in current job
- Joining on high-cardinality columns
- Maximum parallelism required
- Production job stability is critical

#### Use `coalesce()` when:
- Simply reducing partition count
- After filtering/aggregation operations
- Writing output files
- Performance is primary concern
- Data is already well-distributed

#### Key Insight
**`repartition()` is not just an optimization - it's essential for reliable Spark job execution with real-world data. While `coalesce()` is more efficient, `repartition()` ensures your jobs don't fail due to skew or memory issues.**

**Choose: `coalesce()` for speed, `repartition()` for reliability**

In [None]:
# cache: mark the DataFrame for caching. This only registers the request and
# returns the same DataFrame; the data is materialized by the next action.

dataframe.cache()

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
dataframe.show(truncate= False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

What is cache()?

> cache() is a method that persists a DataFrame in memory across Spark executors for faster repeated access.

Key Characteristics of `cache`

* **Storage Level**: MEMORY_AND_DISK (default)
* **Persistence**: Survives Spark operations, not Spark restarts
* **Lazy Evaluation**: Cache happens on first action, not immediately
* **Memory Management**: LRU eviction when memory is full

When to Use cache()

✅ Use cache() when:

* Multiple actions on same DataFrame
* Iterative algorithms (ML training)
* Repeated transformations on same base data
* Interactive analysis and debugging
* Complex DAGs with reused DataFrames

❌ Avoid cache() when:

* Single use DataFrames
* Very large datasets that don't fit in memory
* Simple linear workflows
* Memory-constrained environments

In [None]:
# persist: called without an argument, so the default storage level is used

dataframe.persist()

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
dataframe.show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

In [None]:
# unpersist

dataframe.unpersist()

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
from pyspark import StorageLevel
dataframe.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
dataframe.unpersist()

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

`persist()` is a method that allows you to explicitly specify the storage level for persisting a DataFrame, giving you fine-grained control over how data is cached.

---
##### DataFrame.cache() vs persist()

| Aspect | `cache()` | `persist()` | Notes |
|--------|-----------|-------------|-------|
| **Storage Level** | Fixed: `MEMORY_AND_DISK` | Customizable: Any StorageLevel | |
| **Flexibility** | Limited | High | |
| **Use Case** | Simple caching | Advanced memory management | |
| **Syntax** | `df.cache()` | `df.persist()` or `df.persist(StorageLevel.X)` | |
| **Default Behavior** | MEMORY_AND_DISK | MEMORY_AND_DISK (when no parameter) | |
| **Memory Control** | Basic | Granular | |
| **Performance Tuning** | Limited | Extensive | |
| **Unpersist Counterpart** | ✅ `df.unpersist()` | ✅ `df.unpersist()` | **Both use same unpersist()** |
| **Cleanup Method** | Same unpersist() | Same unpersist() | No separate "uncache()" |
---
##### Important Clarification

**Both `cache()` and `persist()` use the SAME `unpersist()` method** - there is no separate `uncache()` method.

---
##### Usage Pattern:
```python
# For cache()
df.cache()
df.unpersist()  # Removes from cache

# For persist()
df.persist(StorageLevel.MEMORY_ONLY)
df.unpersist()  # Removes from persistence

# Both work exactly the same way!
```

In [None]:
# checkpoint: save the DataFrame under the checkpoint directory configured here

spark.sparkContext.setCheckpointDir("/tmp/checkpoints")  # directory that checkpoint() writes to

# The positional argument is `eager`; True asks for the checkpoint to be taken right away.
# NOTE(review): the DataFrame returned by checkpoint() is not assigned, so later
# cells keep using the original `dataframe`.
dataframe.checkpoint(True)

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
# localCheckpoint: eager (True) checkpoint that does not use the checkpoint directory.
# NOTE(review): the returned DataFrame is not assigned here.

dataframe.localCheckpoint(True)

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
dataframe.unpersist() ## NOTE(review): intended to release the local checkpoint, but localCheckpoint(True) returned a new DataFrame that was not kept — confirm that unpersisting the original actually frees it

DataFrame[employee_id: int, name: string, department: string, salary: double, age: int, is_active: boolean, hire_date: date, city: string]

In [None]:
import atexit # this cleans up the path check points

# Register cleanup function
def cleanup_checkpoints(checkpoint_dir="/tmp/checkpoints"):
    """Delete the Spark checkpoint directory and everything inside it.

    Meant to be registered with ``atexit`` so the files written by
    ``DataFrame.checkpoint()`` do not outlive the notebook session.

    Args:
        checkpoint_dir: Directory to remove. Defaults to the path this
            notebook passes to ``setCheckpointDir``.

    Returns:
        None. Prints a confirmation line when a directory was removed; does
        nothing when the directory does not exist.
    """
    # Imported here so the function is self-contained: no visible cell imports
    # shutil, and a NameError raised inside an atexit hook is easy to miss.
    import os
    import shutil

    if os.path.isdir(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
        print("Checkpoints cleaned up")

atexit.register(cleanup_checkpoints)  # Run on exit

**What is checkpoint()?**

---
> checkpoint() is a method that saves a DataFrame to reliable storage (like HDFS) and breaks the lineage graph, providing fault tolerance and preventing expensive recomputation if a failure occurs.
---
✅ Use checkpoint() when:

* Long, complex transformation chains
* Iterative algorithms (ML, graph processing)
* Memory-intensive operations
* Debugging complex pipelines
* Production reliability requirements
* Streaming applications

❌ Avoid checkpoint() when:

* Simple, fast transformations
* Limited storage available
* Development/testing environments
* Linear pipelines with no iterations

`checkpoint` vs `localCheckpoint`

* checkpoint(): Saves to reliable storage (HDFS, S3), survives driver/executor failures
* localCheckpoint(): Saves to executor local storage, faster but less reliable

In [None]:
# corr

# returns a scalar (a single Python float), not a DataFrame

dataframe.corr('salary','age')

0.854241841921367

In [None]:
# cov

# returns a scalar (a single Python float), not a DataFrame

dataframe.cov('salary','age')

4672573.763

In [None]:
# freqItems: frequent values of the listed columns, returned as one array column per input column

dataframe.freqItems(['department']).show(truncate = False)  # default support
dataframe.freqItems(['department'], support=0.5).show(truncate = False) # with an explicit support threshold of 0.5

+-----------------------------------------+
|department_freqItems                     |
+-----------------------------------------+
|[HR, Engineering, Marketing, NULL, Sales]|
+-----------------------------------------+

+--------------------+
|department_freqItems|
+--------------------+
|[Engineering, NULL] |
+--------------------+



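> **freqItems() Threshold Explained**

**Default Threshold**

* By default, freqItems() uses support = 0.01 (1%)

This means:
* Any value that appears in ≥1% of the rows is included
* Values appearing in <1% of rows are normally excluded, but the algorithm is approximate and may return false positives (see the `support=0.5` output above)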

In [None]:
# stat
# accessor on the DataFrame that exposes the statistic functions
# (corr, cov, crosstab, freqItems, sampleBy, approxQuantile)

dataframe.stat.corr('age','salary')

0.854241841921367

In [None]:
dataframe.stat.cov('age','salary')

4672573.763

In [None]:
dataframe.stat.crosstab('department','city').show()

+---------------+---+-------+------+------+-------+------+-----+--------+--------+-------------+-------+----+
|department_city|   |Atlanta|Austin|Boston|Chicago|Denver|Miami|New York|Portland|San Francisco|Seattle|null|
+---------------+---+-------+------+------+-------+------+-----+--------+--------+-------------+-------+----+
|          Sales|  0|      0|     0|     0|      1|     1|    1|       0|       0|            0|      0|   0|
|    Engineering|  1|      0|     0|     0|      0|     0|    0|       3|       0|            0|      1|   0|
|             HR|  0|      0|     0|     1|      0|     0|    0|       0|       1|            0|      0|   0|
|      Marketing|  0|      1|     1|     0|      0|     0|    0|       0|       0|            1|      0|   0|
|           null|  0|      0|     0|     0|      1|     0|    0|       0|       0|            0|      0|   1|
+---------------+---+-------+------+------+-------+------+-----+--------+--------+-------------+-------+----+



In [None]:
dataframe.stat.freqItems(['department'], support=0.4).show(truncate = False)

+--------------------+
|department_freqItems|
+--------------------+
|[Engineering, NULL] |
+--------------------+



In [None]:
dataframe.stat.sampleBy('department', {'Engineering':0.2, 'Marketing':0.3}, seed = 50).show(truncate = False)

+-----------+-------+-----------+--------+---+---------+----------+--------+
|employee_id|name   |department |salary  |age|is_active|hire_date |city    |
+-----------+-------+-----------+--------+---+---------+----------+--------+
|3          |Charlie|Engineering|82000.25|35 |true     |2018-07-10|New York|
|8          |Grace  |Marketing  |NULL    |33 |true     |2019-09-08|Austin  |
+-----------+-------+-----------+--------+---+---------+----------+--------+



In [None]:
# toPandas

# Convert once and reuse the result: every toPandas() call collects the whole
# DataFrame to the driver again, so calling it twice doubles that cost.
employees_pdf = dataframe.toPandas()

print(type(employees_pdf),'\n')
employees_pdf

<class 'pandas.core.frame.DataFrame'> 



Unnamed: 0,employee_id,name,department,salary,age,is_active,hire_date,city
0,1,Alice,Engineering,75000.5,28.0,True,2020-01-15,New York
1,2,Bob,Marketing,65000.75,32.0,False,2019-03-20,San Francisco
2,3,Charlie,Engineering,82000.25,35.0,True,2018-07-10,New York
3,4,Diana,Sales,58000.0,29.0,True,2021-05-05,Chicago
4,5,Eve,HR,62000.8,31.0,False,2020-11-30,Boston
5,6,Frank,,71000.4,40.0,True,2017-08-25,
6,7,,Engineering,68000.6,27.0,False,2022-02-14,Seattle
7,8,Grace,Marketing,,33.0,True,2019-09-08,Austin
8,9,Henry,Sales,59000.9,,False,2021-12-01,Denver
9,10,Ivy,HR,63000.3,36.0,,,Portland


#### dataframe.toPandas()

#### What It Is
- Converts **Spark DataFrame** (distributed, cluster) → **Pandas DataFrame** (local, single machine)
- Moves **all data** from Spark executors to driver node memory

#### When to Use It

#### ✅ Appropriate Use Cases
1. **Visualization** - Plotting libraries (matplotlib, seaborn) need local data
2. **Small Results** - After aggregations when output is small
3. **Python Ecosystem** - Integration with scikit-learn, statsmodels, etc.
4. **Debugging** - Familiar pandas interface for data inspection
5. **Prototyping** - Quick iteration on data samples

#### ❌ When to Avoid
1. **Large Datasets** - Risk of out-of-memory errors
2. **Production Pipelines** - Loses Spark's distributed advantages
3. **Big Data Processing** - Single machine can't handle terabytes

#### Key Trade-offs

| Aspect | Spark DataFrame | Pandas DataFrame |
|--------|-----------------|------------------|
| **Processing** | Distributed | Single machine |
| **Data Size** | Terabytes+ | Limited by RAM |
| **Operations** | Parallel | Single-threaded |
| **Resources** | Cluster | Local |

#### Strategic Usage Pattern
**Process in Spark → Convert small results to Pandas → Use Python ecosystem**

#### Bottom Line
`toPandas()` is a **bridge between distributed computing and Python data science**, but should be used selectively due to memory constraints.

In [None]:
# pandas_api(): wraps the Spark DataFrame in a pandas-on-Spark DataFrame
# (pyspark.pandas.frame.DataFrame, as the printed type shows)
print(type(dataframe.pandas_api()),'\n')

dataframe.pandas_api()

<class 'pyspark.pandas.frame.DataFrame'> 



Unnamed: 0,employee_id,name,department,salary,age,is_active,hire_date,city
0,1,Alice,Engineering,75000.5,28.0,True,2020-01-15,New York
1,2,Bob,Marketing,65000.75,32.0,False,2019-03-20,San Francisco
2,3,Charlie,Engineering,82000.25,35.0,True,2018-07-10,New York
3,4,Diana,Sales,58000.0,29.0,True,2021-05-05,Chicago
4,5,Eve,HR,62000.8,31.0,False,2020-11-30,Boston
5,6,Frank,,71000.4,40.0,True,2017-08-25,
6,7,,Engineering,68000.6,27.0,False,2022-02-14,Seattle
7,8,Grace,Marketing,,33.0,True,2019-09-08,Austin
8,9,Henry,Sales,59000.9,,False,2021-12-01,Denver
9,10,Ivy,HR,63000.3,36.0,,,Portland


#### `toPandas()` vs `pandas_api()`
---
#### Core Architecture
| Aspect | `toPandas()` | `pandas_api()` |
|--------|-------------|----------------|
| **Data Location** | Local memory | Distributed cluster |
| **Processing** | Single machine | Distributed parallel |
| **Data Movement** | All data moves | No data movement |
---
#### Performance & Scale
| Aspect | `toPandas()` | `pandas_api()` |
|--------|-------------|----------------|
| **Max Data Size** | RAM limited | Terabytes+ |
| **Scalability** | Single machine | Cluster scaling |
| **Memory Usage** | High on driver | Distributed |
---
#### Use Cases
| Scenario | `toPandas()` | `pandas_api()` |
|----------|-------------|----------------|
| **Visualization** | ✅ Ideal | ❌ Limited |
| **Python ML libs** | ✅ Full support | ❌ No support |
| **Big data processing** | ❌ Cannot handle | ✅ Perfect |
| **Production pipelines** | ❌ Avoid | ✅ Recommended |
| **Prototyping** | ✅ Excellent | ✅ Good |
---
#### Key Trade-offs
---
#### `toPandas()`
- ✅ Full pandas compatibility
- ✅ Works with all Python libs
- ❌ Memory bound
- ❌ Single machine limit
---
#### `pandas_api()`
- ✅ Handles massive data
- ✅ Distributed processing
- ❌ Partial pandas compatibility
- ❌ Limited third-party support

#### Decision Guide
- **Data fits in RAM** → `toPandas()`
- **Need Python ecosystem** → `toPandas()`
- **Big data processing** → `pandas_api()`
- **Production systems** → `pandas_api()`

**Choose based on data size and required ecosystem integration**

In [None]:
# toJSON: returns an RDD of JSON strings, one per row; mainly used in streaming use cases

print(type(dataframe.toJSON()))
print('\n')
dataframe.toJSON().first()

<class 'pyspark.rdd.RDD'>




'{"employee_id":1,"name":"Alice","department":"Engineering","salary":75000.5,"age":28,"is_active":true,"hire_date":"2020-01-15","city":"New York"}'

In [None]:
# toLocalIterator

# Similar in spirit to pandas.DataFrame.iterrows(): yields the rows one at a
# time, here as Row objects.

dataframeIterator = dataframe.toLocalIterator()

for rw in dataframeIterator:
  print(rw)
  print(rw.name)  # a Row field can be read as an attribute
  break  # the first row is enough for the demo

Row(employee_id=1, name='Alice', department='Engineering', salary=75000.5, age=28, is_active=True, hire_date=datetime.date(2020, 1, 15), city='New York')
Alice


In [None]:
# inputFiles

# This DataFrame was not created from a source file, so the output is an empty list

dataframe.inputFiles()

[]

In [None]:
# approxQuantile: approximate 25th/50th/75th percentiles of salary;
# the last argument (0.01) is the allowed relative error

dataframe.stat.approxQuantile("salary", [0.25, 0.5, 0.75], 0.01)

[58000.0, 63000.3, 75000.5]

In [None]:
# transform

from pyspark.sql.functions import expr

def upperCasing(dataframe, col_name):
  """Return `dataframe` with column `col_name` replaced by its upper-cased value.

  Written for DataFrame.transform(): the DataFrame arrives as the first
  argument and the column name as an extra positional argument.
  """
  # The column name is spliced into a SQL expression string as-is.
  upper_sql = 'upper({})'.format(col_name)
  uppercased = expr(upper_sql)
  return dataframe.withColumn(col_name, uppercased)

dataframe.transform(upperCasing, 'department').show()

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|   name| department|    salary| age|is_active| hire_date|         city|
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|          1|  Alice|ENGINEERING|   75000.5|  28|     true|2020-01-15|     New York|
|          2|    Bob|  MARKETING|  65000.75|  32|    false|2019-03-20|San Francisco|
|          3|Charlie|ENGINEERING|  82000.25|  35|     true|2018-07-10|     New York|
|          4|  Diana|      SALES|   58000.0|  29|     true|2021-05-05|      Chicago|
|          5|    Eve|         HR|   62000.8|  31|    false|2020-11-30|       Boston|
|          6|  Frank|       NULL|   71000.4|  40|     true|2017-08-25|         NULL|
|          7|   NULL|ENGINEERING|   68000.6|  27|    false|2022-02-14|      Seattle|
|          8|  Grace|  MARKETING|      NULL|  33|     true|2019-09-08|       Austin|
|          9|  Henry|      SALES|   59000.9|NULL|    false|2021-1

In [None]:
# createOrReplaceGlobalTempView: register the DataFrame as a global temporary view

dataframe.createOrReplaceGlobalTempView('dataframe_globalTempView')

# The view is queried through the `global_temp` prefix
sql = '''
select *
from global_temp.dataframe_globalTempView'''

spark.sql(sql).show(truncate = False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

In [None]:
# createOrReplaceTempView: register the DataFrame as a temporary view

dataframe.createOrReplaceTempView('dataframe_TempView')

# The view is queried by its bare name, with no prefix
sql = '''
select *
from dataframe_TempView'''

spark.sql(sql).show(truncate = False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

#### GlobalTempView vs TempView
---
#### Scope & Visibility
| Aspect | **TempView** | **GlobalTempView** |
|--------|-------------|-------------------|
| **Scope** | Current session only | All sessions |
| **SQL Reference** | `view_name` | `global_temp.view_name` |

---
#### Creation & Lifetime
| Aspect | **TempView** | **GlobalTempView** |
|--------|-------------|-------------------|
| **Creation Method** | `createOrReplaceTempView()` | `createOrReplaceGlobalTempView()` |
| **Lifetime** | Session duration | Application duration |
---
#### Use Cases
| Scenario | **TempView** | **GlobalTempView** |
|----------|-------------|-------------------|
| **Single-session work** | ✅ Ideal | ❌ Overkill |
| **Cross-session sharing** | ❌ Cannot | ✅ Perfect |
| **Pipeline steps** | ✅ Good | ✅ Good for results |
---
#### Key Difference
**TempView**: Session-private, no prefix needed  
**GlobalTempView**: Shared across all sessions of the same Spark application, requires `global_temp.` prefix
---
#### When to Choose
- **Isolated work** → TempView
- **Shared data** → GlobalTempView