In [0]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session for this notebook; "local[*]" is the
# master URL and "repartition" is the application name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("repartition")
    .getOrCreate()
)

In [0]:
# Employee rows. Every field is kept as a string, in the column order:
# employee_id, department_id, name, age, gender, salary, hire_date.
emp_data = [
    ["001", "101", "John Doe", "30", "Male", "50000", "2015-01-01"],
    ["002", "101", "Jane Smith", "25", "Female", "45000", "2016-02-15"],
    ["003", "102", "Bob Brown", "35", "Male", "55000", "2014-05-01"],
    ["004", "102", "Alice Lee", "28", "Female", "48000", "2017-09-30"],
    ["005", "103", "Jack Chan", "40", "Male", "60000", "2013-04-01"],
    ["006", "103", "Jill Wong", "32", "Female", "52000", "2018-07-01"],
    ["007", "101", "James Johnson", "42", "Male", "70000", "2012-03-15"],
    ["008", "102", "Kate Kim", "29", "Female", "51000", "2019-10-01"],
    ["009", "103", "Tom Tan", "33", "Male", "58000", "2016-06-01"],
    ["010", "104", "Lisa Lee", "27", "Female", "47000", "2018-08-01"],
    ["011", "104", "David Park", "38", "Male", "65000", "2015-11-01"],
    ["012", "105", "Susan Chen", "31", "Female", "54000", "2017-02-15"],
    ["013", "106", "Brian Kim", "45", "Male", "75000", "2011-07-01"],
    ["014", "107", "Emily Lee", "26", "Female", "46000", "2019-01-01"],
    ["015", "106", "Michael Lee", "37", "Male", "63000", "2014-09-30"],
    ["016", "107", "Kelly Zhang", "30", "Female", "49000", "2018-04-01"],
    ["017", "105", "George Wang", "34", "Male", "57000", "2016-03-15"],
    # Row 018 carries an empty string (not a null) in the gender field.
    ["018", "104", "Nancy Liu", "29", "", "50000", "2017-06-01"],
    ["019", "103", "Steven Chen", "36", "Male", "62000", "2015-08-01"],
    ["020", "102", "Grace Kim", "32", "Female", "53000", "2018-11-01"],
]

# DDL-style schema string: every employee column is declared as a string.
emp_schema = ", ".join(
    f"{column} string"
    for column in (
        "employee_id",
        "department_id",
        "name",
        "age",
        "gender",
        "salary",
        "hire_date",
    )
)

# Department rows, in the column order:
# department_id, department_name, city, country, budget.
dept_data = [
    ["101", "Sales", "NYC", "US", "1000000"],
    ["102", "Marketing", "LA", "US", "900000"],
    ["103", "Finance", "London", "UK", "1200000"],
    ["104", "Engineering", "Beijing", "China", "1500000"],
    ["105", "Human Resources", "Tokyo", "Japan", "800000"],
    ["106", "Research and Development", "Perth", "Australia", "1100000"],
    ["107", "Customer Service", "Sydney", "Australia", "950000"],
]

# DDL-style schema string: every department column is declared as a string.
dept_schema = ", ".join(
    f"{column} string"
    for column in (
        "department_id",
        "department_name",
        "city",
        "country",
        "budget",
    )
)

In [0]:
# Materialise the in-memory rows as Spark DataFrames using the DDL schemas.
emp_df = spark.createDataFrame(
    emp_data,
    schema=emp_schema,
)
dep_df = spark.createDataFrame(
    dept_data,
    schema=dept_schema,
)

In [0]:
# Preview both DataFrames; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
emp_df.show(n=20, truncate=True)
dep_df.show(n=20, truncate=True)

+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|
|        011|          104|   David Park| 38|  Male| 65000|2015-11-01|
|     

In [0]:
# Print the column names and types of both DataFrames, employees first.
for frame in (emp_df, dep_df):
    frame.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)

root
 |-- department_id: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: string (nullable = true)



In [0]:
# Number of partitions backing emp_df's underlying RDD (displayed as the cell result).
emp_df.rdd.getNumPartitions()


Out[6]: 8

In [0]:
# Number of partitions backing dep_df's underlying RDD (displayed as the cell result).
dep_df.rdd.getNumPartitions()

Out[7]: 8

# Reduce or Increase Data Partitions

In [0]:
# Redistribute emp_df across 4 partitions, then report the resulting count.
emp_repartitioned = emp_df.repartition(numPartitions=4)
emp_repartitioned.rdd.getNumPartitions()

Out[8]: 4

In [0]:
# Redistribute emp_df across 100 partitions, then report the resulting count.
emp_repartitioned = emp_df.repartition(numPartitions=100)
emp_repartitioned.rdd.getNumPartitions()


Out[9]: 100

# `coalesce` can only reduce the number of partitions: even if you ask for 100, as below, the partition count is not increased and stays at its current value. `repartition` can either increase or decrease the number of partitions. `repartition` shuffles the data and distributes it uniformly across partitions, whereas `coalesce` avoids a shuffle but does not guarantee uniformly sized partitions.

In [0]:
# Ask coalesce for 100 partitions and report how many the result actually has.
emp_repartitioned = emp_df.coalesce(numPartitions=100)
emp_repartitioned.rdd.getNumPartitions()


Out[10]: 8

In [0]:
# Repartition into 4 partitions keyed on department_id, then report the
# resulting partition count.
emp_repartitioned = emp_df.repartition(
    4,
    "department_id",
)
emp_repartitioned.rdd.getNumPartitions()



Out[12]: 4

In [0]:
# To see which partition each department landed in, tag every row with the id
# of the partition that holds it.
# show() only prints and returns None, so the DataFrame is assigned first and
# displayed in a separate statement; chaining .show() into the assignment
# would leave emp_repartitioned_1 set to None.
emp_repartitioned_1 = emp_df.repartition(4, "department_id").withColumn(
    "partition_id", spark_partition_id()
)
emp_repartitioned_1.show()

+-----------+-------------+-------------+---+------+------+----------+------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|partition_id|
+-----------+-------------+-------------+---+------+------+----------+------------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|           3|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|           3|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|           3|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|           0|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|           0|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|           0|
|        020|          102|    Grace Kim| 32|Female| 53000|2018-11-01|           0|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|           3|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|      

## Repartitioning has benefits and is one of the optimization techniques

# JOINS

In [0]:
# Inner join of employees to departments on department_id.
# show() returns None, so keep the joined DataFrame in emp_join and display it
# in a separate statement instead of chaining .show() into the assignment.
emp_join = emp_df.join(
    dep_df,
    emp_df["department_id"] == dep_df["department_id"],
    "inner",
)
emp_join.show()

+-----------+-------------+-------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|employee_id|department_id|         name|age|gender|salary| hire_date|department_id|     department_name|   city|  country| budget|
+-----------+-------------+-------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|          101|               Sales|    NYC|       US|1000000|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|          101|               Sales|    NYC|       US|1000000|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|          101|               Sales|    NYC|       US|1000000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|           Marketing|     LA|       US| 900000|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|      

## Conditional join

In [0]:
# Left outer join with an extra condition in the ON clause: only employees in
# departments 101 or 102 are matched to a department row. Because this is a
# left outer join, all other employees are still returned, with null
# department columns.
# department_id is a string column, so it is compared with string literals
# rather than relying on an implicit string-to-integer cast.
# show() returns None, so assign the DataFrame first and display it afterwards.
emp_final = emp_df.join(
    dep_df,
    (emp_df.department_id == dep_df.department_id)
    & emp_df.department_id.isin("101", "102"),
    "left_outer",
)
emp_final.show()

+-----------+-------------+-------------+---+------+------+----------+-------------+---------------+----+-------+-------+
|employee_id|department_id|         name|age|gender|salary| hire_date|department_id|department_name|city|country| budget|
+-----------+-------------+-------------+---+------+------+----------+-------------+---------------+----+-------+-------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|          101|          Sales| NYC|     US|1000000|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|          101|          Sales| NYC|     US|1000000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|      Marketing|  LA|     US| 900000|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|          102|      Marketing|  LA|     US| 900000|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         null|           null|null|   null|   null|
|        006|          1

In [0]:
# Same conditional left outer join, additionally requiring a null salary.
# Every employee has a salary, so the join condition never matches; the left
# outer join still returns every employee, with all department columns null.
# department_id is a string column, so it is compared with string literals
# rather than relying on an implicit string-to-integer cast.
# show() returns None, so assign the DataFrame first and display it afterwards.
emp_final = emp_df.join(
    dep_df,
    (emp_df.department_id == dep_df.department_id)
    & emp_df.department_id.isin("101", "102")
    & emp_df.salary.isNull(),
    "left_outer",
)
emp_final.show()

+-----------+-------------+-------------+---+------+------+----------+-------------+---------------+----+-------+------+
|employee_id|department_id|         name|age|gender|salary| hire_date|department_id|department_name|city|country|budget|
+-----------+-------------+-------------+---+------+------+----------+-------------+---------------+----+-------+------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         null|           null|null|   null|  null|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|         null|           null|null|   null|  null|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|         null|           null|null|   null|  null|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|         null|           null|null|   null|  null|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         null|           null|null|   null|  null|
|        006|          103|    J