### Repartitions

In [0]:
# import SparkSession from spark SQL

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build the notebook's SparkSession via the builder's getOrCreate().
spark = SparkSession.builder.appName('Basic Transformation - I').getOrCreate()

In [0]:
# Evaluate the session object so the notebook displays it.
spark

In [0]:
# emp data and schema
# Each row is a list of strings in the column order declared by emp_schema:
# employee_id, department_id, name, age, gender, salary, hire_date.

emp_data = [
    ['001', '101', 'Alice Johnson', '29', 'Female', '52000', '2016-03-15'],
    ['002', '102', 'Bob Smith', '34', 'Male', '48000', '2017-07-20'],
    ['003', '103', 'Charlie Brown', '26', 'Male', '47000', '2018-02-01'],
    ['004', '104', 'Diana Prince', '31', 'Female', '55000', '2015-11-12'],
    ['005', '101', 'Ethan Hunt', '28', 'Male', '49500', '2019-05-08'],
    ['006', '102', 'Fiona Gallagher', '27', 'Female', '51000', '2016-09-10'],
    ['007', '103', 'George Costanza', '33', 'Male', '46000', '2015-01-01'],
    ['008', '107', 'Hannah Montana', '25', 'Female', '53000', '2017-04-14'],
    ['009', '107', 'Isaac Newton', '35', 'Male', '60000', '2018-10-25'],
    ['010', '105', 'Jessica Jones', '32', 'Female', '58000', '2016-06-30'],
    # NOTE(review): exact duplicate of the previous row (employee 010) —
    # presumably deliberate sample data; confirm before de-duplicating.
    ['010', '105', 'Jessica Jones', '32', 'Female', '58000', '2016-06-30'],
    ['011', '106', 'Kevin Malone', '36', 'Male', '49000', '2015-03-18'],
    ['012', '107', 'Laura Croft', '30', 'Female', '57000', '2017-08-22'],
    ['013', '101', 'Mike Ross', '29', 'Male', '51500', '2019-01-05'],
    ['014', '102', 'Nancy Drew', '27', 'Female', '54000', '2016-12-09'],
    # Gender is an empty string for this employee (not None).
    ['015', '103', 'Oscar Martinez', '28', '', '45500', '2018-03-19'],
    ['016', '104', 'Pam Beesly', '31', 'Female', '47500', '2019-07-13'],
    ['017', '105', 'Quinn Fabray', '26', 'Female', '51000', '2015-11-05'],
    ['018', '106', 'Ryan Howard', '34', 'Male', '49000', '2016-04-21'],
    ['019', '107', 'Sophia Loren', '30', 'Female', '56000', '2018-09-17'],
    ['020', '104', 'Toby Flenderson', '35', 'Male', '45000', '2017-02-08'],
    # NOTE(review): exact duplicate of the previous row (employee 020) —
    # presumably deliberate sample data; confirm before de-duplicating.
    ['020', '104', 'Toby Flenderson', '35', 'Male', '45000', '2017-02-08']
]

# DDL-style schema string for the employee rows: every column is a string.
emp_schema = ", ".join(
    f"{column} string"
    for column in (
        "employee_id",
        "department_id",
        "name",
        "age",
        "gender",
        "salary",
        "hire_date",
    )
)

# Department reference rows. Each row is a list of strings in the column
# order declared by dept_schema:
# department_id, department_name, city, country, budget.
dept_data = [
    ['101','Sales','NYC','US','1000000'],
    ['102','Marketing','LA','US','900000'],
    ['103','Finance','London','UK','1200000'],
    ['104','Engineering','Beijing','China','1500000'],
    # Country was misspelled as 'Jappan'; corrected so grouping or filtering
    # by country does not produce a bogus value.
    ['105','Human Resources','Tokyo','Japan','800000'],
    ['106','Research and Development','Perth','Australia','1100000'],
    ['107','Customer Service','Sydney','Australia','950000'],
]

# DDL-style schema string for the department rows: every column is a string.
dept_schema = ", ".join(
    f"{column} string"
    for column in (
        "department_id",
        "department_name",
        "city",
        "country",
        "budget",
    )
)

In [0]:
# Turn the in-memory row lists into DataFrames using their DDL schema strings.
emp = spark.createDataFrame(emp_data, schema=emp_schema)
dept = spark.createDataFrame(dept_data, schema=dept_schema)

In [0]:
# Print the employee schema; every column was declared as string in emp_schema.
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# Print the department schema; every column was declared as string in dept_schema.
dept.printSchema()

root
 |-- department_id: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: string (nullable = true)



In [0]:
# Preview both DataFrames (employees first, then departments).
emp.show()
dept.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|
|        005|          101|     Ethan Hunt| 28|  Male| 49500|2019-05-08|
|        006|          102|Fiona Gallagher| 27|Female| 51000|2016-09-10|
|        007|          103|George Costanza| 33|  Male| 46000|2015-01-01|
|        008|          107| Hannah Montana| 25|Female| 53000|2017-04-14|
|        009|          107|   Isaac Newton| 35|  Male| 60000|2018-10-25|
|        010|          105|  Jessica Jones| 32|Female| 58000|2016-06-30|
|        010|          105|  Jessica Jones| 32|Fema

In [0]:
# Number of partitions backing emp, read from its underlying RDD.
emp.rdd.getNumPartitions()

8

In [0]:
# Number of partitions backing dept, read from its underlying RDD.
dept.rdd.getNumPartitions()

8

In [0]:
# Repartition emp into an explicit number of partitions with repartition()
# (coalesce() is shown separately in a later cell).
emp_partitioned = emp.repartition(4) # target partition count; may be higher or lower than the current count

In [0]:
# Check the partition count of the repartitioned frame.
emp_partitioned.rdd.getNumPartitions()

4

In [0]:
# Rebinds emp_partitioned: this is built from the original emp, not from the
# repartition(4) result assigned to the same name earlier.
emp_partitioned = emp.coalesce(2) # coalesce can only decrease the number of partitions

In [0]:
# Check the partition count of the coalesced frame.
emp_partitioned.rdd.getNumPartitions()

2

In [0]:
# Tag every employee row with the id of the partition it lands in after
# repartitioning into 4 partitions keyed on department_id.
from pyspark.sql.functions import spark_partition_id

emp_1 = (
    emp
    .repartition(4, 'department_id')
    .withColumn('partition_num', spark_partition_id())
)

In [0]:
# Show the rows together with their partition_num column.
emp_1.show()

+-----------+-------------+---------------+---+------+------+----------+-------------+
|employee_id|department_id|           name|age|gender|salary| hire_date|partition_num|
+-----------+-------------+---------------+---+------+------+----------+-------------+
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|            0|
|        006|          102|Fiona Gallagher| 27|Female| 51000|2016-09-10|            0|
|        008|          107| Hannah Montana| 25|Female| 53000|2017-04-14|            0|
|        009|          107|   Isaac Newton| 35|  Male| 60000|2018-10-25|            0|
|        012|          107|    Laura Croft| 30|Female| 57000|2017-08-22|            0|
|        014|          102|     Nancy Drew| 27|Female| 54000|2016-12-09|            0|
|        019|          107|   Sophia Loren| 30|Female| 56000|2018-09-17|            0|
|        010|          105|  Jessica Jones| 32|Female| 58000|2016-06-30|            1|
|        010|          105|  Jessica Jones|

### JOIN

In [0]:
# INNER JOIN datasets
# SQL equivalent:
#   select e.name, d.department_id, d.department_name, e.salary
#   from emp e inner join dept d on e.department_id = d.department_id
#
# The join predicate is written against the "e"/"d" aliases (rather than the
# original emp/dept objects) so it is consistent with the alias-qualified
# column names used when selecting from the joined frame.
df_joined = emp.alias("e").join(
    dept.alias("d"),
    on=col("e.department_id") == col("d.department_id"),
    how='inner',
)

In [0]:
# Project the report columns through the "e" (emp) and "d" (dept) aliases and
# print them without truncating long values.
(
    df_joined
    .select("e.name", "d.department_id", "d.department_name", "e.salary")
    .show(truncate=False)
)

+---------------+-------------+------------------------+------+
|name           |department_id|department_name         |salary|
+---------------+-------------+------------------------+------+
|Alice Johnson  |101          |Sales                   |52000 |
|Ethan Hunt     |101          |Sales                   |49500 |
|Mike Ross      |101          |Sales                   |51500 |
|Bob Smith      |102          |Marketing               |48000 |
|Fiona Gallagher|102          |Marketing               |51000 |
|Nancy Drew     |102          |Marketing               |54000 |
|Charlie Brown  |103          |Finance                 |47000 |
|George Costanza|103          |Finance                 |46000 |
|Oscar Martinez |103          |Finance                 |45500 |
|Diana Prince   |104          |Engineering             |55000 |
|Pam Beesly     |104          |Engineering             |47500 |
|Toby Flenderson|104          |Engineering             |45000 |
|Toby Flenderson|104          |Engineeri

In [0]:
# LEFT OUTER JOIN
# SQL equivalent:
#   select e.name, d.department_id, d.department_name, e.salary
#   from emp e left outer join dept d on e.department_id = d.department_id
#
# Rebinds df_joined (previously the inner-join result). The predicate is
# written against the "e"/"d" aliases so it is consistent with the
# alias-qualified column names used when selecting from the joined frame.
df_joined = emp.alias("e").join(
    dept.alias("d"),
    on=col("e.department_id") == col("d.department_id"),
    how='left_outer',
)

In [0]:
# Show the full joined frame; it carries department_id from both sides.
df_joined.show()

+-----------+-------------+---------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|employee_id|department_id|           name|age|gender|salary| hire_date|department_id|     department_name|   city|  country| budget|
+-----------+-------------+---------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|          101|               Sales|    NYC|       US|1000000|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|          102|           Marketing|     LA|       US| 900000|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|          103|             Finance| London|       UK|1200000|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|          104|         Engineering|Beijing|    China|1500000|
|        005|          101|     Ethan Hunt| 28|  Male| 49500|2

In [0]:
# Project the report columns through the "e" (emp) and "d" (dept) aliases and
# print them without truncating long values.
(
    df_joined
    .select("e.name", "d.department_id", "d.department_name", "e.salary")
    .show(truncate=False)
)

+---------------+-------------+------------------------+------+
|name           |department_id|department_name         |salary|
+---------------+-------------+------------------------+------+
|Alice Johnson  |101          |Sales                   |52000 |
|Bob Smith      |102          |Marketing               |48000 |
|Charlie Brown  |103          |Finance                 |47000 |
|Diana Prince   |104          |Engineering             |55000 |
|Ethan Hunt     |101          |Sales                   |49500 |
|Fiona Gallagher|102          |Marketing               |51000 |
|George Costanza|103          |Finance                 |46000 |
|Hannah Montana |107          |Customer Service        |53000 |
|Isaac Newton   |107          |Customer Service        |60000 |
|Jessica Jones  |105          |Human Resources         |58000 |
|Jessica Jones  |105          |Human Resources         |58000 |
|Kevin Malone   |106          |Research and Development|49000 |
|Laura Croft    |107          |Customer 

In [0]:
# Join with cascading (compound) conditions. A dept row is attached only when:
#   * the department_id values match,
#   * the employee's department is 101 or 102, and
#   * the employee's salary is not null.
#
# Because this is a left outer join, the extra predicates in the ON clause do
# not remove employee rows; employees that fail them are kept with NULLs in
# the dept columns.
#
# Variant: swap isNotNull() for isNull() to attach dept rows only to employees
# with a missing salary.
df_final = emp.join(
    dept,
    how='left_outer',
    on=(
        (emp.department_id == dept.department_id)
        & emp.department_id.isin('101', '102')
        & emp.salary.isNotNull()
    ),
)

In [0]:
# Show the conditional left-join result without truncating long values.
df_final.show(truncate=False)

+-----------+-------------+---------------+---+------+------+----------+-------------+---------------+----+-------+-------+
|employee_id|department_id|name           |age|gender|salary|hire_date |department_id|department_name|city|country|budget |
+-----------+-------------+---------------+---+------+------+----------+-------------+---------------+----+-------+-------+
|001        |101          |Alice Johnson  |29 |Female|52000 |2016-03-15|101          |Sales          |NYC |US     |1000000|
|002        |102          |Bob Smith      |34 |Male  |48000 |2017-07-20|102          |Marketing      |LA  |US     |900000 |
|003        |103          |Charlie Brown  |26 |Male  |47000 |2018-02-01|NULL         |NULL           |NULL|NULL   |NULL   |
|004        |104          |Diana Prince   |31 |Female|55000 |2015-11-12|NULL         |NULL           |NULL|NULL   |NULL   |
|005        |101          |Ethan Hunt     |28 |Male  |49500 |2019-05-08|101          |Sales          |NYC |US     |1000000|
|006    