In [0]:
# import SparkSession from spark SQL

from pyspark.sql import SparkSession

# Entry point for the DataFrame API used throughout this notebook.
# getOrCreate() hands back an already-running session when there is one.
spark = SparkSession.builder.appName('Basic Transformation - I').getOrCreate()

In [0]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [0]:
# Employee sample data and schema.
# Each row follows the column order declared in emp_schema below:
#   employee_id, department_id, name, age, gender, salary, hire_date
# Every value is a string, including the numeric-looking fields (age, salary)
# and the date-looking field (hire_date).

# First batch of employees (ids 001-010).
emp_data_1 = [
    ['001', '101', 'Alice Johnson', '29', 'Female', '52000', '2016-03-15'],
    ['002', '102', 'Bob Smith', '34', 'Male', '48000', '2017-07-20'],
    ['003', '103', 'Charlie Brown', '26', 'Male', '47000', '2018-02-01'],
    ['004', '104', 'Diana Prince', '31', 'Female', '55000', '2015-11-12'],
    ['005', '105', 'Ethan Hunt', '28', 'Male', '49500', '2019-05-08'],
    ['006', '106', 'Fiona Gallagher', '27', 'Female', '51000', '2016-09-10'],
    ['007', '107', 'George Costanza', '33', 'Male', '46000', '2015-01-01'],
    ['008', '108', 'Hannah Montana', '25', 'Female', '53000', '2017-04-14'],
    ['009', '109', 'Isaac Newton', '35', 'Male', '60000', '2018-10-25'],
    ['010', '110', 'Jessica Jones', '32', 'Female', '58000', '2016-06-30'],
]

# Second batch of employees (ids 011-020), same column layout as emp_data_1.
emp_data_2 = [
    ['011', '111', 'Kevin Malone', '36', 'Male', '49000', '2015-03-18'],
    ['012', '112', 'Laura Croft', '30', 'Female', '57000', '2017-08-22'],
    ['013', '113', 'Mike Ross', '29', 'Male', '51500', '2019-01-05'],
    ['014', '114', 'Nancy Drew', '27', 'Female', '54000', '2016-12-09'],
    ['015', '115', 'Oscar Martinez', '28', '', '45500', '2018-03-19'],  # gender is an empty string for this row
    ['016', '116', 'Pam Beesly', '31', 'Female', '47500', '2019-07-13'],
    ['017', '117', 'Quinn Fabray', '26', 'Female', '51000', '2015-11-05'],
    ['018', '118', 'Ryan Howard', '34', 'Male', '49000', '2016-04-21'],
    ['019', '119', 'Sophia Loren', '30', 'Female', '56000', '2018-09-17'],
    ['020', '120', 'Toby Flenderson', '35', 'Male', '45000', '2017-02-08']
]

# Schema string shared by both batches; all seven columns are declared as string.
emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"

In [0]:
# Build one DataFrame per raw batch; both use emp_schema, so their columns line up.
emp_1, emp_2 = (
    spark.createDataFrame(data=rows, schema=emp_schema)
    for rows in (emp_data_1, emp_data_2)
)

In [0]:
# Print emp_1 as a text table to eyeball the first batch of rows.
emp_1.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|
|        006|          106|Fiona Gallagher| 27|Female| 51000|2016-09-10|
|        007|          107|George Costanza| 33|  Male| 46000|2015-01-01|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|
|        010|          110|  Jessica Jones| 32|Female| 58000|2016-06-30|
+-----------+-------------+---------------+---+----

In [0]:
# Print column names and types; every column is a string, as declared in emp_schema.
emp_1.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# Print emp_2 as a text table; note the blank gender for employee 015.
emp_2.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        011|          111|   Kevin Malone| 36|  Male| 49000|2015-03-18|
|        012|          112|    Laura Croft| 30|Female| 57000|2017-08-22|
|        013|          113|      Mike Ross| 29|  Male| 51500|2019-01-05|
|        014|          114|     Nancy Drew| 27|Female| 54000|2016-12-09|
|        015|          115| Oscar Martinez| 28|      | 45500|2018-03-19|
|        016|          116|     Pam Beesly| 31|Female| 47500|2019-07-13|
|        017|          117|   Quinn Fabray| 26|Female| 51000|2015-11-05|
|        018|          118|    Ryan Howard| 34|  Male| 49000|2016-04-21|
|        019|          119|   Sophia Loren| 30|Female| 56000|2018-09-17|
|        020|          120|Toby Flenderson| 35|  Male| 45000|2017-02-08|
+-----------+-------------+---------------+---+----

In [0]:
# Confirm emp_2 has the same all-string schema as emp_1 before combining them.
emp_2.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# Union and Union All
# select * from emp_1 union all select * from emp_2
# NOTE: DataFrame.union() has SQL UNION ALL semantics: duplicate rows are KEPT,
# and columns are matched by position, not by name.
# For SQL UNION (de-duplicated) semantics, chain .distinct() after the union.

emp = emp_1.union(emp_2) # keeps duplicates (UNION ALL); add .distinct() to drop them
# emp = emp_1.unionAll(emp_2) ## same operation as union(); also keeps duplicates

In [0]:
# Display the combined DataFrame: emp_1's rows followed by emp_2's rows.
emp.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|
|        006|          106|Fiona Gallagher| 27|Female| 51000|2016-09-10|
|        007|          107|George Costanza| 33|  Male| 46000|2015-01-01|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|
|        010|          110|  Jessica Jones| 32|Female| 58000|2016-06-30|
|        011|          111|   Kevin Malone| 36|  Ma

In [0]:
# Sort the emp data by salary, highest first
# select * from emp order by cast(salary as double) desc

from pyspark.sql.functions import desc, asc, col

# salary is declared as a string column, so ordering on the raw column compares
# text character by character (e.g. '100000' would sort before '60000').
# Cast to a numeric type so the ordering is numeric regardless of digit count.
emp_sorted = emp.orderBy(col('salary').cast('double').desc())
# emp_sorted = emp.orderBy(col('salary').cast('double').asc()) # ascending

In [0]:
# Display the sorted DataFrame; the highest salary appears in the first row.
emp_sorted.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|
|        010|          110|  Jessica Jones| 32|Female| 58000|2016-06-30|
|        012|          112|    Laura Croft| 30|Female| 57000|2017-08-22|
|        019|          119|   Sophia Loren| 30|Female| 56000|2018-09-17|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|
|        014|          114|     Nancy Drew| 27|Female| 54000|2016-12-09|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|
|        013|          113|      Mike Ross| 29|  Male| 51500|2019-01-05|
|        017|          117|   Quinn Fabray| 26|Female| 51000|2015-11-05|
|        006|          106|Fiona Gallagher| 27|Fema

In [0]:
# Aggregation: head-count per department
# select department_id as dept_id, count(employee_id) as total_dept_count from emp group by department_id
from pyspark.sql.functions import count

emp_count = (
    emp
    .withColumnRenamed("department_id", "dept_id")  # expose the grouping key as dept_id
    .groupBy('dept_id')
    .agg(count('employee_id').alias('total_dept_count'))
)

In [0]:
# Display only the first 5 per-department counts.
emp_count.show(5)

+-------+----------------+
|dept_id|total_dept_count|
+-------+----------------+
|    101|               1|
|    102|               1|
|    103|               1|
|    104|               1|
|    105|               1|
+-------+----------------+
only showing top 5 rows



In [0]:
# Aggregation: total salary per department
# select department_id as dept_id, sum(cast(salary as double)) as total_dept_salary from emp group by department_id
# NOTE: importing `sum` from pyspark.sql.functions shadows Python's built-in sum()
# for the rest of the notebook; the name is kept here so existing cells keep working.
from pyspark.sql.functions import col, sum

# salary is a string column. Cast it explicitly so the total is computed on
# numbers by design instead of relying on implicit string-to-double coercion.
# The result type stays double, matching the previous output.
emp_sum = (
    emp
    .withColumnRenamed("department_id", "dept_id")  # expose the grouping key as dept_id
    .groupBy('dept_id')
    .agg(sum(col('salary').cast('double')).alias('total_dept_salary'))
)

In [0]:
# Display only the first 5 per-department salary totals.
emp_sum.show(5)

+-------+-----------------+
|dept_id|total_dept_salary|
+-------+-----------------+
|    101|          52000.0|
|    102|          48000.0|
|    103|          47000.0|
|    104|          55000.0|
|    105|          49500.0|
+-------+-----------------+
only showing top 5 rows



In [0]:
# Aggregation with HAVING clause
# select dept_id, avg(cast(salary as double)) as avg_dept_salary from emp_sorted group by dept_id having avg_dept_salary > 50000
from pyspark.sql.functions import avg, col

# salary is a string column. Cast it explicitly so the average is computed on
# numbers by design instead of relying on implicit string-to-double coercion.
# where() runs after agg(), so it filters the aggregated rows (the HAVING part).
emp_avg = (
    emp_sorted
    .withColumnRenamed("department_id", "dept_id")  # expose the grouping key as dept_id
    .groupBy("dept_id")
    .agg(avg(col('salary').cast('double')).alias('avg_dept_salary'))
    .where('avg_dept_salary > 50000')
)

In [0]:
# Display only the first 5 departments whose average salary exceeds 50000.
emp_avg.show(5)

+-------+---------------+
|dept_id|avg_dept_salary|
+-------+---------------+
|    101|        52000.0|
|    112|        57000.0|
|    113|        51500.0|
|    110|        58000.0|
|    104|        55000.0|
+-------+---------------+
only showing top 5 rows



In [0]:
# UNION by name
# Set-up: a copy of emp_2 whose columns are in a different sequence than emp_1's

emp_2_other = emp_2.select(
    'employee_id',
    'salary',
    'department_id',
    'name',
    'hire_date',
    'gender',
    'age',
)

# Compare the two schemas: same column names, different positions
emp_1.printSchema()
emp_2_other.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)

root
 |-- employee_id: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- hire_date: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)



In [0]:
# Combine the two DataFrames by column name rather than by position, so the
# reordered columns of emp_2_other line up with the matching columns of emp_1.
emp_fixed = emp_1.unionByName(emp_2_other)

In [0]:
# Display the name-aligned union; the columns follow emp_1's order.
emp_fixed.show()

+-----------+-------------+---------------+---+------+------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|
+-----------+-------------+---------------+---+------+------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|
|        006|          106|Fiona Gallagher| 27|Female| 51000|2016-09-10|
|        007|          107|George Costanza| 33|  Male| 46000|2015-01-01|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|
|        010|          110|  Jessica Jones| 32|Female| 58000|2016-06-30|
|        011|          111|   Kevin Malone| 36|  Ma