In [0]:
# import SparkSession from spark SQL

from pyspark.sql import SparkSession

# Entry point for every DataFrame in this notebook. getOrCreate() hands back
# the already-running session when there is one instead of starting another.
spark = SparkSession.builder.appName('Basic Transformation - I').getOrCreate()

In [0]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [0]:
# emp data and schema

# Sample employee rows. Values are positional and follow the column order of
# emp_schema: employee_id, department_id, name, age, gender, salary, hire_date.
# Every value is a string here, including the numeric-looking and date-looking ones.
emp_data = [
    ['001', '101', 'Alice Johnson', '29', 'Female', '52000', '2016-03-15'],
    ['002', '102', 'Bob Smith', '34', 'Male', '48000', '2017-07-20'],
    ['003', '103', 'Charlie Brown', '26', 'Male', '47000', '2018-02-01'],
    ['004', '104', 'Diana Prince', '31', 'Female', '55000', '2015-11-12'],
    ['005', '105', 'Ethan Hunt', '28', 'Male', '49500', '2019-05-08'],
    ['006', '106', 'Fiona Gallagher', '27', 'Female', '51000', '2016-09-10'],
    ['007', '107', 'George Costanza', '33', 'Male', '46000', '2015-01-01'],
    ['008', '108', 'Hannah Montana', '25', 'Female', '53000', '2017-04-14'],
    ['009', '109', 'Isaac Newton', '35', 'Male', '60000', '2018-10-25'],
    ['010', '110', 'Jessica Jones', '32', 'Female', '58000', '2016-06-30'],
    ['011', '111', 'Kevin Malone', '36', 'Male', '49000', '2015-03-18'],
    ['012', '112', 'Laura Croft', '30', 'Female', '57000', '2017-08-22'],
    ['013', '113', 'Mike Ross', '29', 'Male', '51500', '2019-01-05'],
    ['014', '114', 'Nancy Drew', '27', 'Female', '54000', '2016-12-09'],
    # Row 015 has an empty-string gender: it matches neither 'Male' nor 'Female'.
    ['015', '115', 'Oscar Martinez', '28', '', '45500', '2018-03-19'],
    ['016', '116', 'Pam Beesly', '31', 'Female', '47500', '2019-07-13'],
    ['017', '117', 'Quinn Fabray', '26', 'Female', '51000', '2015-11-05'],
    ['018', '118', 'Ryan Howard', '34', 'Male', '49000', '2016-04-21'],
    ['019', '119', 'Sophia Loren', '30', 'Female', '56000', '2018-09-17'],
    ['020', '120', 'Toby Flenderson', '35', 'Male', '45000', '2017-02-08']
]

# DDL-style schema string: one "<column> <type>" pair per field, all typed as
# string. Adjacent literals are joined by Python into a single string.
emp_schema = (
    "employee_id string, department_id string, name string, "
    "age string, gender string, salary string, hire_date string"
)

In [0]:
# Build the base DataFrame from the in-memory rows and the DDL schema string.
emp = spark.createDataFrame(
    schema=emp_schema,
    data=emp_data,
)

In [0]:
# Print the column names and types; every column is a string at this stage.
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# Case when
# select employee_id, name, age, salary, gender,
# case when gender = 'Male' then 'M' when gender = 'Female' then 'F' else null end as new_gender, hire_date from emp

from pyspark.sql.functions import when, col, expr

# Column-API version: translate gender into a one-letter code. Values that are
# neither 'Male' nor 'Female' end up as null via otherwise(None).
emp_gender_fixed = emp.withColumn(
    'new_gender',
    when(col('gender') == 'Male', 'M')
    .when(col('gender') == 'Female', 'F')
    .otherwise(None),
)

# Same mapping written as a SQL CASE expression and handed to expr().
emp_gender_fixed_1 = emp.withColumn(
    'new_gender',
    expr(
        "case when gender = 'Male' then 'M' "
        "when gender = 'Female' then 'F' "
        "else null end"
    ),
)

In [0]:
# Print the when/otherwise result to check the new_gender column.
emp_gender_fixed.show()

+-----------+-------------+---------------+---+------+------+----------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|new_gender|
+-----------+-------------+---------------+---+------+------+----------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|         F|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|         M|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|         M|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|         F|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|         M|
|        006|          106|Fiona Gallagher| 27|Female| 51000|2016-09-10|         F|
|        007|          107|George Costanza| 33|  Male| 46000|2015-01-01|         M|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|         F|
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|    

In [0]:
# Print the expr()-based result; it is expected to match the when/otherwise table.
emp_gender_fixed_1.show()

+-----------+-------------+---------------+---+------+------+----------+----------+
|employee_id|department_id|           name|age|gender|salary| hire_date|new_gender|
+-----------+-------------+---------------+---+------+------+----------+----------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|         F|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|         M|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|         M|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|         F|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|         M|
|        006|          106|Fiona Gallagher| 27|Female| 51000|2016-09-10|         F|
|        007|          107|George Costanza| 33|  Male| 46000|2015-01-01|         M|
|        008|          108| Hannah Montana| 25|Female| 53000|2017-04-14|         F|
|        009|          109|   Isaac Newton| 35|  Male| 60000|2018-10-25|    

In [0]:
# Replace in Strings
# select employee_id, name, replace(name, 'J', 'Z') as new_name, age, salary, gender, new_gender, hire_date from emp_gender_fixed

from pyspark.sql.functions import regexp_replace

# Add new_name: a copy of name in which each 'J' is swapped for 'Z'
# (e.g. 'Jessica Jones' -> 'Zessica Zones'). The original name column is kept.
emp_name_fixed = emp_gender_fixed.withColumn(
    'new_name',
    regexp_replace(col('name'), 'J', 'Z'),
)

#### String functions:
##### [PySpark string functions reference](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#string-functions)

In [0]:
# Spot-check the replacement: keep only rows whose original name contains 'J'
# and whose rewritten name contains 'Z', then print them.
(
    emp_name_fixed
    .where(col('name').like("%J%") & col("new_name").like("%Z%"))
    .show()
)

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|     new_name|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|        001|          101|Alice Johnson| 29|Female| 52000|2016-03-15|         F|Alice Zohnson|
|        010|          110|Jessica Jones| 32|Female| 58000|2016-06-30|         F|Zessica Zones|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+



In [0]:
# Convert Date
# select *, to_date(hire_date, 'yyyy-MM-dd') as hire_date from emp_name_fixed # convert and overwrite

from pyspark.sql.functions import to_date

# Reusing the existing column name replaces hire_date in place: the string
# column is parsed with the yyyy-MM-dd pattern and stored as a date.
emp_date_fixed = emp_name_fixed.withColumn(
    "hire_date",
    to_date(col("hire_date"), "yyyy-MM-dd"),
)

In [0]:
# Confirm the conversion: hire_date should now be listed as date, not string.
emp_date_fixed.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- new_gender: string (nullable = true)
 |-- new_name: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_date, current_timestamp

# Stamp every row with the current date and the current timestamp.
# NOTE(review): the recorded outputs show a different timestamp_now for each
# show() call, which suggests the value is computed when an action runs
# rather than when this line executes.
emp_dated = (
    emp_date_fixed
    .withColumn("date_now", current_date())
    .withColumn("timestamp_now", current_timestamp())
)

In [0]:
# Print five rows; truncate=False keeps long values such as the timestamp fully visible.
emp_dated.limit(5).show(truncate=False)

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+-----------------------+
|employee_id|department_id|name         |age|gender|salary|hire_date |new_gender|new_name     |date_now  |timestamp_now          |
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+-----------------------+
|001        |101          |Alice Johnson|29 |Female|52000 |2016-03-15|F         |Alice Zohnson|2024-12-09|2024-12-09 08:00:17.509|
|002        |102          |Bob Smith    |34 |Male  |48000 |2017-07-20|M         |Bob Smith    |2024-12-09|2024-12-09 08:00:17.509|
|003        |103          |Charlie Brown|26 |Male  |47000 |2018-02-01|M         |Charlie Brown|2024-12-09|2024-12-09 08:00:17.509|
|004        |104          |Diana Prince |31 |Female|55000 |2015-11-12|F         |Diana Prince |2024-12-09|2024-12-09 08:00:17.509|
|005        |105          |Ethan Hunt   |28 |Male  |49500 |2019-05-08|M         |Et

In [0]:
# Print the DataFrame without shortening long cell values.
emp_dated.show(truncate=False)

+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+-----------------------+
|employee_id|department_id|name           |age|gender|salary|hire_date |new_gender|new_name       |date_now  |timestamp_now          |
+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+-----------------------+
|001        |101          |Alice Johnson  |29 |Female|52000 |2016-03-15|F         |Alice Zohnson  |2024-12-09|2024-12-09 08:00:18.633|
|002        |102          |Bob Smith      |34 |Male  |48000 |2017-07-20|M         |Bob Smith      |2024-12-09|2024-12-09 08:00:18.633|
|003        |103          |Charlie Brown  |26 |Male  |47000 |2018-02-01|M         |Charlie Brown  |2024-12-09|2024-12-09 08:00:18.633|
|004        |104          |Diana Prince   |31 |Female|55000 |2015-11-12|F         |Diana Prince   |2024-12-09|2024-12-09 08:00:18.633|
|005        |105          |Ethan Hunt     |28 |Male  |4

In [0]:
# Remove incomplete rows: how="any" drops a row as soon as one column is null.

emp_1 = emp_dated.dropna(how="any")

In [0]:
# Print the rows that remain after the null drop, without truncating values.
emp_1.show(truncate=False)

+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+-----------------------+
|employee_id|department_id|name           |age|gender|salary|hire_date |new_gender|new_name       |date_now  |timestamp_now          |
+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+-----------------------+
|001        |101          |Alice Johnson  |29 |Female|52000 |2016-03-15|F         |Alice Zohnson  |2024-12-09|2024-12-09 08:00:19.599|
|002        |102          |Bob Smith      |34 |Male  |48000 |2017-07-20|M         |Bob Smith      |2024-12-09|2024-12-09 08:00:19.599|
|003        |103          |Charlie Brown  |26 |Male  |47000 |2018-02-01|M         |Charlie Brown  |2024-12-09|2024-12-09 08:00:19.599|
|004        |104          |Diana Prince   |31 |Female|55000 |2015-11-12|F         |Diana Prince   |2024-12-09|2024-12-09 08:00:19.599|
|005        |105          |Ethan Hunt     |28 |Male  |4

In [0]:
# Fix null values
# select *, nvl(new_gender, 'NA') as new_gender from emp_dated

from pyspark.sql.functions import coalesce, lit

# Alternative to dropping rows: keep them and substitute the literal "NA"
# wherever new_gender is null. Starts again from emp_dated, not emp_1.
emp_null_df = emp_dated.withColumn(
    "new_gender",
    coalesce(col("new_gender"), lit("NA")),
)

In [0]:
# Print the result to check that new_gender no longer contains nulls.
emp_null_df.show()

+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+--------------------+
|employee_id|department_id|           name|age|gender|salary| hire_date|new_gender|       new_name|  date_now|       timestamp_now|
+-----------+-------------+---------------+---+------+------+----------+----------+---------------+----------+--------------------+
|        001|          101|  Alice Johnson| 29|Female| 52000|2016-03-15|         F|  Alice Zohnson|2024-12-09|2024-12-09 08:00:...|
|        002|          102|      Bob Smith| 34|  Male| 48000|2017-07-20|         M|      Bob Smith|2024-12-09|2024-12-09 08:00:...|
|        003|          103|  Charlie Brown| 26|  Male| 47000|2018-02-01|         M|  Charlie Brown|2024-12-09|2024-12-09 08:00:...|
|        004|          104|   Diana Prince| 31|Female| 55000|2015-11-12|         F|   Diana Prince|2024-12-09|2024-12-09 08:00:...|
|        005|          105|     Ethan Hunt| 28|  Male| 49500|2019-05-08|    

In [0]:
# Swap the cleaned columns in for the originals: remove the raw name/gender,
# then give new_name/new_gender the original column names.
emp_final = (
    emp_null_df
    .drop("name", "gender")
    .withColumnRenamed("new_name", "name")
    .withColumnRenamed("new_gender", "gender")
)

In [0]:
# Print the first five rows of the cleaned-up DataFrame.
emp_final.show(5)

+-----------+-------------+---+------+----------+------+-------------+----------+--------------------+
|employee_id|department_id|age|salary| hire_date|gender|         name|  date_now|       timestamp_now|
+-----------+-------------+---+------+----------+------+-------------+----------+--------------------+
|        001|          101| 29| 52000|2016-03-15|     F|Alice Zohnson|2024-12-09|2024-12-09 08:00:...|
|        002|          102| 34| 48000|2017-07-20|     M|    Bob Smith|2024-12-09|2024-12-09 08:00:...|
|        003|          103| 26| 47000|2018-02-01|     M|Charlie Brown|2024-12-09|2024-12-09 08:00:...|
|        004|          104| 31| 55000|2015-11-12|     F| Diana Prince|2024-12-09|2024-12-09 08:00:...|
|        005|          105| 28| 49500|2019-05-08|     M|   Ethan Hunt|2024-12-09|2024-12-09 08:00:...|
+-----------+-------------+---+------+----------+------+-------------+----------+--------------------+
only showing top 5 rows



In [0]:
# Convert date into string and extract date information

from pyspark.sql.functions import date_format

# Render hire_date as day/month/year text in a new date_string column.
emp_fixed = emp_final.withColumn(
    'date_string',
    date_format(col("hire_date"), 'dd/MM/yyyy'),
)

# Separately, extract only the four-digit year of hire_date into date_year.
emp_fixed_1 = emp_final.withColumn(
    'date_year',
    date_format(col("hire_date"), 'yyyy'),
)

In [0]:
# Print the first five rows to check the extracted date_year column.
emp_fixed_1.show(5)

+-----------+-------------+---+------+----------+------+-------------+----------+--------------------+---------+
|employee_id|department_id|age|salary| hire_date|gender|         name|  date_now|       timestamp_now|date_year|
+-----------+-------------+---+------+----------+------+-------------+----------+--------------------+---------+
|        001|          101| 29| 52000|2016-03-15|     F|Alice Zohnson|2024-12-09|2024-12-09 08:00:...|     2016|
|        002|          102| 34| 48000|2017-07-20|     M|    Bob Smith|2024-12-09|2024-12-09 08:00:...|     2017|
|        003|          103| 26| 47000|2018-02-01|     M|Charlie Brown|2024-12-09|2024-12-09 08:00:...|     2018|
|        004|          104| 31| 55000|2015-11-12|     F| Diana Prince|2024-12-09|2024-12-09 08:00:...|     2015|
|        005|          105| 28| 49500|2019-05-08|     M|   Ethan Hunt|2024-12-09|2024-12-09 08:00:...|     2019|
+-----------+-------------+---+------+----------+------+-------------+----------+---------------