In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

spark = SparkSession.builder.master("local[*]").appName("StringHandson").getOrCreate()


In [0]:
spark

In [0]:
emp_data = [
    ["000", "107", "Emily Lee", 26, "", 46000, "2019-01-01"],
    ["001", "101", "John Doe", 30, "Male", 50000, "2015-01-01"],
    ["002", "101", "Jane Smith", 25, "Female", 45000, "2016-02-15"],
    ["003", "102", "Bob Brown", 35, "Male", 55000, "2014-05-01"],
    ["004", "102", "Alice Lee", 28, "Female", 48000, "2017-09-30"],
    ["005", "103", "Jack Chan", 40, "Male", 60000, "2013-04-01"],
    ["006", "103", "Jill Wong", 32, "Female", 52000, "2018-07-01"],
    ["007", "101", "James Johnson", 42, "Male", 70000, "2012-03-15"],
    ["008", "102", "Kate Kim", 29, "Female", 51000, "2019-10-01"],
    ["009", "103", "Tom Tan", 33, "Male", 58000, "2016-06-01"],
    ["010", "104", "Lisa Lee", 27, "Female", 47000, "2018-08-01"],
    ["011", "104", "David Park", 38, "Male", 65000, "2015-11-01"],
    ["012", "105", "Susan Chen", 31, "Female", 54000, "2017-02-15"],
    ["013", "106", "Brian Kim", 45, "Male", 75000, "2011-07-01"],
    ["014", "107", "Emily Lee", 26, "Female", 46000, "2019-01-01"],
    ["015", "106", "Michael Lee", 37, "Male", 63000, "2014-09-30"],
    ["016", "107", "Kelly Zhang", 30, "Female", 49000, "2018-04-01"],
    ["017", "105", "George Wang", 34, "Male", 57000, "2016-03-15"],
    ["018", "104", "Nancy Liu", 29, "Female", 50000, "2017-06-01"],
    ["019", "103", "Steven Chen", 36, "Male", 62000, "2015-08-01"],
    ["020", "102", "Grace Kim", 32, "Female", 53000, "2018-11-01"],
]

emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"

In [0]:
emp = spark.createDataFrame(emp_data, schema = emp_schema)

In [0]:
emp.show()

+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|
|     

In [0]:
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



## You can also use SQL query if you are not interested in writing DF APIs,
 ex as below query

In [0]:
#Need to add a column based on gender if gender is male keep M & if gender is female keep F

emp_gender_df = emp.withColumn("new_gender", expr("CASE WHEN gender == 'Male' then 'M' when gender == 'Female' then 'F' else NULL END"))

In [0]:
emp_gender_df.show()

+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|
+-----------+-------------+-------------+---+------+------+----------+----------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|      null|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|         F|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|         M|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|         F|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         M|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|         F|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|         M|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|         F|
|        009|   

In [0]:
# replacing a letter using regexp_replace method

emp_replace = emp_gender_df.withColumn("replaced_name", regexp_replace("name", 'J', 'K')).show()

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|replaced_name|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|      null|    Emily Lee|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|     Kohn Doe|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|         F|   Kane Smith|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|         M|    Bob Brown|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|         F|    Alice Lee|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         M|    Kack Chan|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|         F|    Kill Wong|
|        007|          101|James Johnson

# DATE FUNCTIONS

In [0]:
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# converting the hire_date data type to date data type using to_date method

emp_date = emp_gender_df.withColumn("hire_date", to_date("hire_date", 'yyyy-mm-dd'))

In [0]:
emp_date.show()

+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|
+-----------+-------------+-------------+---+------+------+----------+----------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|      null|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-01-01|         F|
|        007|          101|James Johnson| 42|  Male| 70000|2012-01-15|         M|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-01-01|         F|
|        009|   

In [0]:
emp_date.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- new_gender: string (nullable = true)



# Current Date & Timestamp

In [0]:
emp_date_time = emp_date.withColumn("Date_now", current_date()).withColumn("Time_now", current_timestamp())

In [0]:
emp_date_time.show()
# by using truncate = false the o/p with will be shown correctly without compacting.
emp_date_time.show(truncate = False)

+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|  Date_now|            Time_now|
+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|      null|2025-04-04|2025-04-04 11:06:...|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|2025-04-04|2025-04-04 11:06:...|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|2025-04-04|2025-04-04 11:06:...|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|2025-04-04|2025-04-04 11:06:...|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|2025-04-04|2025-04-04 11:06:...|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|2025-04

# Drop Null records

In [0]:
emp_1 = emp_date_time.na.drop()

In [0]:
emp_1.show()

+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|  Date_now|            Time_now|
+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|2025-04-04|2025-04-04 10:54:...|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|2025-04-04|2025-04-04 10:54:...|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|2025-04-04|2025-04-04 10:54:...|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|2025-04-04|2025-04-04 10:54:...|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|2025-04-04|2025-04-04 10:54:...|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-01-01|         F|2025-04

## In production scenarios dropping a null records is a bad habit so we have to replace a null with the other values

In [0]:
emp_2 = emp_date_time.withColumn("new_gender", coalesce("new_gender", lit('O')))

In [0]:
emp_2.show()

+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|  Date_now|            Time_now|
+-----------+-------------+-------------+---+------+------+----------+----------+----------+--------------------+
|        000|          107|    Emily Lee| 26|      | 46000|2019-01-01|         O|2025-04-04|2025-04-04 11:00:...|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|2025-04-04|2025-04-04 11:00:...|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|2025-04-04|2025-04-04 11:00:...|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|2025-04-04|2025-04-04 11:00:...|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|2025-04-04|2025-04-04 11:00:...|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|2025-04

In [0]:
#dropping old columns and renaming the columns

emp_final = emp_2.drop("gender").withColumnRenamed("new_gender","gender")

In [0]:
emp_final.show()

+-----------+-------------+-------------+---+------+----------+------+----------+--------------------+
|employee_id|department_id|         name|age|salary| hire_date|gender|  Date_now|            Time_now|
+-----------+-------------+-------------+---+------+----------+------+----------+--------------------+
|        000|          107|    Emily Lee| 26| 46000|2019-01-01|     O|2025-04-04|2025-04-04 11:06:...|
|        001|          101|     John Doe| 30| 50000|2015-01-01|     M|2025-04-04|2025-04-04 11:06:...|
|        002|          101|   Jane Smith| 25| 45000|2016-01-15|     F|2025-04-04|2025-04-04 11:06:...|
|        003|          102|    Bob Brown| 35| 55000|2014-01-01|     M|2025-04-04|2025-04-04 11:06:...|
|        004|          102|    Alice Lee| 28| 48000|2017-01-30|     F|2025-04-04|2025-04-04 11:06:...|
|        005|          103|    Jack Chan| 40| 60000|2013-01-01|     M|2025-04-04|2025-04-04 11:06:...|
|        006|          103|    Jill Wong| 32| 52000|2018-01-01|     F|202

In [0]:
emp_final.write.format("csv").save("dbfs:/FileStore/tables/Strings")