**A structured set of notes, with code, covering how to change data types, filter data, and handle unique/distinct values in PySpark, using the employee data:**

In [34]:
# Spark entry point, schema types, and column functions used throughout the notebook.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# NOTE(review): this wildcard pulls in every public name from pyspark.sql.functions;
# several of them (e.g. sum, min, max, round) share names with Python builtins and
# will shadow them for the rest of the notebook.
from pyspark.sql.functions import *  # all column functions (lower, trim, ...)
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Explicit imports of the two helpers the cells below rely on most.
from pyspark.sql.functions import regexp_replace, col
# Colab-only helper for mounting Google Drive into the runtime's filesystem.
from google.colab import drive

# Mount Google Drive so the CSV stored there is readable from the Colab runtime.
# force_remount=True re-mounts even if the drive is already mounted; the
# timeout is set to 300000 ms (5 minutes).
drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# Location of the employee CSV on the mounted drive.
# Despite the "df_" prefix this is a path string, not a DataFrame; the name is
# kept unchanged so any other cell that references it keeps working.
df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"

# Explicit schema for the CSV: one StructField per column, all nullable.
# (The variable name's spelling is kept as-is for compatibility with other cells.)
employeeSechema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Salary", FloatType(), True),
    StructField("Joining_Date", DateType(), True),
    StructField("Department", StringType(), True),
    StructField("Performance_Rating", IntegerType(), True),
    StructField("Email", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Phone", StringType(), True),
])

# Load the CSV (first line is the header) using the schema defined above.
# The path comes from df_employee_data so it is defined in exactly one place.
df = spark.read.load(
    path=df_employee_data,
    format="csv",
    header=True,
    schema=employeeSechema,
)
df.printSchema()
df.show(50)

Mounted at /content/drive
root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: float (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Performance_Rating: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Phone: string (nullable = true)

+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+------------+
| ID|         Name| Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|
+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+------------+
|  1|Alice Johnson|  29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|
|  2|    Bob Smith|  35|85000.0|  2020-08-20|  Marketing

**Changing Data Types (Schema Transformation)**

In [35]:
# Build newDf from df with two columns re-typed:
#   * Salary: float -> double.
#   * Phone:  string -> long, after stripping every non-digit character.
#     Removing only "-" would leave spaces, parentheses, dots or "+" in place,
#     and such values do not cast cleanly to long.
# NOTE: storing a phone number as a long drops any leading zeros.
newDf = (
    df
    .withColumn("Salary", col("Salary").cast("double"))
    .withColumn("Phone", regexp_replace(col("Phone"), "[^0-9]", "").cast("long"))
)

newDf.printSchema()
newDf.show()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: double (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Performance_Rating: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Phone: long (nullable = true)

+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+----------+
| ID|         Name| Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|     Phone|
+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+----------+
|  1|Alice Johnson|  29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|1234567890|
|  2|    Bob Smith|  35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@examp

### Filtering Data

In [48]:
# Rows where Salary is strictly greater than 80,000.
filter_df = newDf.filter(col("Salary") > 80000)
filter_df.show()

# Rows where Age is present (not null).
# Stored under its own name so it is not overwritten by the null filter below.
age_not_null_df = newDf.filter(col("Age").isNotNull())
age_not_null_df.show()

# Rows where Age is missing (null).
# filter_df_2 is kept as the name for this result so that any later cell
# referencing it sees the same rows as before.
filter_df_2 = newDf.filter(col("Age").isNull())
filter_df_2.show()

+---+-----------+---+-------+------------+----------+------------------+--------------------+------------------+----------+
| ID|       Name|Age| Salary|Joining_Date|Department|Performance_Rating|               Email|           Address|     Phone|
+---+-----------+---+-------+------------+----------+------------------+--------------------+------------------+----------+
|  2|  Bob Smith| 35|85000.0|  2020-08-20| Marketing|                 5|bob.smith@example...|456 Oak Avenue, LA|9876543210|
|  3|Cathy Brown| 42|95000.0|  2019-11-10|   Finance|                 3|cathy.brown@examp...| 789 Pine Road, TX|5678901234|
+---+-----------+---+-------+------------+----------+------------------+--------------------+------------------+----------+

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+----------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|     Phone|
+

### 3. Multiple Filters (Chaining Conditions)

In [40]:
# Keep only employees older than 25 who work in the 'Engineering' department.
# Each comparison is wrapped in parentheses because `&` binds more tightly
# than `>` and `==` in Python.
multiFilter_df = newDf.where(
    (col("Age") > 25)
    & (col("Department") == "Engineering")
)
multiFilter_df.show()

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+----------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|     Phone|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+----------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|1234567890|
|  5|    Eva Green| 28|72000.0|  2023-06-22|Engineering|                 5|eva.green@example...|321 Birch Blvd, FL|2345678901|
|  8|      Charlie| 27|21000.0|  2024-11-07|Engineering|                 8|     charlie@xyz.com|              Null|3456789014|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+----------+



### 4. Filtering on Null or Non-Null Values

In [51]:
# Show the full frame first, for comparison with the filtered result below.
newDf.show()

# Rows whose Address is missing.
# Besides true nulls, the data also carries the literal text "Null" as a
# placeholder (see ID 8 in the displayed rows). isNull() does not match that
# text, so it is compared separately, ignoring case and surrounding spaces.
filter_null_df = newDf.filter(
    col("Address").isNull()
    | (lower(trim(col("Address"))) == "null")
)
filter_null_df.show()




+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+----------+
| ID|         Name| Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|     Phone|
+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+----------+
|  1|Alice Johnson|  29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|1234567890|
|  2|    Bob Smith|  35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@example...|456 Oak Avenue, LA|9876543210|
|  3|  Cathy Brown|  42|95000.0|  2019-11-10|    Finance|                 3|cathy.brown@examp...| 789 Pine Road, TX|5678901234|
|  4|  David White|  30|67000.0|  2022-01-05|         HR|                 4|david.white@examp...|234 Maple Lane, IL|3456789012|
|  5|    Eva Green|  28|72000.0|  2023-06-22|Engineering|                 5|eva.green@example...|321 Bir