**A structured set of notes, with code, covering changing data types, filtering data, and handling unique/distinct values in PySpark using the employee data:**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col
from google.colab import drive

# Mount Google Drive with a longer timeout (300000 ms); force_remount=True
# re-mounts even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# Path of the employee CSV on the mounted drive. Despite the "df_" prefix this
# is a plain path string, not a DataFrame.
df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"

# Explicit schema for the CSV so column types are fixed up front; every field
# is declared nullable.
# NOTE: the identifier's spelling is left unchanged in case other cells use it.
employeeSechema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Salary", FloatType(), True),
    StructField("Joining_Date", DateType(), True),
    StructField("Department", StringType(), True),
    StructField("Performance_Rating", IntegerType(), True),
    StructField("Email", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Phone", StringType(), True),
])

# Load the DataFrame with the defined schema, reusing the path variable above
# so the file location is declared in exactly one place.
# spark.read.csv(path=..., header=True, schema=...) is an alternative spelling.
df = spark.read.load(
    path=df_employee_data,
    format="csv",
    header=True,
    schema=employeeSechema,
)
df.printSchema()
df.show(50)

Mounted at /content/drive
root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: float (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Performance_Rating: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Phone: string (nullable = true)

+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+------------+
| ID|         Name| Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|
+---+-------------+----+-------+------------+-----------+------------------+--------------------+------------------+------------+
|  1|Alice Johnson|  29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|
|  2|    Bob Smith|  35|85000.0|  2020-08-20|  Marketing

### Create a New Spark Session

In [4]:
# Configure the application name on the builder, then obtain the session.
# getOrCreate() hands back an already-running session when one exists.
session_builder = SparkSession.builder.appName('StrongAndStringFunction')
spark = session_builder.getOrCreate()

In [6]:
# Sample data: one (Country, Region, UnitsSold, UnitPrice) tuple per row.
data = [
    ("USA", "North America", 100, 50.5),
    ("India", "Asia", 300, 20.0),
    ("Germany", "Europe", 200, 30.5),
    ("Australia", "Oceania", 150, 60.0),
    ("Japan", "Asia", 120, 45.0),
    ("Brazil", "South America", 180, 25.0),
]

# Column names, in the same order as the tuple fields above.
columns = ["Country", "Region", "UnitsSold", "UnitPrice"]

# Build the DataFrame from the rows and column names.
df = spark.createDataFrame(data, columns)

# Display the original, unsorted DataFrame.
df.show()

+---------+-------------+---------+---------+
|  Country|       Region|UnitsSold|UnitPrice|
+---------+-------------+---------+---------+
|      USA|North America|      100|     50.5|
|    India|         Asia|      300|     20.0|
|  Germany|       Europe|      200|     30.5|
|Australia|      Oceania|      150|     60.0|
|    Japan|         Asia|      120|     45.0|
|   Brazil|South America|      180|     25.0|
+---------+-------------+---------+---------+



## 1. Sort by a single column (ascending order):

In [8]:
# orderBy() with a bare column name sorts ascending.
country_sorted = df.orderBy('Country')
country_sorted.show()

+---------+-------------+---------+---------+
|  Country|       Region|UnitsSold|UnitPrice|
+---------+-------------+---------+---------+
|Australia|      Oceania|      150|     60.0|
|   Brazil|South America|      180|     25.0|
|  Germany|       Europe|      200|     30.5|
|    India|         Asia|      300|     20.0|
|    Japan|         Asia|      120|     45.0|
|      USA|North America|      100|     50.5|
+---------+-------------+---------+---------+



## 2. Sort by multiple columns:

In [9]:
# Rows are ordered by Country first; UnitsSold only breaks ties between rows
# with the same Country (none occur in this sample, where each Country is unique).
sort_keys = ["Country", "UnitsSold"]
df.orderBy(*sort_keys).show()

+---------+-------------+---------+---------+
|  Country|       Region|UnitsSold|UnitPrice|
+---------+-------------+---------+---------+
|Australia|      Oceania|      150|     60.0|
|   Brazil|South America|      180|     25.0|
|  Germany|       Europe|      200|     30.5|
|    India|         Asia|      300|     20.0|
|    Japan|         Asia|      120|     45.0|
|      USA|North America|      100|     50.5|
+---------+-------------+---------+---------+



## 3. Sort by a column in descending order and limit:

In [10]:
# show() only prints and returns None, so keep the sorted DataFrame in
# sorted_df and call show() on it separately; otherwise sorted_df would be None.
# The column is referenced with the schema's exact casing ("Country").
sorted_df = df.orderBy(desc('Country'))

# Print only the first 3 rows of the descending sort.
# (Use sorted_df.limit(3) if a 3-row DataFrame is needed rather than a printout.)
sorted_df.show(3)

+-------+-------------+---------+---------+
|Country|       Region|UnitsSold|UnitPrice|
+-------+-------------+---------+---------+
|    USA|North America|      100|     50.5|
|  Japan|         Asia|      120|     45.0|
|  India|         Asia|      300|     20.0|
+-------+-------------+---------+---------+
only showing top 3 rows



## 4. Sorting with null values last:

In [11]:
# orderBy() documents only the `ascending` keyword, so the null placement is
# requested on the column expression itself via desc_nulls_last().
# show() returns None, so the sorted DataFrame is stored first and displayed
# in a separate step.
sorted_df = df.orderBy(col('Country').desc_nulls_last())
sorted_df.show()
# Note: This ensures that null values (if present) are placed at the end when sorting by Country.

+---------+-------------+---------+---------+
|  Country|       Region|UnitsSold|UnitPrice|
+---------+-------------+---------+---------+
|      USA|North America|      100|     50.5|
|    Japan|         Asia|      120|     45.0|
|    India|         Asia|      300|     20.0|
|  Germany|       Europe|      200|     30.5|
|   Brazil|South America|      180|     25.0|
|Australia|      Oceania|      150|     60.0|
+---------+-------------+---------+---------+

