# PySpark Transformations & Actions

---

#### Import necessary libraries

In [1]:
# NOTE: `sum` imported below is the Spark column aggregate and shadows
# Python's built-in `sum` for every later cell in this notebook.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count, row_number
from pyspark.sql.window import Window

#### Create SparkSession

In [2]:
# Obtain the session used by every cell below (an existing one is reused).
spark = (
    SparkSession.builder
    .appName("TransformationsExample")
    .getOrCreate()
)

#### Sample DataFrames

In [3]:
# Employee table: column names first, then one tuple per employee in the
# same field order.
columns1 = ["id", "name", "dept", "salary"]
data1 = [
    (1, "Alice", "HR", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Charlie", "HR", 3500),
    (4, "David", "IT", 4500),
    (5, "Eve", "Finance", 5000),
]

In [4]:
# Location table keyed by employee id. Ids 4 and 5 from the employee data
# are absent here, and id 6 has no matching employee.
columns2 = ["id", "city"]
data2 = [
    (1, "Chennai"),
    (2, "Bangalore"),
    (3, "Delhi"),
    (6, "Mumbai"),
]

In [5]:
# Employees DataFrame; `columns1` supplies the column names.
df1 = spark.createDataFrame(data1, schema=columns1)

In [6]:
# Locations DataFrame; `columns2` supplies the column names.
df2 = spark.createDataFrame(data2, schema=columns2)

1. Filter Transformation

In [7]:
print("Filter: Employees with salary > 4000")
# Strictly greater than: a salary of exactly 4000 is excluded.
df1.filter(df1["salary"] > 4000).show()

Filter: Employees with salary > 4000
+---+-----+-------+------+
| id| name|   dept|salary|
+---+-----+-------+------+
|  4|David|     IT|  4500|
|  5|  Eve|Finance|  5000|
+---+-----+-------+------+



2. Join Transformation

In [8]:
print("Join: Left join with employee location")
# A left join keeps every df1 row; employees whose id is missing from df2
# come back with a null city.
df_join = df1.join(df2, ["id"], "left")
df_join.show()

Join: Left join with employee location
+---+-------+-------+------+---------+
| id|   name|   dept|salary|     city|
+---+-------+-------+------+---------+
|  1|  Alice|     HR|  3000|  Chennai|
|  2|    Bob|     IT|  4000|Bangalore|
|  5|    Eve|Finance|  5000|     NULL|
|  3|Charlie|     HR|  3500|    Delhi|
|  4|  David|     IT|  4500|     NULL|
+---+-------+-------+------+---------+



3. GroupBy and Aggregation

In [9]:
print("GroupBy: Total and Average Salary by Department")
# `sum` and `avg` are the Spark aggregates imported at the top of the
# notebook, not Python built-ins.
(
    df1.groupBy("dept")
    .agg(
        sum(col("salary")).alias("total_salary"),
        avg(col("salary")).alias("avg_salary"),
    )
    .show()
)

GroupBy: Total and Average Salary by Department
+-------+------------+----------+
|   dept|total_salary|avg_salary|
+-------+------------+----------+
|     HR|        6500|    3250.0|
|     IT|        8500|    4250.0|
|Finance|        5000|    5000.0|
+-------+------------+----------+



4. Window Functions

In [10]:
print("Window Function: Row number by department ordered by salary descending")
# One window per department, highest salary first.
windowSpec = (
    Window
    .partitionBy(col("dept"))
    .orderBy(col("salary").desc())
)

# Keep every original column and append the per-department rank.
df_window = df1.select("*", row_number().over(windowSpec).alias("row_number"))
df_window.show()

Window Function: Row number by department ordered by salary descending
+---+-------+-------+------+----------+
| id|   name|   dept|salary|row_number|
+---+-------+-------+------+----------+
|  5|    Eve|Finance|  5000|         1|
|  3|Charlie|     HR|  3500|         1|
|  1|  Alice|     HR|  3000|         2|
|  4|  David|     IT|  4500|         1|
|  2|    Bob|     IT|  4000|         2|
+---+-------+-------+------+----------+



5. Select

In [12]:
print("Select: Selecting specific columns")
# Project only the two columns of interest, in this order.
df1.select(col("name"), col("salary")).show()

Select: Selecting specific columns
+-------+------+
|   name|salary|
+-------+------+
|  Alice|  3000|
|    Bob|  4000|
|Charlie|  3500|
|  David|  4500|
|    Eve|  5000|
+-------+------+



6. WithColumn

In [13]:
from pyspark.sql.functions import expr  # not referenced in this cell

print("WithColumn: Add bonus (10% of salary)")
# Append a derived column; df1 itself is not modified.
df1.withColumn("bonus", df1["salary"] * 0.10).show()

WithColumn: Add bonus (10% of salary)
+---+-------+-------+------+-----+
| id|   name|   dept|salary|bonus|
+---+-------+-------+------+-----+
|  1|  Alice|     HR|  3000|300.0|
|  2|    Bob|     IT|  4000|400.0|
|  3|Charlie|     HR|  3500|350.0|
|  4|  David|     IT|  4500|450.0|
|  5|    Eve|Finance|  5000|500.0|
+---+-------+-------+------+-----+



7. Drop

In [14]:
print("Drop: Removing a column")
# Returns a new DataFrame without `dept`; df1 keeps all of its columns.
df1.drop(col("dept")).show()

Drop: Removing a column
+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  1|  Alice|  3000|
|  2|    Bob|  4000|
|  3|Charlie|  3500|
|  4|  David|  4500|
|  5|    Eve|  5000|
+---+-------+------+



8. Distinct

In [15]:
print("Distinct: Unique departments")
# Project to the department column first so that distinct() de-duplicates
# on that column alone.
df1.select(col("dept")).distinct().show()

Distinct: Unique departments
+-------+
|   dept|
+-------+
|     HR|
|     IT|
|Finance|
+-------+



9. OrderBy / Sort

In [16]:
print("OrderBy: Employees sorted by salary descending")
# Highest salary first.
df1.orderBy("salary", ascending=False).show()

OrderBy: Employees sorted by salary descending
+---+-------+-------+------+
| id|   name|   dept|salary|
+---+-------+-------+------+
|  5|    Eve|Finance|  5000|
|  4|  David|     IT|  4500|
|  2|    Bob|     IT|  4000|
|  3|Charlie|     HR|  3500|
|  1|  Alice|     HR|  3000|
+---+-------+-------+------+



10. Limit

In [17]:
print("Limit: Top 3 salaries")
# Sort before limiting so that the three rows kept are the highest paid.
df1.orderBy(df1["salary"].desc()).limit(3).show()

Limit: Top 3 salaries
+---+-----+-------+------+
| id| name|   dept|salary|
+---+-----+-------+------+
|  5|  Eve|Finance|  5000|
|  4|David|     IT|  4500|
|  2|  Bob|     IT|  4000|
+---+-----+-------+------+



11. Union

In [18]:
print("Union: Combine two DataFrames (schema must match)")
# Both operands are the same two-column projection of df1, so their
# schemas line up. union() keeps duplicates: every row appears twice.
dfA = df1.select(col("id"), col("name"))
dfB = df1.select(col("id"), col("name"))
dfA.union(dfB).show()

Union: Combine two DataFrames (schema must match)
+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
|  5|    Eve|
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
|  5|    Eve|
+---+-------+



12. DropDuplicates

In [19]:
print("DropDuplicates: Remove duplicate rows")
# Stack df1 on top of itself so each employee appears twice, then collapse
# fully identical rows back to one.
df1_dup = df1.union(df1)
df1_dup.dropDuplicates(subset=None).show()

DropDuplicates: Remove duplicate rows
+---+-------+-------+------+
| id|   name|   dept|salary|
+---+-------+-------+------+
|  2|    Bob|     IT|  4000|
|  1|  Alice|     HR|  3000|
|  3|Charlie|     HR|  3500|
|  4|  David|     IT|  4500|
|  5|    Eve|Finance|  5000|
+---+-------+-------+------+



13. FillNA / DropNA

In [20]:
print("FillNA: Fill missing city values with 'Unknown'")
# Restrict the fill to `city`. An unqualified fillna("Unknown") would also
# replace nulls in every other string column (name, dept), which is not
# what the message above promises.
df_join.fillna("Unknown", subset=["city"]).show()

print("DropNA: Remove rows with null values")
# No subset given: a row is removed if any of its columns is null.
df_join.dropna().show()

FillNA: Fill missing city values with 'Unknown'
+---+-------+-------+------+---------+
| id|   name|   dept|salary|     city|
+---+-------+-------+------+---------+
|  1|  Alice|     HR|  3000|  Chennai|
|  2|    Bob|     IT|  4000|Bangalore|
|  5|    Eve|Finance|  5000|  Unknown|
|  3|Charlie|     HR|  3500|    Delhi|
|  4|  David|     IT|  4500|  Unknown|
+---+-------+-------+------+---------+

DropNA: Remove rows with null values
+---+-------+----+------+---------+
| id|   name|dept|salary|     city|
+---+-------+----+------+---------+
|  1|  Alice|  HR|  3000|  Chennai|
|  2|    Bob|  IT|  4000|Bangalore|
|  3|Charlie|  HR|  3500|    Delhi|
+---+-------+----+------+---------+



14. Explode (for nested data/arrays)

In [21]:
from pyspark.sql.functions import explode

# Three ids with a list of skills each; id 3 has an empty list.
array_df = spark.createDataFrame(
    [
        (1, ["Python", "Spark"]),
        (2, ["Java"]),
        (3, []),
    ],
    schema=["id", "skills"],
)

print("Explode: Flatten array column")
# explode() emits one row per array element, so id 3 (empty array) yields
# no output row at all. Use explode_outer() if such rows must be kept.
array_df.select("*", explode(col("skills")).alias("skill")).show()

Explode: Flatten array column
+---+---------------+------+
| id|         skills| skill|
+---+---------------+------+
|  1|[Python, Spark]|Python|
|  1|[Python, Spark]| Spark|
|  2|         [Java]|  Java|
+---+---------------+------+



### Actions

15. first()

In [22]:
print("First: Get the first row")
# first() returns a single Row object rather than a list.
first_row = df1.first()
print(first_row)

First: Get the first row
Row(id=1, name='Alice', dept='HR', salary=3000)


16. head(n)

In [23]:
print("Head: First 3 rows")
# head(n) returns the rows to the driver as a Python list of Row objects.
first_three_rows = df1.head(3)
print(first_three_rows)

Head: First 3 rows
[Row(id=1, name='Alice', dept='HR', salary=3000), Row(id=2, name='Bob', dept='IT', salary=4000), Row(id=3, name='Charlie', dept='HR', salary=3500)]


17. take(n)

In [24]:
print("Take: Take 2 rows as list")
# take(n) also returns a Python list of Row objects.
first_two_rows = df1.take(2)
print(first_two_rows)

Take: Take 2 rows as list
[Row(id=1, name='Alice', dept='HR', salary=3000), Row(id=2, name='Bob', dept='IT', salary=4000)]


18. count() and collect()

In [11]:
# count() returns the number of rows as a Python int.
employee_count = df1.count()
print("Count Action: Total number of employees =", employee_count)

# collect() brings every selected row to the driver; each Row holds only
# the `name` field, so position 0 is the name.
employee_names = [name_row[0] for name_row in df1.select("name").collect()]
print("Collect Action: Collect all employee names =", employee_names)

Count Action: Total number of employees = 5
Collect Action: Collect all employee names = ['Alice', 'Bob', 'Charlie', 'David', 'Eve']


19. write()

In [25]:
print("Write: Save as CSV (uncomment to run)")
# The write is left commented out so that running the notebook has no
# side effects on disk.
# NOTE(review): Spark's DataFrameWriter is documented to treat the path as
# an output directory of part files (not a single CSV file) and to fail by
# default if the path already exists -- confirm for the Spark version in use.
# df1.write.csv("output/employees.csv", header=True)

Write: Save as CSV (uncomment to run)
