In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-active session when there is one instead of starting another
spark = (
    SparkSession.builder
    .appName("Sample Transformations")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each record below
columns = ["ID", "Name", "Age", "Department"]

# Sample records: one (ID, Name, Age, Department) tuple per person
data = [
    (1, "Alice", 34, "HR"),
    (2, "Bob", 45, "Finance"),
    (3, "Cathy", 29, "HR"),
    (4, "David", 54, "Finance"),
    (5, "Eva", 23, "IT"),
]

# Build the base DataFrame from the sample records; the list of names in
# `columns` is passed as the schema so Spark labels the tuple fields with them
df = spark.createDataFrame(data, schema=columns)

# 1. Select specific columns: project the frame down to Name and Age only
df_select = df.select(col("Name"), col("Age"))
df_select.show()



+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
|David| 54|
|  Eva| 23|
+-----+---+



In [2]:

# 2. Filter rows: keep only the rows whose Age is strictly greater than 30
#    (where() is an alias of filter())
df_filter = df.where(col("Age") > 30)
df_filter.show()


+---+-----+---+----------+
| ID| Name|Age|Department|
+---+-----+---+----------+
|  1|Alice| 34|        HR|
|  2|  Bob| 45|   Finance|
|  4|David| 54|   Finance|
+---+-----+---+----------+



In [3]:

# 3. Add new column: AgeGroup is "Senior" when Age > 30, otherwise "Junior"
df_with_column = df.withColumn(
    "AgeGroup",
    when(col("Age") > 30, "Senior").otherwise("Junior"),
)
df_with_column.show()





+---+-----+---+----------+--------+
| ID| Name|Age|Department|AgeGroup|
+---+-----+---+----------+--------+
|  1|Alice| 34|        HR|  Senior|
|  2|  Bob| 45|   Finance|  Senior|
|  3|Cathy| 29|        HR|  Junior|
|  4|David| 54|   Finance|  Senior|
|  5|  Eva| 23|        IT|  Junior|
+---+-----+---+----------+--------+



In [4]:
# 4. Drop a column: remove Department from the frame that carries AgeGroup;
#    df_with_column itself is left unchanged because drop() returns a new frame
df_drop = (
    df_with_column
    .drop("Department")
)
df_drop.show()


+---+-----+---+--------+
| ID| Name|Age|AgeGroup|
+---+-----+---+--------+
|  1|Alice| 34|  Senior|
|  2|  Bob| 45|  Senior|
|  3|Cathy| 29|  Junior|
|  4|David| 54|  Senior|
|  5|  Eva| 23|  Junior|
+---+-----+---+--------+



In [5]:
# 5. Distinct rows per Department: duplicates are judged on the Department
#    column only, so one full row survives for each department.
#    NOTE(review): no ordering is applied first, so which row is kept for a
#    department should not be relied upon — confirm if a specific row is needed.
df_distinct = df.drop_duplicates(["Department"])
df_distinct.show()



+---+-----+---+----------+
| ID| Name|Age|Department|
+---+-----+---+----------+
|  2|  Bob| 45|   Finance|
|  1|Alice| 34|        HR|
|  5|  Eva| 23|        IT|
+---+-----+---+----------+



In [6]:
# 6. Order by column: sort rows by Age using the default (ascending) direction
#    (sort() and orderBy() are aliases)
df_order = df.sort(col("Age"))
df_order.show()



+---+-----+---+----------+
| ID| Name|Age|Department|
+---+-----+---+----------+
|  5|  Eva| 23|        IT|
|  3|Cathy| 29|        HR|
|  1|Alice| 34|        HR|
|  2|  Bob| 45|   Finance|
|  4|David| 54|   Finance|
+---+-----+---+----------+



In [7]:
# 7. Group by and aggregate: one row per Department, with the number of ID
#    values in that department exposed as the "Count" column
df_group = (
    df.groupby("Department")
    .agg(count("ID").alias("Count"))
)
df_group.show()



+----------+-----+
|Department|Count|
+----------+-----+
|        HR|    2|
|   Finance|    2|
|        IT|    1|
+----------+-----+



In [8]:
# 8. Join DataFrames: inner-join the per-person rows (with AgeGroup) to the
#    per-department counts on the shared Department column
df_join = df_with_column.join(df_group, on="Department", how="inner")
df_join.show()



+----------+---+-----+---+--------+-----+
|Department| ID| Name|Age|AgeGroup|Count|
+----------+---+-----+---+--------+-----+
|        HR|  1|Alice| 34|  Senior|    2|
|   Finance|  2|  Bob| 45|  Senior|    2|
|        HR|  3|Cathy| 29|  Junior|    2|
|   Finance|  4|David| 54|  Senior|    2|
|        IT|  5|  Eva| 23|  Junior|    1|
+----------+---+-----+---+--------+-----+



In [9]:
# 9. Limit rows: keep at most three rows of the base frame
#    (no explicit ordering is applied before the limit)
df_limit = df.limit(num=3)
df_limit.show()

+---+-----+---+----------+
| ID| Name|Age|Department|
+---+-----+---+----------+
|  1|Alice| 34|        HR|
|  2|  Bob| 45|   Finance|
|  3|Cathy| 29|        HR|
+---+-----+---+----------+

