**1. Apply Transformations & Actions in PySpark**

**Transformations**

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by the RDD examples in this section
spark = (
    SparkSession.builder
    .appName("RDD Transformations in PySpark")
    .getOrCreate()
)

# Two small (key, value) pair RDDs that the transformation and action cells operate on
rdd1 = spark.sparkContext.parallelize(
    [("a", 1), ("b", 2), ("a", 3), ("c", 1)]
)
rdd2 = spark.sparkContext.parallelize(
    [("a", 5), ("b", 3), ("d", 7)]
)


1. map

In [2]:
# map: one output pair per input pair -- the key is kept, the value is scaled by 10
mapped_rdd = rdd1.map(lambda pair: (pair[0], pair[1] * 10))
print("Mapped RDD:", mapped_rdd.collect())


Mapped RDD: [('a', 10), ('b', 20), ('a', 30), ('c', 10)]


 2. flatMap

In [4]:
# flatMap: emit one (character, value) pair for every character of the key.
# The keys in rdd1 are single characters, so each record yields exactly one pair
# and the result looks the same as the input.
flat_mapped_rdd = rdd1.flatMap(
    lambda pair: [(ch, pair[1]) for ch in pair[0]]
)
print("FlatMapped RDD:", flat_mapped_rdd.collect())


FlatMapped RDD: [('a', 1), ('b', 2), ('a', 3), ('c', 1)]


3. reduceByKey

In [6]:
# reduceByKey: add together all values that share the same key.
# The order of keys in the collected result is not guaranteed.
reduced_rdd = rdd1.reduceByKey(lambda left, right: left + right)
print("Reduced RDD:", reduced_rdd.collect())


Reduced RDD: [('b', 2), ('c', 1), ('a', 4)]


 4. union

In [7]:
# union: append the records of rdd2 after those of rdd1 (no de-duplication is done)
union_rdd = rdd1.union(rdd2)
print("Union RDD:", union_rdd.collect())


Union RDD: [('a', 1), ('b', 2), ('a', 3), ('c', 1), ('a', 5), ('b', 3), ('d', 7)]


5. distinct

In [8]:
# distinct: drop records that appear more than once.
# All seven (key, value) pairs in union_rdd are already unique, so none are removed
# here; only the order of the collected result differs.
distinct_rdd = union_rdd.distinct()
print("Distinct RDD:", distinct_rdd.collect())


Distinct RDD: [('a', 3), ('b', 3), ('a', 1), ('b', 2), ('a', 5), ('c', 1), ('d', 7)]


**Actions**

 1. collect()

In [9]:
# collect(): bring every element of the RDD back to the driver as a Python list
collected_data = rdd1.collect()
print("Collected Data:", collected_data)


Collected Data: [('a', 1), ('b', 2), ('a', 3), ('c', 1)]


2. count()

In [10]:
# count(): how many records the RDD holds
count_elements = rdd1.count()
print("Total Elements:", count_elements)


Total Elements: 4


3. first()

In [11]:
# first(): the first record of the RDD
first_element = rdd1.first()
print("First Element:", first_element)


First Element: ('a', 1)


 4. max()

In [14]:
# max(): largest element. The key function ranks pairs by their value (second item);
# without it the pairs would be compared as tuples, i.e. by key first.
max_element = rdd1.max(key=lambda pair: pair[1])
print("Max Element:", max_element)

Max Element: ('a', 3)


5. reduce()

In [13]:
# reduce(): fold the whole RDD into a single result.
# The keys are dropped first so that only the numeric values are summed.
rdd_values = rdd1.values()  # second item of every (key, value) pair
reduced_value = rdd_values.reduce(lambda acc, value: acc + value)
print("Reduced Sum of Values:", reduced_value)


Reduced Sum of Values: 7


6. take(n)

In [15]:
# take(n): the first n records in RDD order -- this is not a sorted "top n"
top_3 = rdd1.take(3)
print("Top 3 Elements:", top_3)


Top 3 Elements: [('a', 1), ('b', 2), ('a', 3)]


 7. saveAsTextFile(path)

In [16]:
import shutil

# Create RDD
rdd = spark.sparkContext.parallelize([("x", 100), ("y", 200), ("z", 300)])

# saveAsTextFile() refuses to write into a directory that already exists, so running
# this cell a second time would fail. Remove any output left by a previous run first.
# NOTE: shutil only handles the local filesystem; if the path resolves to HDFS/S3,
# the old output has to be removed through that filesystem instead.
output_path = "output_rdd"
shutil.rmtree(output_path, ignore_errors=True)

# Save RDD as text: Spark creates a directory of part files, one line per record
rdd.saveAsTextFile(output_path)
print(f"RDD saved successfully to '{output_path}' directory.")



RDD saved successfully to 'output_rdd' directory.


**2. DataFrame Transformations such as Filter, Join, Simple Aggregations, GroupBy, Window Functions, etc.**

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg, row_number
from pyspark.sql.window import Window

# Get the Spark session for the DataFrame examples
# (getOrCreate() hands back the already-running session when there is one)
spark = (
    SparkSession.builder
    .appName("DataFrame Transformations")
    .getOrCreate()
)

# Employee records as (id, name, dept, salary) tuples
data = [
    (1, "Arjun", "HR", 3000),
    (2, "Ramya", "IT", 4000),
    (3, "Chandhini", "HR", 3500),
    (4, "Devi", "Finance", 3800),
    (5, "Ramesh", "IT", 4200),
    (6, "Franklin", "Finance", 3900),
]

# Column names, listed in the same order as the tuple fields above
columns = ["id", "name", "dept", "salary"]

# Build the DataFrame and display it
df = spark.createDataFrame(data, columns)
df.show()


+---+---------+-------+------+
| id|     name|   dept|salary|
+---+---------+-------+------+
|  1|    Arjun|     HR|  3000|
|  2|    Ramya|     IT|  4000|
|  3|Chandhini|     HR|  3500|
|  4|     Devi|Finance|  3800|
|  5|   Ramesh|     IT|  4200|
|  6| Franklin|Finance|  3900|
+---+---------+-------+------+



1. Filter

In [19]:
# Filter: keep only the employees earning strictly more than 4000
filtered_df = df.filter(df["salary"] > 4000)
filtered_df.show()


+---+------+----+------+
| id|  name|dept|salary|
+---+------+----+------+
|  5|Ramesh|  IT|  4200|
+---+------+----+------+



 2. Join

In [21]:
# Lookup table that maps each department to its office location
dept_columns = ["dept", "location"]
dept_data = [
    ("HR", "Pallavaram"),
    ("IT", "Tambaram"),
    ("Finance", "Chrompet"),
]
dept_df = spark.createDataFrame(dept_data, dept_columns)

# Inner join on the shared "dept" column; joining by column name
# leaves a single "dept" column in the result
joined_df = df.join(dept_df, "dept", "inner")
joined_df.show()


+-------+---+---------+------+----------+
|   dept| id|     name|salary|  location|
+-------+---+---------+------+----------+
|Finance|  4|     Devi|  3800|  Chrompet|
|Finance|  6| Franklin|  3900|  Chrompet|
|     HR|  1|    Arjun|  3000|Pallavaram|
|     HR|  3|Chandhini|  3500|Pallavaram|
|     IT|  2|    Ramya|  4000|  Tambaram|
|     IT|  5|   Ramesh|  4200|  Tambaram|
+-------+---+---------+------+----------+



 3. Simple Aggregations

In [22]:
# Whole-table aggregation (no grouping): the result is a single row
# holding the salary total and the salary mean
agg_df = df.agg(
    _sum(col("salary")).alias("Total_Salary"),
    avg(col("salary")).alias("Average_Salary"),
)
agg_df.show()


+------------+------------------+
|Total_Salary|    Average_Salary|
+------------+------------------+
|       22400|3733.3333333333335|
+------------+------------------+



 4. GroupBy

In [23]:
# One output row per department, carrying that department's mean salary
avg_salary = avg("salary").alias("Avg_Salary")
grouped_df = df.groupBy("dept").agg(avg_salary)
grouped_df.show()


+-------+----------+
|   dept|Avg_Salary|
+-------+----------+
|     HR|    3250.0|
|     IT|    4100.0|
|Finance|    3850.0|
+-------+----------+



5. Window Functions

In [24]:
# Window: rows are grouped per department and ordered from highest to lowest salary
window_spec = (
    Window
    .partitionBy("dept")
    .orderBy(col("salary").desc())
)

# row_number() numbers the rows inside each department, 1 being the highest salary
df_with_rank = df.withColumn(
    "row_number",
    row_number().over(window_spec),
)
df_with_rank.show()


+---+---------+-------+------+----------+
| id|     name|   dept|salary|row_number|
+---+---------+-------+------+----------+
|  6| Franklin|Finance|  3900|         1|
|  4|     Devi|Finance|  3800|         2|
|  3|Chandhini|     HR|  3500|         1|
|  1|    Arjun|     HR|  3000|         2|
|  5|   Ramesh|     IT|  4200|         1|
|  2|    Ramya|     IT|  4000|         2|
+---+---------+-------+------+----------+

