In [31]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, sum as sum_, avg, explode, split, col

In [33]:
# Question 1: Implement a PySpark script that applies transformations like filter and withColumn on a Dataframe.

# Obtain the Spark session for this lab (getOrCreate returns the existing
# session when one is already running in this kernel).
spark = (
    SparkSession.builder
    .appName("Lab 02 Questions")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each tuple below.
columns = ["Name", "Age", "Gender","Salary"]

# Sample records: one tuple per person.
data = [
    ("John", 30, "Male", 1000),
    ("Jane", 25, "Female", 1500),
    ("Sam", 28, "Male", 1200),
    ("Anna", 22, "Female", 800),
    ("Mike", 35, "Male", 2000),
]

df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

# Transformation 1 (filter): keep only rows whose Age is strictly above 25.
filtered_df = df.where(col("Age") > 25)

# Transformation 2 (withColumn): append Age_Group, labelled "Senior" when
# Age is above 30 and "Adult" for every other remaining row.
transformed_df = filtered_df.withColumn(
    "Age_Group", when(col("Age") > 30, lit("Senior")).otherwise(lit("Adult"))
)

print("Transformed DataFrame:")
transformed_df.show()



Original DataFrame:
+----+---+------+------+
|Name|Age|Gender|Salary|
+----+---+------+------+
|John| 30|  Male|  1000|
|Jane| 25|Female|  1500|
| Sam| 28|  Male|  1200|
|Anna| 22|Female|   800|
|Mike| 35|  Male|  2000|
+----+---+------+------+

Transformed DataFrame:
+----+---+------+------+---------+
|Name|Age|Gender|Salary|Age_Group|
+----+---+------+------+---------+
|John| 30|  Male|  1000|    Adult|
| Sam| 28|  Male|  1200|    Adult|
|Mike| 35|  Male|  2000|   Senior|
+----+---+------+------+---------+



In [34]:
# Question 2: Write a PySpark script that performs actions like count and show on a DataFrame.

# Action 1 -- show(): print the rows of `df` as a text table.
print("Showing DataFrame:")
df.show()

# Action 2 -- count(): fetch the number of rows in `df` and report it.
row_count = df.count()
print(f"\nNumber of rows in DataFrame: {row_count}")

Showing DataFrame:
+----+---+------+------+
|Name|Age|Gender|Salary|
+----+---+------+------+
|John| 30|  Male|  1000|
|Jane| 25|Female|  1500|
| Sam| 28|  Male|  1200|
|Anna| 22|Female|   800|
|Mike| 35|  Male|  2000|
+----+---+------+------+


Number of rows in DataFrame: 5


In [35]:
# Question 3: Demonstrate how to perform basic aggregations (e.g., sum, average) on a PySpark DataFrame.

# Compute both aggregates in a single agg() call so Spark runs one job over
# the data instead of one job per statistic. agg() without a groupBy()
# produces exactly one row, so first() returns that row.
salary_stats = df.agg(
    sum_("Salary").alias("Total_Salary"),
    avg("Salary").alias("Average_Salary"),
).first()

# Sum of the 'Salary' column
total_salary = salary_stats["Total_Salary"]
print(f"Total Salary: {total_salary}")

# Average of the 'Salary' column
average_salary = salary_stats["Average_Salary"]
print(f"Average Salary: {average_salary}")


Total Salary: 6500
Average Salary: 1300.0


In [36]:
# Question 4: Show how to write a PySpark DataFrame to a CSV file.

# Default write: Spark treats the path as an output directory and emits one
# part file per partition of the DataFrame, so several files may appear.
# mode="overwrite" replaces any previous output at the same path.
output_path = "output_data.csv"
df.write.csv(output_path, header=True, mode="overwrite")

print(f"\nDataFrame written to {output_path}")

# Single-file write: coalesce(1) first collapses the data into one partition,
# so the output directory contains a single part file.
output_path = "output_data_single.csv"
df.coalesce(1).write.csv(output_path, header=True, mode="overwrite")

print(f"\nDataFrame written to {output_path}")


DataFrame written to output_data.csv

DataFrame written to output_data_single.csv


In [37]:
# Question 5: Implement wordcount program in PySpark.
input_file_path = "input_text.txt"  # Replace with the path to your input text file

# Read the text file: each line becomes one row in a single column named "value".
# NOTE: this rebinds `df`, which previously held the people DataFrame built
# for Question 1; re-run that cell before re-running Questions 2-4.
df = spark.read.text(input_file_path)

print("Original DataFrame:")
df.show(truncate=False)

# Perform WordCount:
#   1. split each line on runs of whitespace,
#   2. explode the resulting array into one row per token,
#   3. drop empty tokens -- split() yields "" for blank lines and for lines
#      with leading/trailing whitespace, which would otherwise be counted
#      as a word,
#   4. group by word and count the occurrences.
word_counts = (
    df.select(explode(split(col("value"), r"\s+")).alias("word"))
    .filter(col("word") != "")
    .groupBy("word")
    .count()
)

# Show the WordCount results
print("\nWord Count Results:")
word_counts.show(truncate=False)

# Shut down the Spark session now that the last question has run.
spark.stop()

Original DataFrame:
+------------------------+
|value                   |
+------------------------+
|Hello World             |
|Hello pyspark           |
|Welcome to pyspark World|
+------------------------+


Word Count Results:
+-------+-----+
|word   |count|
+-------+-----+
|World  |2    |
|Hello  |2    |
|pyspark|2    |
|Welcome|1    |
|to     |1    |
+-------+-----+

