In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("PySpark Transformation Example")
    .getOrCreate()
)

# Build a small in-memory DataFrame: one tuple per person, matching `columns`
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 30),
    (2, "Bob", 25),
    (3, "Catherine", 29),
    (4, "David", 35),
]
df = spark.createDataFrame(data, schema=columns)
print("Original DataFrame:")
df.show()

# Transformation 1: retain only the rows whose age is above 28
filtered_df = df.where(col("age") > 28)

# Transformation 2: append a boolean 'is_adult' column (age >= 18).
# Every row that survived the filter has age > 28, so the flag is true for all of them.
transformed_df = filtered_df.withColumn("is_adult", col("age") >= 18)

# Display the result of both transformations
print("Transformed DataFrame:")
transformed_df.show()



Original DataFrame:
+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 30|
|  2|      Bob| 25|
|  3|Catherine| 29|
|  4|    David| 35|
+---+---------+---+

Transformed DataFrame:
+---+---------+---+--------+
| id|     name|age|is_adult|
+---+---------+---+--------+
|  1|    Alice| 30|    true|
|  3|Catherine| 29|    true|
|  4|    David| 35|    true|
+---+---------+---+--------+



In [12]:
# Q2: display the DataFrame and report how many rows it contains.
# Relies on `df` having been created by an earlier cell.
print("DataFrame:")
df.show()
# Count the number of rows in the DataFrame (count() is an action, so it runs a Spark job)
row_count = df.count()
print(f"\nNumber of rows in the DataFrame: {row_count}")


DataFrame:
+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 30|
|  2|      Bob| 25|
|  3|Catherine| 29|
|  4|    David| 35|
+---+---------+---+


Number of rows in the DataFrame: 4


In [13]:
#Q3
from pyspark.sql.functions import sum as sum_col, avg
print("DataFrame:")
df.show()

# Basic aggregations over the 'age' column.
# Both aggregates are requested in a single agg() call so Spark evaluates them
# together in one job, instead of launching a separate job per metric.
age_stats = df.agg(
    sum_col("age").alias("total_age"),
    avg("age").alias("average_age"),
).first()  # a global aggregation (no groupBy) produces exactly one row

# Sum of the 'age' column
age_sum = age_stats["total_age"]
print(f"\nSum of ages: {age_sum}")
# Average of the 'age' column
age_avg = age_stats["average_age"]
print(f"Average age: {age_avg}")



DataFrame:
+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 30|
|  2|      Bob| 25|
|  3|Catherine| 29|
|  4|    David| 35|
+---+---------+---+


Sum of ages: 119
Average age: 29.75


In [15]:
# Q4: persist the DataFrame as CSV
print("DataFrame:")
df.show()

# Destination for the CSV output.
# NOTE(review): Spark's writer normally creates a directory of part files at this
# path rather than a single file named data.csv; confirm that is what is wanted.
csv_path = "path/to/your/directory/data.csv"

# Save with a header row, replacing anything that already exists at the destination
df.write.csv(csv_path, mode="overwrite", header=True)

print(f"\nDataFrame written to CSV file at: {csv_path}")


DataFrame:
+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1|    Alice| 30|
|  2|      Bob| 25|
|  3|Catherine| 29|
|  4|    David| 35|
+---+---------+---+


DataFrame written to CSV file at: path/to/your/directory/data.csv


In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Read the text file into a DataFrame (the text is exposed through the 'value' column)
# Note: this path is machine-specific; change it to point at your own text file
text_file_path = '/home/lplab/Desktop/220962019/lab2/week2.txt'
df = spark.read.text(text_file_path)

# Show the content of the DataFrame
print("Content of the text file:")
df.show(truncate=False)

# Split each line into words.
# The pattern given to split() is a regular expression, so r"\s+" treats any run of
# whitespace (repeated spaces, tabs) as a single separator. Blank lines and
# leading/trailing whitespace still produce empty tokens, so those are filtered out
# instead of being counted as a "word". Punctuation stays attached to tokens, as before.
words_df = (
    df.select(explode(split(col("value"), r"\s+")).alias("word"))
    .filter(col("word") != "")
)

# Show the words DataFrame
print("Words DataFrame:")
words_df.show(truncate=False)

# Count the occurrences of each word
word_count_df = words_df.groupBy("word").count()

# Show the word count DataFrame
print("Word Count:")
word_count_df.show(truncate=False)

# Write the results as CSV with a header row, replacing any previous output.
# NOTE(review): the destination ends in .txt but receives CSV output, and Spark
# normally creates a directory of part files at this path; confirm the intended name.
output_path = '/home/lplab/Desktop/220962019/lab2/week2modify.txt'
word_count_df.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(output_path)

print(f"\nWord count results written to CSV file at: {output_path}")

# Stop the Spark session
spark.stop()



Content of the text file:
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                            

+--------------------+-----+
|word                |count|
+--------------------+-----+
|positive            |1    |
|Russia              |1    |
|prevent             |1    |
|oversee             |1    |
|CONCACAF            |1    |
|International       |1    |
|(lit. 'International|1    |
|among               |2    |
|Qatar,              |1    |
|World               |5    |
|suspended           |1    |
|Madrid              |1    |
|Cups.               |1    |
|play.[5]            |1    |
|organizational      |1    |
|Federation          |1    |
|FIFA's              |1    |
|11                  |1    |
|(Oceania),          |1    |
|set                 |1    |
+--------------------+-----+
only showing top 20 rows


Word count results written to CSV file at: /home/lplab/Desktop/220962019/lab2/week2modify.txt
