In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session backing this example
spark = (
    SparkSession.builder
    .appName("TransformationsExample")
    .getOrCreate()
)

# Toy dataset: one (name, age) tuple per person, plus the column labels
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", 35),
]
columns = ["Name", "Age"]

# Build the DataFrame; column types are inferred from the tuples
df = spark.createDataFrame(data, schema=columns)

# Show the input before any transformation is applied
print("Original DataFrame:")
df.show()

# Transformations are lazy: nothing below executes until the final show()

# Step 1: keep only the people strictly older than 25
filtered_df = df.where(col("Age") > 25)

# Step 2: derive "AgePlusTwo" as Age + 2 for the surviving rows
transformed_df = filtered_df.withColumn(
    "AgePlusTwo",
    col("Age") + 2,
)

# Show the filtered frame with the derived column
print("\nTransformed DataFrame:")
transformed_df.show()

# Release the session's resources now that the example is done
spark.stop()




Original DataFrame:
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
|  David| 35|
+-------+---+


Transformed DataFrame:
+-----+---+----------+
| Name|Age|AgePlusTwo|
+-----+---+----------+
|  Bob| 30|        32|
|David| 35|        37|
+-----+---+----------+



In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session backing this example
spark = (
    SparkSession.builder
    .appName("ActionsExample")
    .getOrCreate()
)

# Toy dataset: one (name, age) tuple per person, plus the column labels
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", 35),
]
columns = ["Name", "Age"]

# Build the DataFrame; column types are inferred from the tuples
df = spark.createDataFrame(data, schema=columns)

# Show the full input first
print("Original DataFrame:")
df.show()

# Actions force evaluation and hand a result back to the driver

# Action 1: count() returns the number of rows as a Python int
row_count = df.count()
print("\nNumber of rows in the DataFrame: {}".format(row_count))

# Action 2: show(n) prints only the first n rows (here n = 2)
print("\nFirst 2 rows of the DataFrame:")
df.show(n=2)

# Release the session's resources now that the example is done
spark.stop()




Original DataFrame:
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
|  David| 35|
+-------+---+


Number of rows in the DataFrame: 4

First 2 rows of the DataFrame:
+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+
only showing top 2 rows



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum

# Create (or reuse) the Spark session for this example
spark = SparkSession.builder.appName("AggregationsExample").getOrCreate()

# Sample data: (name, age) tuples and the matching column labels
data = [("Alice", 25), ("Bob", 30), ("Charlie", 22), ("David", 35)]
columns = ["Name", "Age"]

# Create a DataFrame
df = spark.createDataFrame(data, columns)

# Display the original DataFrame
print("Original DataFrame:")
df.show()

# Perform basic aggregations: sum and average.
#
# Both aggregates are requested in ONE agg() call so Spark evaluates them in a
# single job, instead of launching a separate job per statistic.
# NOTE: `sum` and `avg` are the column aggregates imported from
# pyspark.sql.functions by this cell; that import shadows Python's builtin sum.
age_stats = df.agg(
    sum("Age").alias("TotalAge"),
    avg("Age").alias("AverageAge"),
).first()  # a global (ungrouped) aggregation yields exactly one row

# 1. Sum of ages
total_age = age_stats["TotalAge"]
print("\nTotal sum of ages: {}".format(total_age))

# 2. Average age
average_age = age_stats["AverageAge"]
print("Average age: {:.2f}".format(average_age))

# Stop the Spark session
spark.stop()




Original DataFrame:
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
|  David| 35|
+-------+---+


Total sum of ages: 112
Average age: 28.00


In [4]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session backing this example
spark = (
    SparkSession.builder
    .appName("WriteToCSVExample")
    .getOrCreate()
)

# Toy dataset: one (name, age) tuple per person, plus the column labels
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", 35),
]
columns = ["Name", "Age"]

# Build the DataFrame; column types are inferred from the tuples
df = spark.createDataFrame(data, schema=columns)

# Show what is about to be written
print("Original DataFrame:")
df.show()

# Persist the DataFrame as CSV. The target is a directory of part files;
# "overwrite" replaces any previous output and the header row is included.
output_path = "output_csv"
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

# Release the session's resources now that the example is done
spark.stop()




Original DataFrame:
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
|  David| 35|
+-------+---+



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, count

# Start (or reuse) the Spark session backing this example
spark = (
    SparkSession.builder
    .appName("WordCountExample")
    .getOrCreate()
)

# Input corpus: one sentence per element
text_data = [
    "Hello world",
    "PySpark is awesome",
    "Spark is powerful",
]

# Wrap every sentence in a 1-tuple so it becomes a single-column row
df = spark.createDataFrame(
    [(line,) for line in text_data],
    schema=["text"],
)

# Show the sentences untruncated
print("Original DataFrame:")
df.show(truncate=False)

# Word count expressed with DataFrame operations

# Step 1: split each sentence on a space, then explode to one row per word
df_words = df.select(
    explode(split(df["text"], " ")).alias("word")
)

# Step 2: group identical words and count the occurrences of each
word_count = (
    df_words
    .groupby("word")
    .agg(count("word").alias("count"))
)

# Show the per-word totals
print("\nWord Count Results:")
word_count.show()

# Release the session's resources now that the example is done
spark.stop()




Original DataFrame:
+------------------+
|text              |
+------------------+
|Hello world       |
|PySpark is awesome|
|Spark is powerful |
+------------------+


Word Count Results:
+--------+-----+
|    word|count|
+--------+-----+
|   Hello|    1|
|   world|    1|
|      is|    2|
| PySpark|    1|
| awesome|    1|
|powerful|    1|
|   Spark|    1|
+--------+-----+



In [6]:
from pyspark import SparkContext, SparkConf

# Create a Spark configuration and obtain a context.
# getOrCreate() is used instead of calling the SparkContext constructor directly:
# the constructor raises if a context is already active in this kernel (e.g. the
# cell is re-run after a failure), whereas getOrCreate() reuses the active one.
# When an existing context is reused, `conf` is not applied to it.
conf = SparkConf().setAppName("WordCountExampleRDD")
sc = SparkContext.getOrCreate(conf=conf)

try:
    # Sample data (text)
    text_data = ["Hello world", "PySpark is awesome", "Spark is powerful"]

    # Create an RDD from the text data
    rdd = sc.parallelize(text_data)

    # Perform word count using RDD transformations and actions

    # 1. Split the text into words (flatMap flattens the per-line word lists)
    words = rdd.flatMap(lambda line: line.split(" "))

    # 2. Assign a count of 1 to each word
    word_counts = words.map(lambda word: (word, 1))

    # 3. Perform word count by summing up the counts per word
    word_count_results = word_counts.reduceByKey(lambda x, y: x + y)

    # Display the word count results.
    # The loop variable is deliberately not called `count`, so it does not
    # overwrite the `count` function an earlier cell imported from
    # pyspark.sql.functions into the shared notebook namespace.
    print("Word Count Results:")
    for word, occurrences in word_count_results.collect():
        print(f"{word}: {occurrences}")
finally:
    # Stop the Spark context even if one of the steps above raised
    sc.stop()




Word Count Results:
awesome: 1
powerful: 1
Hello: 1
PySpark: 1
Spark: 1
world: 1
is: 2
