In [None]:
# Question 1
# Demonstrates basic RDD usage: two transformations (map, filter) and two
# actions (reduce, aggregate) on a small in-memory list.
from pyspark import SparkContext

# Create a SparkContext (local master, application name "RDDExample")
sc = SparkContext("local", "RDDExample")

try:
    # Create an RDD from a local data source
    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)

    # Perform transformations and actions on the RDD
    # Example 1: Map transformation - square each element
    squared_rdd = rdd.map(lambda x: x ** 2)
    print("Squared RDD:", squared_rdd.collect())

    # Example 2: Filter transformation - keep only even numbers
    even_rdd = rdd.filter(lambda x: x % 2 == 0)
    print("Even RDD:", even_rdd.collect())

    # Example 3: Reduce action - sum all elements
    sum_of_elements = rdd.reduce(lambda x, y: x + y)
    print("Sum of elements:", sum_of_elements)

    # Example 4: Aggregate action - calculate sum and count of elements.
    # The accumulator is a (running_sum, running_count) tuple starting at (0, 0):
    #   - the first function folds one element into an accumulator,
    #   - the second function merges two accumulators.
    sum_count = rdd.aggregate(
        (0, 0),
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]),
    )
    print("Sum:", sum_count[0])
    print("Count:", sum_count[1])
finally:
    # Close the SparkContext even when one of the jobs above raises; a context
    # left running would make re-running this cell fail when it tries to
    # construct a second SparkContext in the same kernel.
    sc.stop()

In [None]:
# Question 2
# Demonstrates loading a CSV into a DataFrame, a few DataFrame operations
# (filter, groupBy/avg, join) and an equivalent Spark SQL query.
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrameExample").getOrCreate()

try:
    # Load a CSV file into a DataFrame (first row is the header; column types
    # are inferred from the data)
    df = spark.read.csv("file_path.csv", header=True, inferSchema=True)

    # Perform common DataFrame operations
    # NOTE: filtered_df, average_age_df and joined_df are only defined here;
    # no action (show/collect/...) is invoked on them in this cell.
    # Example 1: Filter rows based on a condition
    filtered_df = df.filter(df["age"] > 30)

    # Example 2: Group by a column and calculate average
    average_age_df = df.groupBy("department").avg("age")

    # Example 3: Join two DataFrames (both are projections of df, joined back
    # together on the shared "employee_id" column)
    employees_df = df.select("employee_id", "name")
    departments_df = df.select("employee_id", "department")
    joined_df = employees_df.join(departments_df, "employee_id")

    # Apply Spark SQL queries on the DataFrame
    # Example 1: Register DataFrame as a temporary view
    df.createOrReplaceTempView("employees")

    # Example 2: Execute SQL query on the DataFrame
    result = spark.sql("SELECT * FROM employees WHERE age > 30")

    # Show the result
    result.show()
finally:
    # Stop the SparkSession even if the read or the query fails, so a failed
    # run does not leave a session alive in the kernel for later cells to
    # pick up through getOrCreate().
    spark.stop()

In [None]:
# Question 3
# Word count over a Kafka topic using the DStream-based Spark Streaming API.
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
# NOTE(review): pyspark.streaming.kafka (Kafka 0.8 DStream integration) was
# removed in Spark 3.0; this cell needs Spark 2.x with the matching
# spark-streaming-kafka-0-8 package on the classpath. On Spark 3+, use
# Structured Streaming (spark.readStream.format("kafka")) instead.
from pyspark.streaming.kafka import KafkaUtils

# A StreamingContext wraps an existing SparkContext. No context is defined
# elsewhere for this cell, so obtain one here (reusing an active context if
# the kernel already has one).
spark_context = SparkContext.getOrCreate(
    SparkConf().setMaster("local[2]").setAppName("KafkaStreamingExample")
)

# Create a Spark Streaming Context with a 1-second batch interval
spark_streaming_context = StreamingContext(spark_context, batchDuration=1)

try:
    # Configure the streaming application to consume data from a streaming source
    kafka_params = {
        "bootstrap.servers": "localhost:9092",
        "key.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        "value.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        "group.id": "spark-streaming-group"
    }
    kafka_topic = "input_topic"

    # Create a DStream by connecting to the Kafka source
    kafka_stream = KafkaUtils.createDirectStream(
        spark_streaming_context, [kafka_topic], kafka_params
    )

    # Implement streaming transformations and actions
    # Example: Word count from the input stream.
    # Each record is indexed with [1] to get the text that is split into words
    # (presumably records are (key, value) pairs - confirm against the
    # KafkaUtils.createDirectStream documentation for the Spark version in use).
    word_counts = (
        kafka_stream
        .flatMap(lambda x: x[1].split(" "))
        .map(lambda x: (x, 1))
        .reduceByKey(lambda x, y: x + y)
    )

    # Print the word counts
    word_counts.pprint()

    # Start the streaming application
    spark_streaming_context.start()

    # Wait for the streaming application to terminate
    spark_streaming_context.awaitTermination()
finally:
    # Shut down the streaming context and its SparkContext on failure or on
    # interruption (e.g. the notebook's "interrupt kernel"), so the cell can be
    # re-run without a stale context.
    spark_streaming_context.stop(stopSparkContext=True, stopGraceFully=False)

In [None]:
# Question 4
# Reads a MySQL table over JDBC, queries it with Spark SQL, and loads further
# data sets from HDFS (CSV) and S3 (Parquet).
import os

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Spark SQL Database Integration") \
    .config("spark.sql.catalogImplementation", "hive") \
    .getOrCreate()

try:
    # Connect Spark with a relational database (MySQL in this example)
    url = "jdbc:mysql://localhost:3306/mydatabase"
    # Credentials come from the environment so real secrets are not stored in
    # the notebook; the defaults are the original placeholder values.
    # NOTE(review): MySQL Connector/J 8+ names its driver class
    # "com.mysql.cj.jdbc.Driver" - confirm which connector version is deployed.
    properties = {
        "user": os.environ.get("MYSQL_USER", "username"),
        "password": os.environ.get("MYSQL_PASSWORD", "password"),
        "driver": "com.mysql.jdbc.Driver"
    }

    # Read data from the database table using Spark SQL
    df = spark.read.jdbc(url=url, table="mytable", properties=properties)

    # Perform SQL operations on the data
    df.createOrReplaceTempView("mytable")
    result = spark.sql("SELECT * FROM mytable WHERE age > 30")

    # Explore integration with other data sources
    # Example: Read data from HDFS or Amazon S3
    hdfs_path = "hdfs://localhost:9000/path/to/data"
    df_hdfs = spark.read.format("csv").load(hdfs_path)

    s3_path = "s3a://bucket/path/to/data"
    df_s3 = spark.read.format("parquet").load(s3_path)

    # Display the results
    result.show()
    df_hdfs.show()
    df_s3.show()
finally:
    # Stop the SparkSession even when a connection or read fails, so the
    # session does not stay alive in the kernel after an error.
    spark.stop()