In [None]:
# 1. Working with RDDs:
# a) Write a Python program to create an RDD from a local data source.
from pyspark import SparkContext

# Create a SparkContext that runs Spark locally under the app name "RDD Creation"
sc = SparkContext("local", "RDD Creation")

# try/finally guarantees the context is stopped even when a step below raises.
# Without it a failed run leaves the context alive, and re-running this cell
# fails because SparkContext(...) refuses to create a second active context.
try:
    # Local data source
    data = [1, 2, 3, 4, 5]

    # Create an RDD from the local data source
    rdd = sc.parallelize(data)

    # Transformation (map: double each element) followed by an action
    # (collect: bring the results back to the driver as a Python list)
    result = rdd.map(lambda x: x * 2).collect()

    # Print the result, one value per line
    for num in result:
        print(num)
finally:
    # Stop the SparkContext
    sc.stop()


In [None]:
# b) Implement transformations and actions on the RDD to perform data processing tasks.
from pyspark import SparkContext

# Create a SparkContext
sc = SparkContext("local", "RDD Transformations and Actions")

# try/finally guarantees the context is stopped even when a step below raises.
# Without it a failed run leaves the context alive, and re-running this cell
# fails because SparkContext(...) refuses to create a second active context.
try:
    # Local data source
    data = [1, 2, 3, 4, 5]

    # Create an RDD from the local data source
    rdd = sc.parallelize(data)

    # Transformations are lazy; nothing runs until an action is called
    squared_rdd = rdd.map(lambda x: x ** 2)  # Square each element
    filtered_rdd = squared_rdd.filter(lambda x: x > 10)  # Keep elements greater than 10

    # Action: sum the remaining elements.
    # NOTE: reduce() raises on an empty RDD; with this data 16 and 25 survive the filter.
    sum_result = filtered_rdd.reduce(lambda x, y: x + y)

    # Print the results
    print("Squared RDD:")
    for num in squared_rdd.collect():
        print(num)

    print("\nFiltered RDD:")
    for num in filtered_rdd.collect():
        print(num)

    print("\nSum Result:", sum_result)
finally:
    # Stop the SparkContext
    sc.stop()

In [None]:
# c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.

 # Spark entry point for the RDD API (the import below must start at column 0 to parse)
from pyspark import SparkContext

# Create a SparkContext
sc = SparkContext("local", "RDD Operations")

# try/finally guarantees the context is stopped even when a step below raises.
# Without it a failed run leaves the context alive, and re-running this cell
# fails because SparkContext(...) refuses to create a second active context.
try:
    # Local data source
    data = [1, 2, 3, 4, 5]

    # Create an RDD from the local data source
    rdd = sc.parallelize(data)

    # Transformations (lazy)
    squared_rdd = rdd.map(lambda x: x ** 2)  # Square each element
    even_rdd = squared_rdd.filter(lambda x: x % 2 == 0)  # Keep the even squares

    # Actions on the original (un-squared) RDD
    sum_result = rdd.reduce(lambda x, y: x + y)  # Sum of all elements
    product_result = rdd.reduce(lambda x, y: x * y)  # Product of all elements
    mean_result = rdd.mean()  # Mean of all elements

    # Print the results
    print("Squared RDD:")
    for num in squared_rdd.collect():
        print(num)

    print("\nEven Numbers:")
    for num in even_rdd.collect():
        print(num)

    print("\nSum Result:", sum_result)
    print("Product Result:", product_result)
    print("Mean Result:", mean_result)
finally:
    # Stop the SparkContext
    sc.stop()

In [None]:
# 2. Spark DataFrame Operations:
# a) Write a Python program to load a CSV file into a Spark DataFrame.
from pyspark.sql import SparkSession

# Obtain a SparkSession (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("CSV to DataFrame")
    .getOrCreate()
)

# Read the CSV through the generic reader: first row is the header,
# column types are inferred from the data
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("path/to/your/csv/file.csv")
)

# Print the first rows of the DataFrame
df.show()

# Further DataFrame operations go here
# ...

# Release the session
spark.stop()

In [None]:
# b) Perform common DataFrame operations such as filtering, grouping, or joining.
 # Entry point for the DataFrame API (the import below must start at column 0 to parse)
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrame Operations").getOrCreate()

# Load the CSV file into a DataFrame (first row is the header, types are inferred).
# The operations below assume the file has "age", "gender", "salary" and "id" columns.
df = spark.read.csv("path/to/your/csv/file.csv", header=True, inferSchema=True)

# Show the DataFrame
df.show()

# Filter: keep rows where age is greater than 30
filtered_df = df.filter(df["age"] > 30)
filtered_df.show()

# Group by gender and compute the average salary and maximum age per group
grouped_df = df.groupBy("gender").agg({"salary": "avg", "age": "max"})
grouped_df.show()

# Inner-join with a second CSV on the shared "id" column
other_df = spark.read.csv("path/to/another/csv/file.csv", header=True, inferSchema=True)
joined_df = df.join(other_df, on="id", how="inner")
joined_df.show()

# Perform additional operations on the DataFrames
# ...

# Stop the SparkSession
spark.stop()


In [None]:
# c) Apply Spark SQL queries on the DataFrame to extract insights from the data.
from pyspark.sql import SparkSession

# Obtain a SparkSession (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("Spark SQL Queries")
    .getOrCreate()
)

# Read the CSV through the generic reader: first row is the header,
# column types are inferred from the data
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("path/to/your/csv/file.csv")
)

# Expose the DataFrame to SQL under the name "my_table"
df.createOrReplaceTempView("my_table")

# The SQL statements to run against the view
query1 = "SELECT * FROM my_table WHERE age > 30"
query2 = "SELECT gender, AVG(salary) AS avg_salary, MAX(age) AS max_age FROM my_table GROUP BY gender"

# Rows with age above 30
result1 = spark.sql(query1)
result1.show()

# Average salary and maximum age per gender
result2 = spark.sql(query2)
result2.show()

# Further SQL queries go here
# ...

# Release the session
spark.stop()

In [None]:
# 3. Spark Streaming
# a) Write a Python program to create a Spark Streaming application.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a SparkContext.
# NOTE: no master is set here, so it must come from spark-submit or the Spark config.
# When running locally use at least two threads (e.g. local[2]): the socket
# receiver occupies one, leaving none for processing otherwise.
sc = SparkContext(appName="Spark Streaming Application")

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# try/finally guarantees both contexts are stopped when the stream fails or the
# cell is interrupted; otherwise the leaked SparkContext makes the next
# SparkContext(...) call in this notebook fail.
try:
    # Create a DStream by connecting to a TCP socket and listening for text lines
    lines = ssc.socketTextStream("localhost", 9999)

    # Split each line into words and count occurrences of each word per batch
    words = lines.flatMap(lambda line: line.split(" "))
    word_counts = words.countByValue()

    # Output operation: print the first counts of every batch
    word_counts.pprint()

    # Start the streaming computation
    ssc.start()

    # Block until the stream terminates (or the cell is interrupted)
    ssc.awaitTermination()
finally:
    # Stop the streaming context together with the underlying SparkContext
    ssc.stop(stopSparkContext=True)


In [None]:
# b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Create a SparkContext.
# NOTE: no master is set here, so it must come from spark-submit or the Spark config.
sc = SparkContext(appName="Spark Streaming Application")

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# Choose exactly ONE input source. Creating a socket stream that is never used
# still starts a receiver, which occupies an executor slot and keeps retrying
# the connection, so only the selected source is created.
USE_KAFKA = True  # set to False to read from the TCP socket instead

# try/finally guarantees both contexts are stopped when the stream fails or the
# cell is interrupted; otherwise the leaked SparkContext makes the next
# SparkContext(...) call in this notebook fail.
try:
    if USE_KAFKA:
        # NOTE: pyspark.streaming.kafka (the Kafka 0.8 DStream API) was removed in
        # Spark 3.0; this branch needs Spark 2.x with the matching Kafka package.
        kafka_params = {
            "bootstrap.servers": "localhost:9092",
            "group.id": "my_consumer_group"
        }

        # Consume data from a Kafka topic
        kafka_topic = "my_topic"
        kafka_stream = KafkaUtils.createDirectStream(ssc, [kafka_topic], kafka_params)

        # Kafka records arrive as (key, value) pairs; keep only the value
        lines = kafka_stream.map(lambda x: x[1])
    else:
        # Consume text lines from a TCP socket
        socket_stream = ssc.socketTextStream("localhost", 9999)
        lines = socket_stream

    # Split each line into words and count occurrences of each word per batch
    words = lines.flatMap(lambda line: line.split(" "))
    word_counts = words.countByValue()

    # Output operation: print the first counts of every batch
    word_counts.pprint()

    # Start the streaming computation
    ssc.start()

    # Block until the stream terminates (or the cell is interrupted)
    ssc.awaitTermination()
finally:
    # Stop the streaming context together with the underlying SparkContext
    ssc.stop(stopSparkContext=True)


In [None]:
# c) Implement streaming transformations and actions to process and analyze the incoming data stream.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Create a SparkContext.
# NOTE: no master is set here, so it must come from spark-submit or the Spark config.
sc = SparkContext(appName="Streaming Transformations and Actions")

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# try/finally guarantees both contexts are stopped when the stream fails or the
# cell is interrupted; otherwise the leaked SparkContext makes the next
# SparkContext(...) call in this notebook fail.
try:
    # Configure Kafka parameters.
    # NOTE: pyspark.streaming.kafka (the Kafka 0.8 DStream API) was removed in
    # Spark 3.0; this cell needs Spark 2.x with the matching Kafka package.
    kafka_params = {
        "bootstrap.servers": "localhost:9092",
        "group.id": "my_consumer_group"
    }

    # Consume data from a Kafka topic
    kafka_topic = "my_topic"
    kafka_stream = KafkaUtils.createDirectStream(ssc, [kafka_topic], kafka_params)

    # Kafka records arrive as (key, value) pairs; keep only the value
    lines = kafka_stream.map(lambda x: x[1])

    # Classic word count per batch: split into words, pair each with 1, sum by word
    word_counts = (
        lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )

    # Output operation: print the first counts of every batch
    word_counts.pprint()

    # Perform additional operations on the DStream
    # ...

    # Start the streaming computation
    ssc.start()

    # Block until the stream terminates (or the cell is interrupted)
    ssc.awaitTermination()
finally:
    # Stop the streaming context together with the underlying SparkContext
    ssc.stop(stopSparkContext=True)


In [None]:
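# 4. Spark SQL and Data Source Integration:
# a) Connect Spark to a relational database (here MySQL over JDBC) and load a table into a DataFrame.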
from pyspark.sql import SparkSession

import os

# Create a SparkSession
spark = SparkSession.builder.appName("Spark-Database Integration").getOrCreate()

# Configure the MySQL database connection.
# Credentials are read from the MYSQL_USER / MYSQL_PASSWORD environment variables
# so real secrets never have to be typed into (and saved with) the notebook;
# the original placeholders remain as the fallback values.
url = "jdbc:mysql://localhost:3306/mydatabase"
properties = {
    "user": os.environ.get("MYSQL_USER", "your_username"),
    "password": os.environ.get("MYSQL_PASSWORD", "your_password"),
    # NOTE(review): Connector/J 8+ names this class com.mysql.cj.jdbc.Driver;
    # confirm against the connector JAR on the Spark classpath.
    "driver": "com.mysql.jdbc.Driver"
}

# Read the whole database table into a DataFrame
table_name = "employees"
df = spark.read.jdbc(url=url, table=table_name, properties=properties)

# Display the DataFrame
df.show()

# Perform further operations on the DataFrame
# ...

# Stop the SparkSession
spark.stop()


In [None]:
# b) Perform SQL operations on the data stored in the database using Spark SQL.
 # Imports for this cell (they must start at column 0 for the cell to parse)
import os

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("Spark SQL - Database Integration").getOrCreate()

# Configure the MySQL database connection.
# Credentials are read from the MYSQL_USER / MYSQL_PASSWORD environment variables
# so real secrets never have to be typed into (and saved with) the notebook;
# the original placeholders remain as the fallback values.
url = "jdbc:mysql://localhost:3306/mydatabase"
properties = {
    "user": os.environ.get("MYSQL_USER", "your_username"),
    "password": os.environ.get("MYSQL_PASSWORD", "your_password"),
    # NOTE(review): Connector/J 8+ names this class com.mysql.cj.jdbc.Driver;
    # confirm against the connector JAR on the Spark classpath.
    "driver": "com.mysql.jdbc.Driver"
}

# Load the database table and register it as a temporary view of the same name
table_name = "employees"
spark.read.jdbc(url=url, table=table_name, properties=properties).createOrReplaceTempView(table_name)

# Execute SQL queries on the table (assumes the table has an "age" column)
query = "SELECT * FROM employees WHERE age > 30"
result = spark.sql(query)

# Display the query result
result.show()

# Perform additional SQL operations
# ...

# Stop the SparkSession
spark.stop()


In [None]:
# c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.
# This cell previously relied on a `spark` variable left over from another cell,
# which had already been stopped there. Obtain a session here so the cell works
# on a fresh kernel and after the earlier cells have run.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark S3 Integration").getOrCreate()

# Read a text file from S3 through the s3a:// connector.
# NOTE(review): s3a access needs the hadoop-aws package and AWS credentials
# configured for the Spark cluster; confirm the environment provides them.
df = spark.read.text("s3a://your_bucket/your_file.txt")

# Write the lines back to S3. Spark writes the output path as a directory of
# part files and, with the default save mode, fails if the path already exists.
df.write.text("s3a://your_bucket/your_output.txt")

# Stop the SparkSession
spark.stop()