### Spark Assignment

1. Working with RDDs:
   a) Write a Python program to create an RDD from a local data source.
   b) Implement transformations and actions on the RDD to perform data processing tasks.
   c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.


In [None]:
# a) Write a Python program to create an RDD from a local data source.

from pyspark.sql import SparkSession

def create_rdd_from_local_data():
    """Create an RDD from an in-memory Python list and print its contents.

    Starts (or reuses) a SparkSession, parallelizes a small example list,
    collects it back to the driver, prints it, and stops the session.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("CreateRDDFromLocalData") \
        .getOrCreate()

    try:
        # Your local data (example data)
        data = [1, 2, 3, 4, 5]

        # Create an RDD from the local data
        rdd = spark.sparkContext.parallelize(data)

        # Print the RDD elements
        rdd_elements = rdd.collect()
        print("RDD Elements:", rdd_elements)
    finally:
        # Stop the SparkSession even if the job above fails, so a later
        # getOrCreate() does not silently reuse this session.
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    create_rdd_from_local_data()

    
# b) Implement transformations and actions on the RDD to perform data processing tasks.

from pyspark.sql import SparkSession

def perform_data_processing_tasks():
    """Demonstrate RDD transformations and actions on (name, age) tuples.

    Builds an RDD from a hard-coded list, derives filtered / mapped / sorted
    RDDs from it, runs count, collect, sum and max actions, prints every
    result, and stops the session.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("DataProcessingTasks") \
        .getOrCreate()

    try:
        # Example data (list of tuples)
        data = [
            ("Alice", 25),
            ("Bob", 30),
            ("Charlie", 22),
            ("David", 28),
            ("Eva", 35),
        ]

        # Create an RDD from the local data
        rdd = spark.sparkContext.parallelize(data)

        # Transformation: Filter the RDD to get people younger than 30
        rdd_filtered = rdd.filter(lambda x: x[1] < 30)

        # Transformation: Map the RDD to create a new RDD with names only
        rdd_names = rdd.map(lambda x: x[0])

        # Transformation: Sort the RDD based on age in descending order
        rdd_sorted = rdd.sortBy(lambda x: x[1], ascending=False)

        # Action: Count the number of elements in the RDD
        count = rdd.count()

        # Action: Collect the elements of the RDD as a list
        elements = rdd.collect()

        # Project the ages once; both aggregate actions below reuse it
        ages = rdd.map(lambda x: x[1])

        # Action: Calculate the sum of ages in the RDD
        sum_of_ages = ages.sum()

        # Action: Find the maximum age in the RDD
        max_age = ages.max()

        # Action: Print the RDD elements after the transformations
        print("Filtered RDD (Younger than 30):", rdd_filtered.collect())
        print("Names RDD:", rdd_names.collect())
        print("Sorted RDD:", rdd_sorted.collect())

        # Print the results of the actions
        print("Number of elements in RDD:", count)
        print("All elements in RDD:", elements)
        print("Sum of ages in RDD:", sum_of_ages)
        print("Maximum age in RDD:", max_age)
    finally:
        # Stop the SparkSession even if one of the jobs above fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    perform_data_processing_tasks()

# c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.

from pyspark.sql import SparkSession

def analyze_and_manipulate_data():
    """Demonstrate map, filter, reduce and aggregate on (name, age) tuples.

    Builds an RDD from a hard-coded list, squares the ages, keeps names that
    start with 'A', computes the total age with reduce and the average age
    with aggregate, prints the results, and stops the session.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("DataAnalysisAndManipulation") \
        .getOrCreate()

    try:
        # Example data (list of tuples)
        data = [
            ("Alice", 25),
            ("Bob", 30),
            ("Charlie", 22),
            ("David", 28),
            ("Eva", 35),
        ]

        # Create an RDD from the local data
        rdd = spark.sparkContext.parallelize(data)

        # Transformation: Map the RDD to create a new RDD with age squared
        rdd_age_squared = rdd.map(lambda x: (x[0], x[1]**2))

        # Transformation: Filter the RDD to get people whose name starts with 'A'
        rdd_names_with_a = rdd.filter(lambda x: x[0].startswith('A'))

        # Action: Calculate the sum of ages using reduce
        total_age = rdd.map(lambda x: x[1]).reduce(lambda x, y: x + y)

        # Action: Calculate the average age using aggregate.
        # The accumulator is a (sum_of_ages, row_count) pair: the first
        # function folds one row into it, the second merges two accumulators.
        total_and_count = rdd.aggregate(
            (0, 0),
            lambda acc, x: (acc[0] + x[1], acc[1] + 1),
            lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]),
        )
        average_age = total_and_count[0] / total_and_count[1]

        # Print the results of the transformations and actions
        print("RDD with age squared:", rdd_age_squared.collect())
        print("RDD with names starting with 'A':", rdd_names_with_a.collect())
        print("Total age:", total_age)
        print("Average age:", average_age)
    finally:
        # Stop the SparkSession even if one of the jobs above fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    analyze_and_manipulate_data()

2. Spark DataFrame Operations:
   a) Write a Python program to load a CSV file into a Spark DataFrame.
   b) Perform common DataFrame operations such as filtering, grouping, or joining.
   c) Apply Spark SQL queries on the DataFrame to extract insights from the data.


In [None]:
# a) Write a Python program to load a CSV file into a Spark DataFrame.

from pyspark.sql import SparkSession

def load_csv_into_dataframe(file_path):
    """Load a CSV file into a Spark DataFrame and print its schema and rows.

    Args:
        file_path: Path to the CSV file. The first line is read as the
            header and column types are inferred from the data.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("LoadCSVIntoDataFrame") \
        .getOrCreate()

    try:
        # Load the CSV file into a DataFrame
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Print the schema and first few rows of the DataFrame
        df.printSchema()
        df.show()
    finally:
        # Stop the SparkSession even if the file is missing or unreadable
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    # Replace 'your_csv_file_path' with the actual path to your CSV file
    csv_file_path = 'your_csv_file_path'
    load_csv_into_dataframe(csv_file_path)

# b) Perform common DataFrame operations such as filtering, grouping, or joining.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def perform_dataframe_operations(file_path):
    """Filter and group a CSV-backed DataFrame and print both results.

    Args:
        file_path: Path to a CSV file with a header row. The code reads the
            'age' and 'gender' columns, so the file must provide both.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("DataFrameOperations") \
        .getOrCreate()

    try:
        # Load the CSV file into a DataFrame
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Filter the DataFrame to get rows where age is greater than 30
        df_filtered = df.filter(df['age'] > 30)

        # Group the DataFrame by the 'gender' column and calculate the average age per gender
        df_grouped = df.groupBy('gender').avg('age')

        # Join the DataFrame with another DataFrame on the 'user_id' column
        # Assuming you have another DataFrame 'other_df' with 'user_id' and 'address' columns
        # other_df = spark.read.csv('other_csv_file_path', header=True, inferSchema=True)
        # df_joined = df.join(other_df, on='user_id', how='inner')

        # Print the filtered DataFrame
        print("Filtered DataFrame:")
        df_filtered.show()

        # Print the grouped DataFrame
        print("Grouped DataFrame:")
        df_grouped.show()
    finally:
        # Stop the SparkSession even if loading or a query fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    # Replace 'your_csv_file_path' with the actual path to your CSV file
    csv_file_path = 'your_csv_file_path'
    perform_dataframe_operations(csv_file_path)

# c) Apply Spark SQL queries on the DataFrame to extract insights from the data.

from pyspark.sql import SparkSession

def apply_spark_sql_queries(file_path):
    """Run three example Spark SQL queries against a CSV-backed DataFrame.

    The DataFrame is registered as the temporary view ``employees``; the
    queries read its 'age' and 'gender' columns.

    Args:
        file_path: Path to a CSV file with a header row.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkSQLQueries") \
        .getOrCreate()

    try:
        # Load the CSV file into a DataFrame
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Register the DataFrame as a temporary SQL table
        df.createOrReplaceTempView("employees")

        # Perform Spark SQL queries on the DataFrame
        # Example 1: Get all rows where age is greater than 30
        result1 = spark.sql("SELECT * FROM employees WHERE age > 30")

        # Example 2: Calculate the average age
        result2 = spark.sql("SELECT AVG(age) AS average_age FROM employees")

        # Example 3: Group the data by gender and calculate the average age per gender
        result3 = spark.sql("SELECT gender, AVG(age) AS average_age FROM employees GROUP BY gender")

        # Print the query results
        print("Query Result 1:")
        result1.show()

        print("Query Result 2:")
        result2.show()

        print("Query Result 3:")
        result3.show()
    finally:
        # Stop the SparkSession even if loading or a query fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    # Replace 'your_csv_file_path' with the actual path to your CSV file
    csv_file_path = 'your_csv_file_path'
    apply_spark_sql_queries(csv_file_path)

3. Spark Streaming:
  a) Write a Python program to create a Spark Streaming application.
   b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
   c) Implement streaming transformations and actions to process and analyze the incoming data stream.


In [None]:
# a) Write a Python program to create a Spark Streaming application.

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

def process_stream(stream):
    """Print every record of one micro-batch.

    Intended as a ``DStream.foreachRDD`` callback. ``foreachRDD`` invokes a
    one-argument callback with a single RDD per batch, and an RDD is not
    iterable, so it must not be looped over directly.

    Args:
        stream: The RDD for the current batch. An iterable of RDDs is also
            accepted so that direct callers passing a list keep working.
    """
    # Wrap a lone RDD in a list so both call styles share one loop
    batches = [stream] if hasattr(stream, "isEmpty") else stream

    for rdd in batches:
        # Empty batches are common on an idle stream; stay silent for them
        if rdd.isEmpty():
            continue

        print("New batch of data received:")
        # Print the data in the current RDD
        for record in rdd.collect():
            print(record)

def create_spark_streaming_app():
    """Run a DStream application that prints text lines read from a socket.

    Creates a StreamingContext with a 1-second batch interval, reads lines
    from localhost:9999, hands every batch to ``process_stream``, and blocks
    until the streaming context terminates.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkStreamingApp") \
        .getOrCreate()

    try:
        # Create a StreamingContext with a batch interval of 1 second
        ssc = StreamingContext(spark.sparkContext, 1)
        try:
            # Create a DStream from a TCP data source (example: localhost:9999)
            # Replace 'localhost' and '9999' with your own data source information
            stream = ssc.socketTextStream('localhost', 9999)

            # Process each batch of data received from the stream
            stream.foreachRDD(process_stream)

            # Start the Spark Streaming application
            ssc.start()

            # Wait for the application to terminate
            ssc.awaitTermination()
        finally:
            # Stop streaming on error or Ctrl-C as well; the SparkContext is
            # left for spark.stop() below.
            ssc.stop(stopSparkContext=False)
    finally:
        # Stop the SparkSession
        spark.stop()

if __name__ == "__main__":
    # Run only when executed directly. This call blocks until the streaming
    # context terminates, so nothing after it runs in the meantime.
    create_spark_streaming_app()

#   b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).

# 1.) Using Kafka as the streaming source:

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def process_stream(stream):
    """Print every record of one micro-batch.

    Intended as a ``DStream.foreachRDD`` callback. ``foreachRDD`` invokes a
    one-argument callback with a single RDD per batch, and an RDD is not
    iterable, so it must not be looped over directly.

    Args:
        stream: The RDD for the current batch. An iterable of RDDs is also
            accepted so that direct callers passing a list keep working.
    """
    # Wrap a lone RDD in a list so both call styles share one loop
    batches = [stream] if hasattr(stream, "isEmpty") else stream

    for rdd in batches:
        # Empty batches are common on an idle stream; stay silent for them
        if rdd.isEmpty():
            continue

        print("New batch of data received:")
        # Print the data in the current RDD
        for record in rdd.collect():
            print(record)

def create_spark_streaming_app_kafka():
    """Run a DStream application that prints records read from a Kafka topic.

    Creates a StreamingContext with a 1-second batch interval, opens a direct
    stream on the topic 'your_topic', hands every batch to
    ``process_stream``, and blocks until the streaming context terminates.

    NOTE: ``KafkaUtils`` comes from ``pyspark.streaming.kafka``, which was
    removed in Spark 3.0; this function needs a Spark 2.x runtime with the
    matching Kafka integration package on the classpath.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkStreamingApp-Kafka") \
        .getOrCreate()

    try:
        # Create a StreamingContext with a batch interval of 1 second
        ssc = StreamingContext(spark.sparkContext, 1)
        try:
            # Define Kafka parameters
            kafka_params = {
                "bootstrap.servers": "your_kafka_broker_1:9092,your_kafka_broker_2:9092",  # Replace with Kafka broker addresses
                "group.id": "spark-streaming-consumer-group",  # Consumer group ID
                "auto.offset.reset": "latest",  # Start consuming from the latest offset
            }

            # Create a DStream from Kafka topic 'your_topic'
            kafka_stream = KafkaUtils.createDirectStream(
                ssc,
                ['your_topic'],  # Replace with the topic name
                kafkaParams=kafka_params
            )

            # Process each batch of data received from the Kafka stream
            kafka_stream.foreachRDD(process_stream)

            # Start the Spark Streaming application
            ssc.start()

            # Wait for the application to terminate
            ssc.awaitTermination()
        finally:
            # Stop streaming on error or Ctrl-C as well; the SparkContext is
            # left for spark.stop() below.
            ssc.stop(stopSparkContext=False)
    finally:
        # Stop the SparkSession
        spark.stop()

if __name__ == "__main__":
    # Run only when executed directly. This call blocks until the streaming
    # context terminates, so nothing after it runs in the meantime.
    create_spark_streaming_app_kafka()

    
#  2.) Using a socket as the streaming source:


from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

def process_stream(stream):
    """Print every record of one micro-batch.

    Intended as a ``DStream.foreachRDD`` callback. ``foreachRDD`` invokes a
    one-argument callback with a single RDD per batch, and an RDD is not
    iterable, so it must not be looped over directly.

    Args:
        stream: The RDD for the current batch. An iterable of RDDs is also
            accepted so that direct callers passing a list keep working.
    """
    # Wrap a lone RDD in a list so both call styles share one loop
    batches = [stream] if hasattr(stream, "isEmpty") else stream

    for rdd in batches:
        # Empty batches are common on an idle stream; stay silent for them
        if rdd.isEmpty():
            continue

        print("New batch of data received:")
        # Print the data in the current RDD
        for record in rdd.collect():
            print(record)

def create_spark_streaming_app_socket():
    """Run a DStream application that prints text lines read from a socket.

    Creates a StreamingContext with a 1-second batch interval, reads lines
    from localhost:9999, hands every batch to ``process_stream``, and blocks
    until the streaming context terminates.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkStreamingApp-Socket") \
        .getOrCreate()

    try:
        # Create a StreamingContext with a batch interval of 1 second
        ssc = StreamingContext(spark.sparkContext, 1)
        try:
            # Create a DStream from the socket source on localhost:9999
            socket_stream = ssc.socketTextStream("localhost", 9999)

            # Process each batch of data received from the socket stream
            socket_stream.foreachRDD(process_stream)

            # Start the Spark Streaming application
            ssc.start()

            # Wait for the application to terminate
            ssc.awaitTermination()
        finally:
            # Stop streaming on error or Ctrl-C as well; the SparkContext is
            # left for spark.stop() below.
            ssc.stop(stopSparkContext=False)
    finally:
        # Stop the SparkSession
        spark.stop()

if __name__ == "__main__":
    # Run only when executed directly. This call blocks until the streaming
    # context terminates, so nothing after it runs in the meantime.
    create_spark_streaming_app_socket()
    
    
#   c) Implement streaming transformations and actions to process and analyze the incoming data stream.
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def _parse_stream_record(raw):
    """Normalize one raw stream element to a dict, or return None.

    Dicts pass through unchanged. A 2-tuple is treated as a Kafka
    (key, value) pair and its value is used. Text or bytes are parsed as
    JSON (assumption: payloads are JSON objects; adjust if the topic uses
    another format). Anything that does not end up as a dict yields None.
    """
    import json  # local import so the helper does not depend on file-level imports

    if isinstance(raw, tuple) and len(raw) == 2:
        raw = raw[1]
    if isinstance(raw, (bytes, bytearray)):
        raw = raw.decode("utf-8", errors="replace")
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            # Not JSON: skip this element instead of failing the whole batch
            return None
    return raw if isinstance(raw, dict) else None


def process_stream(stream):
    """Print one micro-batch, then its age filter and per-gender average age.

    Intended as a ``DStream.foreachRDD`` callback. ``foreachRDD`` invokes a
    one-argument callback with a single RDD per batch, and an RDD is not
    iterable, so it must not be looped over directly.

    Raw elements are printed as received. For the analysis they are first
    normalized to dicts, because Kafka direct streams deliver (key, value)
    tuples and socket streams deliver text lines, neither of which has
    ``.get``. Elements that cannot be normalized are left out of the analysis.

    Args:
        stream: The RDD for the current batch. An iterable of RDDs is also
            accepted so that direct callers passing a list keep working.
    """
    # Wrap a lone RDD in a list so both call styles share one loop
    batches = [stream] if hasattr(stream, "isEmpty") else stream

    for rdd in batches:
        if rdd.isEmpty():
            continue

        print("New batch of data received:")
        # Print the data in the current RDD
        for record in rdd.collect():
            print(record)

        # Keep only the elements that could be turned into dict records
        records = rdd.map(_parse_stream_record).filter(lambda record: record is not None)

        # Perform streaming transformations and actions
        # Example: Filter the data to get records with age greater than 30
        rdd_filtered = records.filter(lambda record: record.get('age', 0) > 30)

        # Example: Group the data by gender and calculate the average age per gender
        rdd_grouped = records.groupBy(lambda record: record.get('gender', 'Unknown')).mapValues(
            lambda group: sum(record.get('age', 0) for record in group) / len(group)
        )

        # Print the results of streaming transformations and actions
        print("Filtered RDD:")
        for record in rdd_filtered.collect():
            print(record)

        print("Grouped RDD:")
        for key, value in rdd_grouped.collect():
            print(f"Gender: {key}, Average Age: {value}")

def create_spark_streaming_app_kafka():
    """Run a DStream application that analyzes records read from a Kafka topic.

    Creates a StreamingContext with a 1-second batch interval, opens a direct
    stream on the topic 'your_topic', hands every batch to
    ``process_stream``, and blocks until the streaming context terminates.

    NOTE: ``KafkaUtils`` comes from ``pyspark.streaming.kafka``, which was
    removed in Spark 3.0; this function needs a Spark 2.x runtime with the
    matching Kafka integration package on the classpath.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkStreamingApp-Kafka") \
        .getOrCreate()

    try:
        # Create a StreamingContext with a batch interval of 1 second
        ssc = StreamingContext(spark.sparkContext, 1)
        try:
            # Define Kafka parameters
            kafka_params = {
                "bootstrap.servers": "your_kafka_broker_1:9092,your_kafka_broker_2:9092",  # Replace with Kafka broker addresses
                "group.id": "spark-streaming-consumer-group",  # Consumer group ID
                "auto.offset.reset": "latest",  # Start consuming from the latest offset
            }

            # Create a DStream from Kafka topic 'your_topic'
            kafka_stream = KafkaUtils.createDirectStream(
                ssc,
                ['your_topic'],  # Replace with the topic name
                kafkaParams=kafka_params
            )

            # Process each batch of data received from the Kafka stream
            kafka_stream.foreachRDD(process_stream)

            # Start the Spark Streaming application
            ssc.start()

            # Wait for the application to terminate
            ssc.awaitTermination()
        finally:
            # Stop streaming on error or Ctrl-C as well; the SparkContext is
            # left for spark.stop() below.
            ssc.stop(stopSparkContext=False)
    finally:
        # Stop the SparkSession
        spark.stop()

if __name__ == "__main__":
    # Run only when executed directly. This call blocks until the streaming
    # context terminates, so nothing after it runs in the meantime.
    create_spark_streaming_app_kafka()


4. Spark SQL and Data Source Integration:
   a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).
   b) Perform SQL operations on the data stored in the database using Spark SQL.
   c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.


In [None]:
# a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).

from pyspark.sql import SparkSession

def connect_spark_with_database(
    db_url="jdbc:postgresql://your_postgresql_host:your_port/your_database",
    table_name="your_table_name",
    db_properties=None,
):
    """Read a relational database table over JDBC and display it.

    With no arguments the call uses the original placeholder connection
    details, which must be replaced before it can succeed. The JDBC driver
    jar for the target database has to be on Spark's classpath.

    Args:
        db_url: JDBC connection URL.
        table_name: Name of the table to load.
        db_properties: JDBC connection properties ("user", "password",
            "driver"). Defaults to PostgreSQL placeholder credentials.
    """
    if db_properties is None:
        # Replace the following with your database credentials
        db_properties = {
            "user": "your_username",
            "password": "your_password",
            # For MySQL use "com.mysql.cj.jdbc.Driver" (Connector/J 8+);
            # "com.mysql.jdbc.Driver" is the legacy class name.
            "driver": "org.postgresql.Driver",
        }

    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkWithDatabase") \
        .getOrCreate()

    try:
        # Load data from the database table into a DataFrame
        df = spark.read.jdbc(url=db_url, table=table_name, properties=db_properties)

        # Perform DataFrame operations or analysis as needed
        # For example, display the DataFrame content
        df.show()
    finally:
        # Stop the SparkSession even if the connection or read fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    connect_spark_with_database()

    
# b) Perform SQL operations on the data stored in the database using Spark SQL.

from pyspark.sql import SparkSession

def perform_sql_operations(
    db_url="jdbc:postgresql://your_postgresql_host:your_port/your_database",
    table_name="your_table_name",
    db_properties=None,
):
    """Load a JDBC table and run three example Spark SQL queries on it.

    The table is registered as the temporary view ``data_table``; two of the
    queries read its 'age' column. With no arguments the call uses the
    original placeholder connection details, which must be replaced before it
    can succeed.

    Args:
        db_url: JDBC connection URL.
        table_name: Name of the table to query.
        db_properties: JDBC connection properties ("user", "password",
            "driver"). Defaults to PostgreSQL placeholder credentials.
    """
    if db_properties is None:
        # Replace the following with your database credentials
        db_properties = {
            "user": "your_username",
            "password": "your_password",
            # For MySQL use "com.mysql.cj.jdbc.Driver" (Connector/J 8+);
            # "com.mysql.jdbc.Driver" is the legacy class name.
            "driver": "org.postgresql.Driver",
        }

    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkSQLOperations") \
        .getOrCreate()

    try:
        # Load data from the database table into a DataFrame
        df = spark.read.jdbc(url=db_url, table=table_name, properties=db_properties)

        # Register the DataFrame as a temporary SQL table
        df.createOrReplaceTempView("data_table")

        # Perform Spark SQL operations on the data
        # Example 1: Select all rows from the table
        result1 = spark.sql("SELECT * FROM data_table")

        # Example 2: Filter data using SQL WHERE clause
        result2 = spark.sql("SELECT * FROM data_table WHERE age > 30")

        # Example 3: Calculate the average age
        result3 = spark.sql("SELECT AVG(age) AS average_age FROM data_table")

        # Print the query results
        print("Query Result 1:")
        result1.show()

        print("Query Result 2:")
        result2.show()

        print("Query Result 3:")
        result3.show()
    finally:
        # Stop the SparkSession even if the connection or a query fails
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    perform_sql_operations()
    
# c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.

from pyspark.sql import SparkSession

def read_data_from_s3(
    s3_path,
    access_key="your_aws_access_key",
    secret_key="your_aws_secret_key",
):
    """Read a CSV file from Amazon S3 through the s3a connector and display it.

    Args:
        s3_path: ``s3a://`` URI of a CSV file with a header row.
        access_key: AWS access key ID passed to the s3a connector. The
            default is a placeholder; pass a real value instead of editing
            it into the source.
        secret_key: AWS secret access key, handled the same way.
    """
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("SparkWithS3") \
        .config("spark.hadoop.fs.s3a.access.key", access_key) \
        .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .getOrCreate()

    try:
        # Read data from Amazon S3 into a DataFrame
        df = spark.read.csv(s3_path, header=True, inferSchema=True)

        # Perform DataFrame operations or analysis as needed
        df.show()
    finally:
        # Stop the SparkSession even if the read fails (bad path, bad credentials)
        spark.stop()

if __name__ == "__main__":
    # Run the demo only when this file is executed directly, not on import.
    # Replace 'your_s3_file_path' with the actual S3 path to your CSV file
    s3_file_path = 's3a://your_bucket_name/your_s3_file_path'
    read_data_from_s3(s3_file_path)