# 1. Working with RDDs:
   a) Write a Python program to create an RDD from a local data source.
   b) Implement transformations and actions on the RDD to perform data processing tasks.
   c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.


In [None]:
from pyspark import SparkContext

def create_rdd_from_local_data():
    """Create an RDD from a small in-memory Python list.

    Returns:
        The RDD produced by ``SparkContext.parallelize`` for the list
        ``[1, 2, 3, 4, 5]``.
    """
    # Imported locally so this function does not rely on extra file-level imports.
    from pyspark import SparkConf

    # Reuse the active SparkContext when one already exists. Constructing a
    # second SparkContext in the same process raises
    # "ValueError: Cannot run multiple SparkContexts at once", which broke a
    # second call to this function or a re-run of the notebook cell.
    conf = SparkConf().setAppName("RDDExample")
    sc = SparkContext.getOrCreate(conf)

    # Create RDD from local data source
    data = [1, 2, 3, 4, 5]
    return sc.parallelize(data)

def process_rdd(rdd):
    """Double every element, keep the even results and return their sum.

    Args:
        rdd: An object exposing ``map``, ``filter`` and ``reduce`` (a Spark
            RDD of numbers in this file).

    Returns:
        The value produced by reducing the surviving elements with addition.
    """

    def double(value):
        # Map step: multiply each element by 2
        return value * 2

    def is_even(value):
        # Filter step: keep only even numbers
        return value % 2 == 0

    def add(left, right):
        # Reduce step: pairwise addition yields the overall sum
        return left + right

    # NOTE(review): reduce has no initial value here; behaviour on an empty
    # RDD is left to the RDD implementation -- confirm if empty input matters.
    return rdd.map(double).filter(is_even).reduce(add)

# Build the example RDD (this call also obtains the SparkContext it runs on)
rdd = create_rdd_from_local_data()

# Run the map -> filter -> reduce pipeline and keep the resulting sum
result = process_rdd(rdd)

# Report the computed sum on standard output
print("Result:", result)


# 2. Spark DataFrame Operations:
   a) Write a Python program to load a CSV file into a Spark DataFrame.
   b) Perform common DataFrame operations such as filtering, grouping, or joining.
   c) Apply Spark SQL queries on the DataFrame to extract insights from the data.


In [None]:
from pyspark.sql import SparkSession

def load_csv_into_dataframe(path="path_to_csv_file.csv"):
    """Load a CSV file into a Spark DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to the placeholder
            path the original example used, so existing zero-argument calls
            behave exactly as before.

    Returns:
        The DataFrame read from ``path``, with the first row used as the
        header and column types inferred.
    """
    # Create (or reuse) the SparkSession for this example
    spark = SparkSession.builder.appName("DataFrameExample").getOrCreate()

    # Load CSV file into DataFrame
    return spark.read.csv(path, header=True, inferSchema=True)

def perform_dataframe_operations(df):
    """Run a filter, a group-by aggregation and a join on ``df``.

    Args:
        df: DataFrame that must expose the columns used below: ``age``,
            ``gender``, ``salary`` and ``id``.

    Returns:
        A tuple ``(filtered_df, grouped_df, joined_df)``.
    """
    # The original body referenced a name `spark` that was never defined in
    # this scope (it is local to load_csv_into_dataframe), so the join step
    # raised NameError. Fetch the existing session explicitly instead.
    spark = SparkSession.builder.getOrCreate()

    # Filter operation: Keep rows where age is greater than 30
    filtered_df = df.filter(df.age > 30)

    # GroupBy operation: Group by gender and calculate average salary
    grouped_df = df.groupBy("gender").avg("salary")

    # Join operation: Join with another DataFrame on a common column
    other_df = spark.read.csv("other_csv_file.csv", header=True, inferSchema=True)
    joined_df = df.join(other_df, on="id", how="inner")

    return filtered_df, grouped_df, joined_df

def apply_spark_sql_queries(df):
    """Query ``df`` through Spark SQL and return rows with ``age > 30``.

    Args:
        df: DataFrame to expose to SQL; it must have an ``age`` column.

    Returns:
        The DataFrame produced by the SQL query.
    """
    # `spark` was not defined in this function's scope in the original code,
    # which made the query line raise NameError. Obtain the existing session.
    spark = SparkSession.builder.getOrCreate()

    # Register DataFrame as a temporary view so SQL can refer to it by name
    df.createOrReplaceTempView("my_table")

    # Apply Spark SQL queries on the DataFrame
    return spark.sql("SELECT * FROM my_table WHERE age > 30")

# Read the source CSV into the base DataFrame
df = load_csv_into_dataframe()

# Derive the filtered, grouped and joined views, then the SQL query result
filtered_df, grouped_df, joined_df = perform_dataframe_operations(df)
sql_result = apply_spark_sql_queries(df)

# Display each result, in the order: filtered, grouped, joined, SQL
for _frame in (filtered_df, grouped_df, joined_df, sql_result):
    _frame.show()


# 3. Spark Streaming:
   a) Write a Python program to create a Spark Streaming application.
   b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
   c) Implement streaming transformations and actions to process and analyze the incoming data stream.


In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

def process_stream(stream):
    """Attach a word-count pipeline to ``stream`` and print its output.

    Args:
        stream: An object exposing ``flatMap`` whose elements are text lines
            (the socket DStream in this file).

    Returns:
        None. The counts are emitted through ``pprint`` on the final stream.
    """

    def split_words(line):
        # Split each line into words on single spaces
        return line.split(" ")

    def pair_with_one(word):
        # Each occurrence of a word contributes a count of 1
        return (word, 1)

    def add(left, right):
        return left + right

    tokens = stream.flatMap(split_words)
    pairs = tokens.map(pair_with_one)

    # Sum the per-occurrence counts for every distinct word
    counts = pairs.reduceByKey(add)

    # Print word counts
    counts.pprint()

# Imported here so this cell stays runnable on its own.
from pyspark import SparkConf

# Obtain the SparkContext for the streaming job. Reuse the active context if
# an earlier cell already created one: calling SparkContext(...) a second time
# in the same process raises
# "ValueError: Cannot run multiple SparkContexts at once".
# No master is set here, so it comes from the launch configuration.
# NOTE(review): when running locally, the socket receiver occupies one thread,
# so the master should provide at least two (e.g. local[2]) -- confirm setup.
conf = SparkConf().setAppName("StreamingExample")
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

# Create StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sc, 1)

# Create a DStream by connecting to a socket (localhost:9999)
stream = ssc.socketTextStream("localhost", 9999)

# Process the stream
process_stream(stream)

# Start the streaming context and block until it terminates
ssc.start()
ssc.awaitTermination()


# 4. Spark SQL and Data Source Integration:
   a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).
   b) Perform SQL operations on the data stored in the database using Spark SQL.
   c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.


In [None]:
from pyspark.sql import SparkSession

def connect_to_database():
    """Read the ``mytable`` table from a PostgreSQL database over JDBC.

    Returns:
        The DataFrame returned by ``spark.read.jdbc`` for the table.
    """
    # Create (or reuse) the SparkSession for this example
    session = SparkSession.builder.appName("DatabaseIntegration").getOrCreate()

    # Connection settings passed to the JDBC reader; the credentials are
    # placeholders to be replaced with real values.
    connection_properties = dict(
        user="your_username",
        password="your_password",
        driver="org.postgresql.Driver",
    )

    # Read data from the database table
    return session.read.jdbc(
        "jdbc:postgresql://localhost:5432/mydatabase",
        "mytable",
        properties=connection_properties,
    )

def perform_sql_operations(df):
    """Query ``df`` through Spark SQL and return rows with ``age > 30``.

    Args:
        df: DataFrame to expose to SQL; it must have an ``age`` column.

    Returns:
        The DataFrame produced by the SQL query.
    """
    # `spark` is local to connect_to_database, so the original reference to
    # it here raised NameError. Obtain the existing session explicitly.
    spark = SparkSession.builder.getOrCreate()

    # Register DataFrame as a temporary view so SQL can refer to it by name
    df.createOrReplaceTempView("my_table")

    # Perform Spark SQL queries on the DataFrame
    return spark.sql("SELECT * FROM my_table WHERE age > 30")

def explore_data_source_integration():
    """Load CSV data from an HDFS path and from an S3 path.

    The two DataFrames are created but not used further; the function is a
    template whose remaining steps are left as a placeholder, and it
    returns None.
    """
    # The original body used `spark` without defining it in this scope,
    # which raised NameError on the first read. Obtain the existing session.
    spark = SparkSession.builder.getOrCreate()

    # Load data from HDFS
    hdfs_path = "hdfs://localhost:9000/path/to/data"
    hdfs_df = spark.read.csv(hdfs_path, header=True, inferSchema=True)

    # Load data from Amazon S3
    # NOTE(review): s3a:// access presumably needs the Hadoop AWS connector
    # and credentials configured on the cluster -- confirm.
    s3_path = "s3a://your_bucket/path/to/data"
    s3_df = spark.read.csv(s3_path, header=True, inferSchema=True)

    # Perform further operations on the loaded data

    # ...

# Read the database table into a DataFrame (rebinds the module-level `df`)
df = connect_to_database()

# Query the table data through Spark SQL (rebinds the module-level `sql_result`)
sql_result = perform_sql_operations(df)

# Load the HDFS and S3 example datasets; the call's return value is not used
explore_data_source_integration()
