### Task 1: Handling Schema Mismatches using Spark
**Description**: Use Apache Spark to address schema mismatches by transforming data to match
the expected schema.

**Steps**:
1. Create Spark session
2. Load dataframe
3. Define the expected schema
4. Handle schema mismatches
5. Show corrected data

In [None]:
# Write your code from here

### Task 2: Detect and Correct Incomplete Data in ETL
**Description**: Use Python and Pandas to detect incomplete data in an ETL process and fill
missing values with estimates.

**Steps**:
1. Detect incomplete data
2. Fill missing values
3. Report changes

In [None]:
# Write your code from here

In [1]:
# This script demonstrates how to handle schema mismatches using Apache Spark.

# To run this script, you need PySpark and a Java runtime (JDK) available on the machine.
# If PySpark is missing (ModuleNotFoundError: No module named 'pyspark'), install it with:
#     pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, lit

# Task 1: Handling Schema Mismatches using Spark
# Description: Use Apache Spark to address schema mismatches by transforming data
#              to match the expected schema.

# Step 1: Create Spark session
# This is the entry point for using Spark functionality.
# Bind the name before the try so the cleanup in `finally` can tell whether
# this run actually obtained a session (no need to inspect locals(), which
# could also pick up a stale `spark` left over from an earlier notebook cell).
spark = None
try:
    spark = SparkSession.builder \
        .appName("SchemaMismatchHandler") \
        .getOrCreate()
    # The casting strategy below relies on invalid values becoming null.
    # With ANSI mode enabled (the default from Spark 4.0 on) an invalid cast
    # raises instead, so turn it off explicitly for this session.
    spark.conf.set("spark.sql.ansi.enabled", "false")
    print("Step 1: Spark Session created successfully.")

    # Step 2: Load dataframe (simulated)
    # Build a small DataFrame whose 'age' and 'salary' columns arrive as
    # strings, including values that cannot be converted to numbers.
    data = [
        (1, "Alice", "30", "New York", "50000.0"),
        (2, "Bob", "25", "Los Angeles", "60000"),
        (3, "Charlie", "thirty", "Chicago", "75000.50"),  # 'thirty' is a mismatch for IntegerType
        (4, "David", "35", "Houston", "80000"),
        (5, "Eve", "28", "Miami", "invalid_salary"),  # 'invalid_salary' is a mismatch for DoubleType
    ]
    # Preliminary schema used only to get the raw data into Spark.
    raw_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", StringType(), True),  # Intentionally StringType to simulate mismatch
        StructField("city", StringType(), True),
        StructField("salary", StringType(), True),  # Intentionally StringType to simulate mismatch
    ])
    df = spark.createDataFrame(data, schema=raw_schema)
    print("\nStep 2: Original DataFrame loaded.")
    df.printSchema()
    df.show()

    # Step 3: Define the expected schema
    # This is the target schema the data must conform to.
    expected_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("city", StringType(), True),
        StructField("salary", DoubleType(), True),
    ])
    print("\nStep 3: Expected Schema defined.")
    print(expected_schema)

    # Step 4: Handle schema mismatches
    # Walk the expected schema: cast every column that exists to its target
    # type and add any missing column as typed nulls.
    corrected_df = df
    for field in expected_schema.fields:
        column_name = field.name
        expected_type = field.dataType

        if column_name not in corrected_df.columns:
            # Column is in the expected schema but absent from the source data.
            print(f"Warning: Column '{column_name}' missing in source data. Adding with nulls.")
            corrected_df = corrected_df.withColumn(column_name, lit(None).cast(expected_type))
            continue

        cast_column = col(column_name).cast(expected_type)

        # A value that was present before the cast but is null after it could
        # not be converted. Report the count so the data loss is not silent.
        failed_casts = corrected_df.filter(
            col(column_name).isNotNull() & cast_column.isNull()
        ).count()
        if failed_casts:
            print(
                f"Warning: {failed_casts} value(s) in column '{column_name}' could not be "
                f"cast to {expected_type.simpleString()} and were set to null."
            )

        corrected_df = corrected_df.withColumn(column_name, cast_column)

    # Step 5: Show corrected data
    print("\nStep 4 & 5: Schema mismatches handled and corrected data shown.")
    print("\nCorrected DataFrame Schema:")
    corrected_df.printSchema()
    print("\nCorrected DataFrame Data:")
    corrected_df.show()

except Exception as e:
    # Top-level boundary of the demo script: report the failure instead of
    # letting the traceback end the notebook cell.
    print(f"An error occurred during Spark schema mismatch handling: {e}")
finally:
    # Stop the Spark session only if this run managed to obtain one.
    if spark is not None:
        spark.stop()
        print("\nSpark Session stopped.")



ModuleNotFoundError: No module named 'pyspark'