# Configuration

In [None]:
from pyspark.sql import SparkSession

from sparkenforce import Dataset, DatasetValidationError, validate

# Get (or create) the Spark session used by every example below; the
# "local[1]" master keeps everything on this machine.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("test")
    .getOrCreate()
)

In [2]:
# Column names, listed in the same order as the fields of each record in `data`.
columns = ["firstname", "middlename", "lastname", "dob", "salary"]

# Sample person records: (firstname, middlename, lastname, dob, salary).
# Some name fields are empty strings and `dob` is kept as a plain string.
data = [
    ("James", "", "Smith", "1991-04-01", 3000),
    ("Michael", "Rose", "", "2000-05-19", 4000),
    ("Robert", "", "Williams", "1978-09-05", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", -1),
]

In [3]:
# Build the demo DataFrame. Only column names are given as the schema, so the
# column types come from the Python values (strings and a long salary).
df = spark.createDataFrame(data, schema=columns)

# Print the resulting schema first, then the rows themselves.
df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- salary: long (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+---------+----------+--------+----------+------+
|firstname|middlename|lastname|       dob|salary|
+---------+----------+--------+----------+------+
|    James|          |   Smith|1991-04-01|  3000|
|  Michael|      Rose|        |2000-05-19|  4000|
|   Robert|          |Williams|1978-09-05|  4000|
|    Maria|      Anne|   Jones|1967-12-01|  4000|
|      Jen|      Mary|   Brown|1980-02-17|    -1|
+---------+----------+--------+----------+------+



                                                                                

# Return Value Validation Demonstration

This new functionality lets `@validate` check that the value a function returns matches the `Dataset` type annotation declared as its return type.


In [None]:
# Example 1: Successful return value validation
from pyspark.sql import functions as fn


@validate
def transform_data(
    df: Dataset["firstname":str, ...],
) -> Dataset["name":str, "length":int]:
    """Project each first name together with its length.

    Both the argument and the returned frame are checked by ``@validate``.

    Args:
        df: Frame with a string ``firstname`` column; the trailing ``...``
            in the annotation leaves room for other columns.

    Returns:
        A frame with two columns: ``name`` (the first name) and ``length``
        (the result of ``fn.length`` on the first name).
    """
    name_col = df.firstname.alias("name")
    length_col = fn.length(df.firstname).alias("length")
    return df.select(name_col, length_col)


# Happy path: `df` has a string `firstname` column and the function returns
# the annotated `name`/`length` columns, so no validation error is expected.
try:
    result = transform_data(df)
    print("✅ Validation successful!")
    result.show()
except DatasetValidationError as exc:
    print(f"❌ Error: {exc}")

✅ Validation successful!
+-------+------+
|   name|length|
+-------+------+
|  James|     5|
|Michael|     7|
| Robert|     6|
|  Maria|     5|
|    Jen|     3|
+-------+------+



                                                                                

In [5]:
# Example 2: Error due to incorrect schema in return value
@validate
def incorrect_return_schema(
    df: Dataset["firstname":str, ...],
) -> Dataset["name":str, "length":int]:
    """Deliberately return columns that contradict the return annotation.

    The annotation promises ``name`` and ``length``, but the body selects
    ``firstname`` and ``lastname`` so that return validation fails.

    Args:
        df: Frame with a string ``firstname`` column (other columns allowed).

    Returns:
        A frame with the ``firstname`` and ``lastname`` columns of ``df``.
    """
    # Intentionally not the annotated columns.
    wrong_columns = ("firstname", "lastname")
    return df.select(*wrong_columns)


# The function returns firstname/lastname instead of the annotated
# name/length columns, so this call is expected to raise.
try:
    result = incorrect_return_schema(df)
    print("❌ Should not reach here")
except DatasetValidationError as exc:
    print("✅ Expected error in return validation:")
    print(exc)

✅ Expected error in return validation:
return value columns mismatch. Expected exactly {'length', 'name'}, got {'firstname', 'lastname'}. missing columns: {'name', 'length'}, unexpected columns: {'firstname', 'lastname'}


In [6]:
# Example 3: Error due to returning incorrect type (not DataFrame)
@validate
def non_dataframe_return(df: Dataset["firstname":str, ...]) -> Dataset["result":str]:
    """Deliberately return a plain string where a Dataset is annotated.

    Args:
        df: Frame with a string ``firstname`` column (other columns allowed).

    Returns:
        The literal string ``"Not a DataFrame"``, which does not satisfy the
        ``Dataset`` return annotation.
    """
    not_a_frame = "Not a DataFrame"
    return not_a_frame


# The function hands back a str although a Dataset is annotated, so this call
# is expected to raise.
try:
    result = non_dataframe_return(df)
    print("❌ Should not reach here")
except DatasetValidationError as exc:
    print("✅ Expected error - not DataFrame:")
    print(exc)

✅ Expected error - not DataFrame:
return value must be a PySpark DataFrame, got <class 'str'>


In [7]:
# Example 4: Backward compatibility - no return annotation
@validate
def no_return_annotation(df: Dataset["firstname":str, ...]):
    """Return an arbitrary value from a function with no return annotation.

    With nothing annotated as the return type, the returned value is not
    validated.

    Args:
        df: Frame with a string ``firstname`` column (other columns allowed).

    Returns:
        The literal string ``"Can return anything"``.
    """
    anything = "Can return anything"
    return anything


@validate
def explicit_none_return(df: Dataset["firstname":str, ...]) -> None:
    """Return a value from a function annotated ``-> None``.

    A ``None`` return annotation is not validated, so the integer comes back
    to the caller unchanged.

    Args:
        df: Frame with a string ``firstname`` column (other columns allowed).

    Returns:
        The integer ``42``, despite the ``None`` annotation.
    """
    unchecked_value = 42
    return unchecked_value


# Neither function has a Dataset return annotation, so both calls should
# return their values untouched; any exception here is unexpected.
try:
    result1 = no_return_annotation(df)
    result2 = explicit_none_return(df)
    print(f"✅ No annotation: {result1}")
    print(f"✅ With explicit None: {result2}")
except Exception as exc:
    print(f"❌ Unexpected error: {exc}")

✅ No annotation: Can return anything
✅ With explicit None: 42


In [None]:
# Example 5: Validation with ellipsis (minimum columns)
@validate
def ellipsis_return_example(
    df: Dataset["firstname":str, ...],
) -> Dataset["firstname":str, "summary":str, ...]:
    """Return the annotated columns plus extras permitted by the ellipsis.

    Args:
        df: Frame with a string ``firstname`` column (other columns allowed).

    Returns:
        A frame with ``firstname``, a constant ``summary`` column holding
        ``"processed"``, and the additional ``lastname`` and ``salary``
        columns, which the trailing ``...`` in the return annotation allows.
    """
    summary_col = fn.lit("processed").alias("summary")
    # "lastname" and "salary" go beyond the annotated columns; the ellipsis
    # in the return annotation is what makes them acceptable.
    return df.select("firstname", summary_col, "lastname", "salary")


# The returned frame has the annotated columns plus extras that the ellipsis
# allows, so validation should pass; only three rows are displayed.
try:
    result = ellipsis_return_example(df)
    print("✅ Ellipsis validation successful!")
    result.select("firstname", "summary", "lastname").show(3)
except DatasetValidationError as exc:
    print(f"❌ Error: {exc}")

✅ Ellipsis validation successful!
+---------+---------+--------+
|firstname|  summary|lastname|
+---------+---------+--------+
|    James|processed|   Smith|
|  Michael|processed|        |
|   Robert|processed|Williams|
+---------+---------+--------+
only showing top 3 rows


                                                                                