%md
# CI/CD Test Data Generator  
This notebook generates sample employee data for testing ETL and CI/CD pipelines in Databricks.


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import SparkSession

# Reuse the active session when there is one (Databricks normally provides it),
# otherwise start a new session named "TestFakeData".
spark = (
    SparkSession.builder
    .appName("TestFakeData")
    .getOrCreate()
)

# Sample employee rows, in column order: id, name, age, department, salary
data = [
    (1, "Alice", 29, "Engineering", 85000.0),
    (2, "Bob", 35, "Marketing", 65000.0),
    (3, "Charlie", 40, "Finance", 70000.0),
    (4, "Diana", 23, "Engineering", 90000.0),
    (5, "Evan", 31, "HR", 60000.0),
]

# Column layout for the rows above; every column is declared nullable
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("department", StringType(), True)
    .add("salary", FloatType(), True)
)

# Build the DataFrame and print it
df = spark.createDataFrame(data, schema=schema)
df.show()

# Expose the DataFrame to Spark SQL under the name "employees"
df.createOrReplaceTempView("employees")

# Per-department average salary and headcount, used as the CI/CD smoke query
result = spark.sql(
    """
    SELECT department, AVG(salary) AS avg_salary, COUNT(*) AS employee_count
    FROM employees
    GROUP BY department
"""
)

result.show()


Adding a validation test to the notebook: the cell below re-runs the department aggregation and asserts the expected results.

In [0]:
from pyspark.sql.functions import col

# Define test function to validate avg salary per department results
def test_avg_salary(df):
    # Expected average salary for Engineering
    eng_avg_salary = df.filter(col("department") == "Engineering").select("avg_salary").collect()[0][0]
    assert abs(eng_avg_salary - 87500.0) < 0.01, f"Engineering avg salary test failed: {eng_avg_salary}"

    # Expected employee count for HR
    hr_count = df.filter(col("department") == "HR").select("employee_count").collect()[0][0]
    assert hr_count == 1, f"HR employee count test failed: {hr_count}"

    print("All tests passed!")

# Recompute the per-department summary from the "employees" temp view
result = spark.sql(
    """
    SELECT department, AVG(salary) AS avg_salary, COUNT(*) AS employee_count
    FROM employees
    GROUP BY department
"""
)

# Print the summary so the checked values are visible in the cell output
result.show()

# Assert the summary against the expected figures
test_avg_salary(result)
