# **SQL Count Alert Demo in Databricks**

1. Create Source and Target Tables

In [0]:
%sql
-- Start from a clean slate: remove any tables left over from an earlier run of the demo.
DROP TABLE IF EXISTS demo_source;
DROP TABLE IF EXISTS demo_target;

-- Source table, seeded with two rows.
CREATE OR REPLACE TABLE demo_source (id INT, name STRING, value INT);

INSERT OVERWRITE demo_source
VALUES
  (1, 'A', 100),
  (2, 'B', 200);

-- Target table with the same columns, loaded as a copy of the source so both counts start equal.
CREATE OR REPLACE TABLE demo_target (id INT, name STRING, value INT);

INSERT OVERWRITE demo_target
SELECT id, name, value
FROM demo_source;


In [0]:
%sql
-- Per-column non-null row counts in source vs. target.
-- mismatch_flag is 1 when the two counts differ for a column, otherwise 0.
WITH column_counts AS (
  SELECT
    'id' AS column_name,
    (SELECT COUNT(id) FROM demo_source) AS source_count,
    (SELECT COUNT(id) FROM demo_target) AS target_count
  UNION ALL
  SELECT
    'name',
    (SELECT COUNT(name) FROM demo_source),
    (SELECT COUNT(name) FROM demo_target)
  UNION ALL
  SELECT
    'value',
    (SELECT COUNT(value) FROM demo_source),
    (SELECT COUNT(value) FROM demo_target)
)
SELECT
  column_name,
  source_count,
  target_count,
  CASE WHEN source_count <> target_count THEN 1 ELSE 0 END AS mismatch_flag
FROM column_counts

In [0]:
pip install faker

2. Create and Parameterize Pipeline Notebook with Conditional Data Load

In [0]:
from faker import Faker
import random
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


# Declare the widget with a default before reading it, so the notebook also runs
# when no "run_count" parameter has been supplied (e.g. a fresh interactive session).
# A value set through the widget or passed in as a job parameter is what get() returns.
dbutils.widgets.text("run_count", "1")
run_count = int(dbutils.widgets.get("run_count"))


# Shared Faker instance used to produce random first names.
fake = Faker()


# Schema of the generated rows; all three columns are declared non-nullable.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("value", IntegerType(), False)
)


def generate_fake_data(num_rows=2):
    """Build a Spark DataFrame holding ``num_rows`` randomly generated demo rows.

    Each row is ``(id, name, value)``: ids run from 1 to ``num_rows``, names come
    from the module-level ``fake`` generator, and values are random integers
    between 50 and 300 (inclusive). The DataFrame is created through ``spark``
    using the module-level ``schema``.
    """
    rows = [
        (row_id, fake.first_name(), random.randint(50, 300))
        for row_id in range(1, num_rows + 1)
    ]
    return spark.createDataFrame(rows, schema=schema)


# Generate a fresh batch of fake rows for this run.
source_df = generate_fake_data()

# The source table receives the batch on every run.
source_df.write.mode("append").saveAsTable("demo_source")
display(source_df)


# Runs that also load the target; run 3 is left out on purpose to simulate a mismatch.
TARGET_LOAD_RUNS = (1, 2, 4)

if run_count in TARGET_LOAD_RUNS:
    source_df.write.mode("append").saveAsTable("demo_target")
    # Shown again so the cell output makes it visible that the target was loaded.
    display(source_df)
elif run_count == 3:
    print("Run 3: target load skipped to simulate a source/target count mismatch.")
else:
    # Values outside 1-4 do not load the target either; report it rather than skipping silently.
    print(f"run_count={run_count} is outside the planned runs 1-4; target load skipped.")


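3. Source-to-Target Count Query

Run this after the pipeline cell: `mismatch_flag` is 1 for any column whose source and target counts differ, and 0 otherwise.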

In [0]:
%sql
-- Per-column non-null row counts in source vs. target.
-- mismatch_flag is 1 when the two counts differ for a column, otherwise 0.
WITH column_counts AS (
  SELECT
    'id' AS column_name,
    (SELECT COUNT(id) FROM demo_source) AS source_count,
    (SELECT COUNT(id) FROM demo_target) AS target_count
  UNION ALL
  SELECT
    'name',
    (SELECT COUNT(name) FROM demo_source),
    (SELECT COUNT(name) FROM demo_target)
  UNION ALL
  SELECT
    'value',
    (SELECT COUNT(value) FROM demo_source),
    (SELECT COUNT(value) FROM demo_target)
)
SELECT
  column_name,
  source_count,
  target_count,
  CASE WHEN source_count <> target_count THEN 1 ELSE 0 END AS mismatch_flag
FROM column_counts
