# Validating Snowflake Tables Using PySpark and Great Expectations

**Example validations:**
  - Ensure columns are present in a table
  - Ensure values in a column are within a specified range or min/max, etc
  - Ensure that a table column has at least one value from a set of values
  - many more validations planned

In [1]:
import configparser
import os
from pathlib import Path
from typing import Iterable, Iterator

import pyspark
from great_expectations.dataset import SparkDFDataset
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

## Custom helper functions to create pyspark session and create pyspark dataframe from Snowflake query

**NOTE:** These examples assume that Snowflake credentials are stored in a `config.ini` text file whose location is specified in the `CONFIG_PATH` environment variable (on Windows: `set CONFIG_PATH=path/to/config.ini`; on Linux/macOS: `export CONFIG_PATH=path/to/config.ini`). Furthermore, it is assumed that you have access to a PySpark environment that is capable of connecting to your Snowflake environment, i.e. one that has the proper JDBC driver and Spark connector jar files available; the paths to those jar files are also read from the `config.ini` file.

In [2]:
def get_spark_snowflake_session():
    """Create (or reuse) a local Spark session with the Snowflake jars on its classpath.

    The path of an INI file is taken from the ``CONFIG_PATH`` environment
    variable; the ``jdbc_driver_path`` and ``spark_driver_path`` keys of its
    ``[snowflake]`` section are passed to Spark as ``spark.jars``.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.

    Raises:
        FileNotFoundError: ``CONFIG_PATH`` is unset/empty, or the file it
            points to could not be read.
        KeyError: the ``[snowflake]`` section or one of the two keys is missing.
    """
    config_file = os.getenv("CONFIG_PATH")

    config = configparser.ConfigParser()
    # ConfigParser.read() does not raise for a file it cannot open; it skips
    # the file and returns the list of files it actually parsed. An empty
    # result is therefore the only "file not found" signal available.
    if not config_file or not config.read(config_file):
        raise FileNotFoundError(
            f"config.ini file not found (CONFIG_PATH={config_file!r})"
        )

    snowflake = config['snowflake']
    sf_jdbc_driver = snowflake['jdbc_driver_path']
    sf_connector = snowflake['spark_driver_path']

    return (
        SparkSession.builder.master("local[*]")
        .appName("Snowflake_JDBC")
        # spark.jars takes a comma-separated list of jar paths.
        .config("spark.jars", f"{sf_jdbc_driver},{sf_connector}")
        .getOrCreate()
    )

In [3]:
def get_spark_snowflake_dataframe_from_sql(session: SparkSession, schema: str, sql: str) -> DataFrame:
    """Run ``sql`` against Snowflake through ``session`` and return the result.

    Connection settings (``account``, ``username``, ``database``, ``role``,
    ``warehouse``, ``authenticator``) are read from the ``[snowflake]`` section
    of the INI file named by the ``CONFIG_PATH`` environment variable.

    Args:
        session: Spark session whose ``read`` interface is used for the query.
        schema: Value sent as the ``sfSchema`` connection option.
        sql: Query text sent as the ``query`` option.

    Returns:
        Whatever ``session.read...load()`` returns for the query.

    Raises:
        FileNotFoundError: ``CONFIG_PATH`` is unset/empty, or the file it
            points to could not be read.
        KeyError: the ``[snowflake]`` section or a required key is missing.
    """
    config_file = os.getenv("CONFIG_PATH")

    config = configparser.ConfigParser()
    # ConfigParser.read() does not raise for a file it cannot open; it returns
    # the list of files it parsed, so an empty result means nothing was loaded.
    if not config_file or not config.read(config_file):
        raise FileNotFoundError(
            f"config.ini file not found (CONFIG_PATH={config_file!r})"
        )

    snowflake = config['snowflake']
    sf_account = snowflake['account']
    sf_user = snowflake['username']
    sf_database = snowflake['database']
    sf_role = snowflake['role']
    sf_warehouse = snowflake['warehouse']
    sf_authenticator = snowflake['authenticator']

    # Snowflake connection parameters. The schema comes from the `schema`
    # argument rather than from the config file.
    sfparams = {
        "sfURL": f"{sf_account}.snowflakecomputing.com",
        "sfUser": sf_user,
        "sfPassword": "your_password",  # Not applicable when using externalbrowser authenticator
        "sfDatabase": sf_database,
        "sfSchema": schema,
        "sfRole": sf_role,
        "sfWarehouse": sf_warehouse,
        "sfAuthenticator": sf_authenticator,
    }

    # Read through the session that was passed in, not through a global.
    return (
        session.read.format('net.snowflake.spark.snowflake')
        .options(**sfparams)
        .option("query", sql)
        .load()
    )

## Custom tests to validate NHTSA tables

In [4]:
def test_mandatory_columns_existence(df: SparkDFDataset, columns: Iterable[str]) -> None:
    """Report, per column name, whether that column exists in ``df``.

    Args:
        df: Dataset exposing ``expect_column_to_exist``.
        columns: Column names to look for; each one is checked and reported
            independently, so one missing column does not stop the rest.

    Prints one ``PASSED`` or ``ERROR`` line per column; returns nothing.
    """
    for column in columns:
        # Branch on the expectation result explicitly instead of using
        # `assert`: assert statements are removed when Python runs with -O,
        # which would make every column report PASSED.
        if df.expect_column_to_exist(column).success:
            print(f"Column '{column}' exists : PASSED")
        else:
            print(f"ERROR: Mandatory column '{column}' does not exist")

In [5]:
def test_year_min_max(df: SparkDFDataset, year_column: str, start_year: int, end_year: int) -> None:
    """Report whether the values of ``year_column`` lie between two years.

    Args:
        df: Dataset exposing ``expect_column_values_to_be_between``.
        year_column: Name of the column to check.
        start_year: Lower bound passed to the expectation.
        end_year: Upper bound passed to the expectation.

    Prints a single ``PASSED`` or ``ERROR`` line; returns nothing.
    """
    result = df.expect_column_values_to_be_between(year_column, start_year, end_year)
    # Explicit branch rather than `assert`, which is stripped under `python -O`
    # and would turn this check into an unconditional PASSED.
    if result.success:
        print("PASSED min/max YEAR test")
    else:
        print("ERROR: Failed min/max YEAR test")

In [6]:
def test_make_names(df: SparkDFDataset, make_name_column: str, make_names: Iterable[str] = ()) -> None:
    """Report whether the values of ``make_name_column`` belong to ``make_names``.

    Args:
        df: Dataset exposing ``expect_column_values_to_be_in_set``.
        make_name_column: Name of the column to check.
        make_names: Allowed values, forwarded as the expectation's
            ``value_set``. Defaults to an empty collection so the parameter
            stays optional; previously the default was, by mistake, the
            ``typing.Iterator`` class itself (``=`` written instead of ``:``).

    Prints a single ``PASSED`` or ``ERROR`` line; returns nothing.
    """
    result = df.expect_column_values_to_be_in_set(make_name_column, value_set=make_names)
    # Explicit branch rather than `assert`, which is stripped under `python -O`
    # and would turn this check into an unconditional PASSED.
    if result.success:
        print("PASSED make names test")
    else:
        print("ERROR: Failed make names test")

## `main()` routine

In [7]:
if __name__ == "__main__":
    # Columns that must be present in the query result.
    MANDATORY_COLUMNS = [
        'MAKE_ID',
        'MAKE_NAME',
        'MODEL_ID',
        'MODEL_NAME',
        'VEHICLETYPEID',
        'VEHICLETYPENAME',
        'YEAR',
        'CREATED_DATE',
    ]

    # Value set handed to the MAKE_NAME expectation.
    MAKE_NAMES = [
        'ACURA',
        'ALFA ROMEO',
        'ASTON MARTIN',
        'AUDI',
        'BENTLEY',
        'BMW',
        'BUGATTI',
        'BUICK',
        'CADILLAC',
        'CHEVROLET',
        'CHRYSLER',
        'DODGE',
        'FERRARI',
        'FIAT',
        'FORD',
        'GENESIS',
        'GMC',
        'HONDA',
        'HUMMER',
        'HYUNDAI',
        'INFINITI',
        'ISUZU',
        'JAGUAR',
        'JEEP',
        'KIA',
        'LAMBORGHINI',
        'LANCIA',
        'LAND ROVER',
        'LEXUS',
        'LINCOLN',
        'LOTUS',
        'LUCID',
        'MASERATI',
        'MAZDA',
        'MCLAREN',
        'MERCEDES-BENZ',
        'MERCURY',
        'MINI',
        'MITSUBISHI',
        'NISSAN',
        'OPEL',
        'PLYMOUTH',
        'POLESTAR',
        'PONTIAC',
        'PORSCHE',
        'RAM',
        'RIVIAN',
        'ROLLS ROYCE',
        'SAAB',
        'SATURN',
        'SMART',
        'SUBARU',
        'SUZUKI',
        'TESLA',
        'TOYOTA',
        'VOLKSWAGEN',
        'VOLVO',
    ]

    sql = """
        SELECT * from nhtsa.model_names where vehicletypename != 'Motorcycle'
    """

    # This block runs at module scope, so `spark` is a module-level name.
    spark = get_spark_snowflake_session()
    try:
        sdf = get_spark_snowflake_dataframe_from_sql(session=spark, schema='nhtsa', sql=sql)

        test_df = SparkDFDataset(sdf)

        #  Run Great Expectations test suite
        print("###########################################################################")
        print("Performing validation tests...")
        test_mandatory_columns_existence(test_df, columns=MANDATORY_COLUMNS)
        test_year_min_max(test_df, year_column='YEAR', start_year=2009, end_year=2023)
        test_make_names(test_df, make_name_column='MAKE_NAME', make_names=MAKE_NAMES)

        print("Finished validation tests")
    finally:
        # Stop the session even when loading or validation raises, so a
        # failed run does not leave the Spark session open.
        spark.stop()

23/06/27 08:56:17 WARN Utils: Your hostname, VA-rveOJ44nPxI1 resolves to a loopback address: 127.0.1.1; using 192.168.56.1 instead (on interface eth1)
23/06/27 08:56:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/06/27 08:56:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
###########################################################################
Performing validation tests...
Column 'MAKE_ID' exists : PASSED
Column 'MAKE_NAME' exists : PASSED
Column 'MODEL_ID' exists : PASSED
Column 'MODEL_NAME' exists : PASSED
Column 'VEHICLETYPEID' exists : PASSED
Column 'VEHICLETYPENAME' exists : PASSED
Column 'YEAR' exists : PASSED
Column 'CREATED_DATE' exists : PASSED


                                                                                

PASSED min/max YEAR test
PASSED make names test
Finished validation tests


23/06/27 08:56:50 WARN SparkConnectorContext$: Finish cancelling all queries for local-1687870579371
