In [1]:
%scala
import com.databricks.labs.validation.utils.Structures._
import com.databricks.labs.validation._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}

In [2]:
%scala
/** Reference lists of allowed values used by the categorical validation rules. */
object Lookups {
  /** Store ids that are considered valid. */
  final val validStoreIDs: Array[Int] = Array(1001, 1002)

  /** Region names that are considered valid. */
  final val validRegions: Array[String] = Array(
    "Northeast",
    "Southeast",
    "Midwest",
    "Northwest",
    "Southcentral",
    "Southwest"
  )

  /** Skus that are considered valid. */
  final val validSkus: Array[Int] = Array(
    123456, 122987, 123256, 173544, 163212, 365423, 168212
  )
}

// Sample pricing data for the demo. Some rows are flawed on purpose (see the
// inline notes) so that the rules defined later have something to flag.
val df = {
  val sampleRows = Seq(
    ("Northwest", 1001, 123456, 9.32, 8.99, 4.23, "2020-02-01 00:00:00.000"),
    ("Northwest", 1001, 123256, 19.99, 16.49, 12.99, "2020-02-01"),
    ("Northwest", 1001, 123456, 0.99, 0.99, 0.10, "2020-02-01"),
    ("Northwest", 1001, 123456, 0.98, 0.90, 0.10, "2020-02-01"), // sku is not distinct
    ("Northwst", 1001, 123456, 0.99, 0.99, 0.10, "2020-02-01"), // region is misspelled
    ("Northwest", 1002, 122987, 9.99, 9.49, 6.49, "2021-02-01"), // invalid date/timestamp
    ("Northwest", 1002, 173544, 1.29, 0.99, 1.23, "2020-02-01"),
    ("Northwest", 1002, 168212, 3.29, 1.99, 1.23, "2020-02-01"),
    ("Northwest", 1002, 365423, 1.29, 0.99, 1.23, "2020-02-01"),
    ("Northwest", 1002, 3897615, 14.99, 129.99, 1.23, "2020-02-01"),
    ("Northwest", 1003, 163212, 3.29, 1.99, 1.23, "2020-02-01") // invalid numeric store_id, for the group-by test
  )
  val columnNames = Seq("region", "store_id", "sku", "retail_price", "scan_price", "cost", "create_ts")

  sc.parallelize(sampleRows)
    .toDF(columnNames: _*)
    // create_ts starts out as a string: cast it to a timestamp, then derive a date from it.
    .withColumn("create_ts", col("create_ts").cast("timestamp"))
    .withColumn("create_dt", col("create_ts").cast("date"))
}

In [3]:
%scala
// Render the sample DataFrame so the input rows can be inspected before validation.
display(df)

region,store_id,sku,retail_price,scan_price,cost,create_ts,create_dt
Northwest,1001,123456,9.32,8.99,4.23,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1001,123256,19.99,16.49,12.99,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1001,123456,0.99,0.99,0.1,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1001,123456,0.98,0.9,0.1,2020-02-01T00:00:00.000+0000,2020-02-01
Northwst,1001,123456,0.99,0.99,0.1,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1002,122987,9.99,9.49,6.49,2021-02-01T00:00:00.000+0000,2021-02-01
Northwest,1002,173544,1.29,0.99,1.23,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1002,168212,3.29,1.99,1.23,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1002,365423,1.29,0.99,1.23,2020-02-01T00:00:00.000+0000,2020-02-01
Northwest,1002,3897615,14.99,129.99,1.23,2020-02-01T00:00:00.000+0000,2020-02-01


# Build Rules

In [5]:
%scala
/**
 * Builds a column expression for the discount of `scanPrice` relative to
 * `retailPrice`, expressed as a percentage (for example 25.0 for a 25% discount).
 *
 * The result is on a 0-100 scale so it can be compared directly against
 * percentage-style thresholds such as `Bounds(upper = 90.0)`. Returning the raw
 * 0-1 fraction would make such a threshold impossible to exceed.
 *
 * @param retailPrice column holding the listed retail price
 * @param scanPrice   column holding the price charged at scan time
 * @return column evaluating to `(retailPrice - scanPrice) / retailPrice * 100`;
 *         negative when the scan price is higher than the retail price
 */
def getDiscountPercentage(retailPrice: Column, scanPrice: Column): Column = {
  // NOTE(review): a retail price of 0 makes the divisor 0; the outcome then depends on
  // Spark's division-by-zero handling (ANSI vs. non-ANSI mode) — confirm desired behavior.
  (retailPrice - scanPrice) / retailPrice * 100.0
}

// A simple rule defined on its own: retail_price bounded by 0.0 and 6.99.
// NOTE(review): this rule is not added to any RuleSet in the cells visible here — confirm that is intended.
val validateRetailPrice = Rule(
  "Retail_Price_Validation",
  col("retail_price"),
  Bounds(lower = 0.0, upper = 6.99)
)

// Rules of several different kinds can live together in one Array.
val specializedRules = Array(
  // Aggregate column: count of sku values
  Rule(
    "Reasonable_sku_counts",
    count(col("sku")),
    Bounds(20.0, 200.0)
  ),
  // Calculated column: the maximum of the expression built by getDiscountPercentage
  Rule(
    "Max_allowed_discount",
    max(getDiscountPercentage(col("retail_price"), col("scan_price"))),
    Bounds(upper = 90.0)
  ),
  // Distinct-values rule on sku
  Rule(
    "Unique_Skus",
    countDistinct(col("sku")),
    Bounds(upper = 1.0)
  )
)

In [6]:
%scala
// Min/max boundary checks tend to come in bulk, so they are declared compactly
// and expanded into rules by a generator. The generator function can be extended
// or overridden when requirements get more complex.
val minMaxPriceDefs = Array(
  MinMaxRuleDef(
    "MinMax_Sku_Price",
    col("retail_price"),
    Bounds(lower = 0.0, upper = 29.99)
  ),
  MinMaxRuleDef(
    "MinMax_Scan_Price",
    col("scan_price"),
    Bounds(lower = 0.0, upper = 29.99)
  ),
  MinMaxRuleDef(
    "MinMax_Cost",
    col("cost"),
    Bounds(lower = 0.0, upper = 12.0)
  )
)

// Expand the definitions into the array of Rules via the min/max generator.
val minMaxPriceRules = RuleSet.generateMinMaxRules(minMaxPriceDefs: _*)

In [7]:
%scala
// Categorical rules for numeric columns: each rule is given a list of allowed values to validate against.
val catNumerics = Array(
  // store_id must be one of the ids in Lookups.validStoreIDs
  Rule(
    "Valid_Stores",
    col("store_id"),
    Lookups.validStoreIDs
  ),
  // sku must be in a pre-built list of verified skus. The list is written by hand
  // for this demo, but it could just as easily be created from a DataFrame, etc.
  Rule(
    "Valid_Skus",
    col("sku"),
    Lookups.validSkus
  )
)

// Strings can be validated as well as numerics. A separate array is not required;
// it is only used here for demonstration.
val catStrings = Array(
  Rule(
    "Valid_Regions",
    col("region"),
    Lookups.validRegions
  )
)

# Validate DataFrame
A DataFrame can be validated as a whole, or a Seq of columns can be passed in as "bys" for the DataFrame to be grouped by. <br>
If the DataFrame is grouped, validations are performed per group.

In [9]:
%scala
// validate() returns two values: a report DataFrame for reviewing failures, and `passed` as a boolean.
// `passed` == true means every rule passed; otherwise at least one rule failed.

// Notice the builder pattern. The idea is to build up your rules and then add them to your RuleSet[s].
// RuleSets can also be combined using the RuleSet.add(ruleSet: RuleSet) method.
val (rulesReport, passed) = {
  val ungroupedRuleSet = RuleSet(df)
    .add(specializedRules)
    .add(minMaxPriceRules)
    .add(catNumerics)
    .add(catStrings)

  ungroupedRuleSet.validate()
}

In [10]:
%scala
// Evaluate `passed`, the boolean half of the pair destructured from validate() above.
passed

In [11]:
%scala
// Render the report DataFrame returned by validate() so failures can be reviewed.
display(rulesReport)

Rule_Name,Rule_Type,Validation_Values,Invalid_Count,Failed
MinMax_Cost_max,bounds,"List(null, null, List(0.0, 12.0), null)",1,True
MinMax_Scan_Price_max,bounds,"List(null, null, List(0.0, 29.99), null)",1,True
Reasonable_sku_counts,bounds,"List(null, null, List(20.0, 200.0), null)",1,True
Unique_Skus,bounds,"List(null, null, List(-Infinity, 1.0), null)",1,True
Valid_Regions,validStrings,"List(null, null, null, List(Northeast, Southeast, Midwest, Northwest, Southcentral, Southwest))",1,True
Valid_Skus,validNumerics,"List(null, List(123456.0, 122987.0, 123256.0, 173544.0, 163212.0, 365423.0, 168212.0), null, null)",1,True
Valid_Stores,validNumerics,"List(null, List(1001.0, 1002.0), null, null)",1,True
Max_allowed_discount,bounds,"List(null, null, List(-Infinity, 90.0), null)",0,False
MinMax_Cost_min,bounds,"List(null, null, List(0.0, 12.0), null)",0,False
MinMax_Scan_Price_min,bounds,"List(null, null, List(0.0, 29.99), null)",0,False


In [12]:
%scala
// Grouped DataFrame
// Validation can also be performed per grouping of one or many columns;
// here the grouping columns are passed as an Array of names.
val (rulesReport, passed) = {
  val groupedRuleSet = RuleSet(df, Array("store_id"))
    .add(specializedRules)
    .add(minMaxPriceRules)
    .add(catNumerics)
    .add(catStrings)

  groupedRuleSet.validate()
}

In [13]:
%scala
// Grouped validation again, this time passing the grouping column as a plain String.
val (rulesReport, passed) = {
  val groupedRuleSet = RuleSet(df, "store_id")
    .add(specializedRules)
    .add(minMaxPriceRules)
    .add(catNumerics)
    .add(catStrings)

  groupedRuleSet.validate()
}

## Date Validations
To be implemented.