In [1]:
from configparser import ConfigParser

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp, col, when, collect_list, sha2, concat_ws, collect_set,udf,monotonically_increasing_id
from pydeequ.verification import *
import pydeequ
from pydeequ.checks import *
from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Size

In [2]:
# Create (or reuse) the SparkSession, requesting the Deequ Maven package and
# excluding the f2j coordinate exposed by pydeequ.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [3]:
# Read the merchant CSV, using the first line as the header and letting Spark infer column types.
dataframe = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .format('csv')
    .load(r'..\data\merchant.csv')
)

In [4]:
# Preview the loaded merchant rows; long cell values are truncated (see the "..." in the output).
dataframe.show()

+-----------+--------------------+------------+-----------+--------------------+--------------------+
|merchant_id|        company_name|  contact_no|      state|          created_at|          updated_at|
+-----------+--------------------+------------+-----------+--------------------+--------------------+
|     178307|            MSI Kart|  9319919988|trial_ended|2021-01-02 09:33:...|2021-01-02 09:39:...|
|     177863|  Bellelyse Boutique|000-000-0000|  suspended|2020-12-31 04:56:...|2021-02-19 16:53:...|
|     144524|     Rachid hamzaoui|  0639756718|       free|2020-09-21 17:18:...|2020-10-22 11:15:...|
|      50742|Thenaricalicollec...|  5049394269|  suspended|2019-03-22 03:57:...|2019-05-15 00:54:...|
|     176321|   CLICK MY CART LTD|  6479863690|  suspended|2020-12-23 06:20:...|2020-12-24 04:53:...|
|     140294|              myself|  7149259700|       free|2020-09-05 17:41:...|2020-10-06 11:15:...|
|     165163|  JeeJee’s Boutique |208-202-9314|trial_ended|2020-11-20 16:31:...|20

In [5]:
# Compute dataset-level metrics with PyDeequ; the Size() analyzer reports the row count.
# AnalysisRunner, Size and AnalyzerContext come from pydeequ.analyzers and must be
# imported in the import cell, otherwise this cell fails on a fresh kernel.
analysisResult = (
    AnalysisRunner(spark)
    .onData(dataframe)
    .addAnalyzer(Size())
    .run()
)

# Convert the successful metrics into a DataFrame and print them.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()

+-------+--------+----+-----+
| entity|instance|name|value|
+-------+--------+----+-----+
|Dataset|       *|Size| 10.0|
+-------+--------+----+-----+



In [6]:
# Declare an Error-level check named "Sellbrite checks".
check = Check(spark, CheckLevel.Error, "Sellbrite checks")

# Attach a completeness constraint on merchant_id to a verification suite over `dataframe`.
# NOTE(review): .run() is never called in this cell, so nothing is verified here — confirm that is intended.
Verifying_Checks = (VerificationSuite(spark)
    .onData(dataframe)
    .addCheck(check.isComplete('merchant_id')))

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# NOTE(review): repeats the isComplete('merchant_id') call already made on `check` in an earlier cell;
# if Check mutates in place this registers the constraint a second time — confirm, and drop the cell if unneeded.
check.isComplete('merchant_id')

In [None]:
class Pydeequ_Check_Verification:
    """Run the PyDeequ checks configured for a table and build a result report.

    Both methods rely on two notebook-level globals: `spark` (the active
    SparkSession) and `config_df` (the data-quality configuration DataFrame,
    with at least the columns Table_name, dataset_id, check,
    dq_verification_config_id and br_identification_required).
    """

    def check_fileformat_generate_dataset(self, sections):
        """Load the merchant CSV and verify it against the configured checks.

        Args:
            sections: Value matched against `config_df.Table_name` to select
                which configured checks to run (a single table name).

        Returns:
            The report DataFrame produced by `verify_checks_on_datasets`.
        """
        dataframe = spark.read.format('csv').load(r'..\data\merchant.csv',header=True,inferSchema=True)
        dataframe.show(truncate=False)
        return self.verify_checks_on_datasets(dataframe, sections)

    def verify_checks_on_datasets(self, tables, datasets):
        """Run every check configured for `datasets` against `tables`.

        Args:
            tables: The DataFrame to verify (passed to VerificationSuite.onData).
            datasets: Table name used to filter `config_df.Table_name`.

        Returns:
            The check-result DataFrame extended with dq_verification_config_id,
            create_utc_ts, dataset_id, br_identification_required and
            bad_record_required_flag.

        Raises:
            ValueError: If `config_df` has no rows for `datasets`.
        """
        # Fetch everything needed from the config in a single job so each check
        # stays paired with its own dq_verification_config_id.
        table_rows = (config_df
                      .filter(config_df.Table_name == datasets)
                      .select('dataset_id', 'check', 'dq_verification_config_id')
                      .collect())
        if not table_rows:
            raise ValueError(f"No rows in config_df for Table_name {datasets!r}")

        dataset_id = table_rows[0]['dataset_id']
        # Rows without a check expression contribute nothing to the chained call.
        configured = [(row['check'], row['dq_verification_config_id'])
                      for row in table_rows if row['check'] is not None]
        check_snippets = [snippet for snippet, _ in configured]
        config_ids = [config_id for _, config_id in configured]

        # The local name `check` must stay: the evaluated expression refers to it.
        check = Check(spark, CheckLevel.Error, "Sellbrite checks")
        check_expression = "check" + "".join(check_snippets)

        # SECURITY: eval() executes whatever text is stored in the config's
        # `check` column (e.g. '.isComplete("merchant_id")'). Only load config
        # files from a trusted source.
        configured_check = eval(check_expression)

        verification_run = (VerificationSuite(spark)
                            .onData(tables)
                            .addCheck(configured_check)
                            .run())

        check_reports = VerificationResult.checkResultsAsDataFrame(spark, verification_run)
        VerificationResult.successMetricsAsDataFrame(spark, verification_run).show()

        # With a single partition monotonically_increasing_id() yields 0, 1, 2, ...
        # which is used as an index into config_ids.
        # NOTE(review): ids are attached purely by row position. This assumes the
        # check results come back in the same order as the config rows and that
        # there are no more result rows than config rows (otherwise the lookup
        # raises IndexError) — confirm. udf() is called without a returnType, so
        # the new column uses PySpark's default (string) type.
        check_reports = check_reports.repartition(1).withColumn(
            "dq_verification_config_id",
            udf(lambda row_idx: config_ids[row_idx])(monotonically_increasing_id()))
        check_reports.show()

        # current_timestamp() is already a Column, so no lit() wrapper is needed.
        report = (check_reports
                  .withColumn('create_utc_ts', current_timestamp())
                  .withColumn('dataset_id', lit(dataset_id)))

        # Bring in br_identification_required for each config id.
        report = report.join(
            config_df,
            report.dq_verification_config_id == config_df.dq_verification_config_id,
        ).select(report['*'], config_df.br_identification_required)

        # Flag failed constraints whose config asks for bad-record identification.
        needs_bad_records = ((col('constraint_status') == 'Failure')
                             & (col('br_identification_required') == 'Yes'))
        report = report.withColumn(
            'bad_record_required_flag',
            when(needs_bad_records, lit('True')).otherwise(lit('False')))
        report.show(truncate=False)

        # Return the report itself (the original returned the VerificationResult
        # class object, discarding the DataFrame built above).
        return report

In [None]:
# Lookup from a constraint's string form to a config id
# (not referenced by any other cell visible in this notebook).
generate_config_id = {
    "CompletenessConstraint(Completeness(merchant_id,None))": 1,
    "UniquenessConstraint(Uniqueness(List(merchant_id),None))": 2,
    "ComplianceConstraint(Compliance(merchant_id is non-negative,COALESCE(merchant_id, 0.0) >= 0,None))": 3,
}

# Load and display the data-quality configuration used by Pydeequ_Check_Verification.
config_df = spark.read.json(r'..\config\dq_verification_config.json')
config_df.show(truncate=False)

# Table names to verify; only the first one is processed below.
sections = ["merchant_cln"]

Pydeequ_Check_Verification_object = Pydeequ_Check_Verification()
# To process every section instead:
#list(map(Pydeequ_Check_Verification_object.check_fileformat_generate_dataset, sections))
r = Pydeequ_Check_Verification_object.check_fileformat_generate_dataset(sections[0])

In [29]:
# Parse the properties file that holds the checks for each table section.
# ConfigParser is imported in the import cell (it was previously missing, so
# this cell raised NameError on a fresh kernel).
# NOTE: ConfigParser.read() silently skips files it cannot open, so an empty
# section list below may simply mean the path was not found.
Parser = ConfigParser()
configFile = r"../config/verification_config.properties"
Parser.read(configFile)
Parser.sections()

['merchant_cln']

In [35]:
# Fetch the raw 'check' option of the merchant_cln section (a '|'-separated list of check calls, per the output below).
Parser.get('merchant_cln','check')

'isComplete("merchant_id")|isUnique("merchant_id")|isNonNegative("merchant_id")'

In [36]:
# Read the CSV version of the verification config, using the header row and inferred column types.
verification_config = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .format('csv')
    .load(r'..\config\verification_config.csv')
)

In [38]:
# Display the CSV-based verification config without truncating cell values.
verification_config.show(truncate=False)

+-------------------------+--------------------+-------------+-------------+----------+------------+-------------+-----------------------------+-----------------------------+
|dq_verification_config_id|modified_column_name|database_name|create_utc_ts|dataset_id|Table_name  |update_utc_ts|check_name                   |check                        |
+-------------------------+--------------------+-------------+-------------+----------+------------+-------------+-----------------------------+-----------------------------+
|1                        |null                |default      |null         |1001      |merchant_cln|null         |Sellbrite_merchant_cln_checks|.isComplete('merchant_id')   |
|2                        |null                |default      |null         |1001      |merchant_cln|null         |Sellbrite_merchant_cln_checks|.isUnique('merchant_id')     |
|3                        |null                |default      |null         |1001      |merchant_cln|null         |Sellbrite_m

In [3]:
# Load the data-quality verification config from JSON.
# NOTE(review): this is the same read as the earlier `config_df = ...` cell; keep only one
# so the notebook runs cleanly from top to bottom.
config_df = spark.read.json(r'..\config\dq_verification_config.json')

In [4]:
# Preview the config rows (one row per table/check pair in the output below).
config_df.show()

+------------+--------------------------+--------------------+-------------+-------------+----------+-------------------------+-------------+
|  Table_name|br_identification_required|               check|create_utc_ts|database_name|dataset_id|dq_verification_config_id|update_utc_ts|
+------------+--------------------------+--------------------+-------------+-------------+----------+-------------------------+-------------+
|merchant_cln|                        No|.isUnique("mercha...|             |      default|      1001|                        1|             |
|merchant_cln|                       Yes|.isComplete("merc...|             |      default|      1001|                        2|             |
|    user_cln|                        No|     .isUnique("id")|             |      default|      1002|                        3|             |
|    user_cln|                       Yes|   .isComplete("id")|             |      default|      1002|                        4|             |
+-----

In [25]:
# Collect the distinct Table_name values into a single Python list.
config_df.select('Table_name').distinct().select(collect_list('Table_name')).first()[0]

['merchant_cln', 'user_cln']

In [24]:
# Same distinct names via collect_set; the element order differs from the
# distinct()+collect_list variant above, so do not rely on ordering.
config_df.select(collect_set('Table_name')).first()[0]

['user_cln', 'merchant_cln']

In [33]:
# Look up database_name from the first config row whose Table_name is merchant_cln.
config_df.filter(col('Table_name')=='merchant_cln').select('database_name').first()[0]

'default'