In [2]:
import json

from pyspark.sql import SparkSession
from pydeequ.verification import *
import pydeequ
from pydeequ.checks import *

In [3]:
# Build (or reuse) the Spark session. The Deequ artifact is requested through
# spark.jars.packages and the f2j artifact is excluded from dependency resolution.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [66]:
## user

In [85]:
# Load the user import template; header=True takes column names from the first row.
df = spark.read.csv(
    r'Templates/User - Import Template.csv',
    header=True,
)

In [86]:
# Print the loaded rows without truncating long cell values.
df.show(truncate=False)

+--------------------------------+----------+----------+
|Email                           |First Name|Last Name |
+--------------------------------+----------+----------+
|jane.doe@bcgasample.com         |Jane      |Doe       |
|alexis.washington@bcgasample.com|Alexis    |Washington|
|reginald.anderson@bcgasample.com|Reginald  |Anderson  |
|tatiana.smithson@bgcasample.com |Tatiana   |Smithson  |
|james.sample@bcgasample.com     |James     |Sample    |
+--------------------------------+----------+----------+



In [107]:
# Read the check definition from the JSON config file. The multiline option
# lets Spark parse a JSON document that spans several lines.
config = spark.read.option("multiline", "true").json('pydeequ_config.json')
# toJSON() yields each row as a JSON string; decode the first row with
# json.loads rather than eval so JSON-only literals (true/false/null) are
# handled and nothing from the config file is executed as Python here.
config_df = json.loads(config.toJSON().collect()[0])
config_df

{'check': ".isComplete('First Name').isComplete('Last Name').hasPattern('Email','^[a-z0-9]+[\\._]?[a-z0-9]+[@]\\w+[.]\\w{2,3}$',lambda x:x==1).hasMaxLength('First Name',lambda x:x<9).hasMaxLength('Last Name',lambda x:x<9)"}

In [108]:

# Warning-level check whose constraints come from the config loaded earlier.
check = Check(spark, CheckLevel.Warning, "Review Check")
# config_df['check'] holds a chain of method calls in source form
# (".isComplete(...).hasPattern(...)"); prefixing it with "check" produces an
# expression that applies that chain to the object created above.
checks = "check" + config_df['check']

# NOTE(review): eval() executes whatever text the config file contains, so
# pydeequ_config.json has to be treated as trusted code, not plain data.
checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(eval(checks))
    .run()
)

# Flatten the verification result into a DataFrame and print it untruncated.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+------------+-----------+------------+-------------------------------------------------------------------------+-----------------+-----------------------------------------------------+
|check       |check_level|check_status|constraint                                                               |constraint_status|constraint_message                                   |
+------------+-----------+------------+-------------------------------------------------------------------------+-----------------+-----------------------------------------------------+
+------------+-----------+------------+-------------------------------------------------------------------------+-----------------+-----------------------------------------------------+



In [1]:
## relationship

In [4]:
# Load the relationship import template; header=True takes column names from the first row.
df = spark.read.csv(
    r'Templates/Relationship - Import Template.csv',
    header=True,
)

In [5]:
# Print the loaded rows without truncating long cell values.
df.show(truncate=False)

+----------------------+-----------------+-------------------------------------+-------------------------+-------+--------+
|Relationship Legacy ID|Contact Legacy ID|Description                          |Related Contact Legacy ID|Status |Type    |
+----------------------+-----------------+-------------------------------------+-------------------------+-------+--------+
|DEF123                |122              |Ronald McHenry is Irene maddox's son |36812245                 |Current|Parent  |
|DEF124                |156              |Related contact is contact's son     |36812452                 |Current|Parent  |
|DEF125                |36774867         |Related contact is contact's son     |36812975                 |Current|Parent  |
|DEF126                |36786050         |Related contact is contact's son     |36814590                 |Current|Friend  |
|DEF127                |36806536         |Related contact is contact's coworker|36816516                 |Current|Coworker|
+-------

In [8]:

# Warning-level check for the relationship import template.
check = Check(spark, CheckLevel.Warning, "Review Check")

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check
        # Identifier columns: the relationship id must be unique, both contact ids present.
        .isUnique('Relationship Legacy ID')
        .isComplete('Contact Legacy ID')
        .isComplete('Related Contact Legacy ID')
        # Enumerated columns: values are restricted to the lists below.
        .isContainedIn('Status', ['Current', 'Former'])
        .isContainedIn('Type', [
            'Friend', 'Family', 'Coworker', 'Father', 'Mother', 'Parent',
            'Son', 'Daughter', 'Child', 'Aunt', 'Uncle', 'Husband', 'Wife',
            'Partner', 'Cousin', 'Grandmother', 'Grandfather', 'Grandparent',
            'Grandson', 'Granddaughter', 'Grandchild', 'Employer', 'Employee',
            'Case Worker', 'Teacher', 'Sibling', 'Guardian', 'Foster Parent',
            'Other',
        ])
    )
    .run()
)

# Flatten the verification result into a DataFrame and print it.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()

+------------+-----------+------------+--------------------+-----------------+------------------+
|       check|check_level|check_status|          constraint|constraint_status|constraint_message|
+------------+-----------+------------+--------------------+-----------------+------------------+
+------------+-----------+------------+--------------------+-----------------+------------------+



In [9]:
## membership

In [72]:
# Load the membership import template; header=True takes column names from the first row.
df = spark.read.csv(
    r'Templates/Membership - Import Template.csv',
    header=True,
)

In [44]:
# Print the loaded rows without truncating long cell values.
df.show(truncate=False)

+--------------------+-----------------+-----------------+---------------+-----------------+-------------------------------------------------------------+---------------------+---------------------+-------------------+-----------------------------+
|Legacy Id           |Account Legacy Id|Contact Legacy Id|Membership Type|Membership Status|Category Name                                                |Record Type Name     |Membership Start Date|Membership End Date|Membership Category Legacy ID|
+--------------------+-----------------+-----------------+---------------+-----------------+-------------------------------------------------------------+---------------------+---------------------+-------------------+-----------------------------+
|LegacyMembershipID1 |123181           |113              |Adult          |Draft            |School Year Memberships - James T. Anderson Boys & Girls Club|Punch Pass Membership|8/2/2021             |6/1/2022           |MembershipCategoryID1        |
|Leg

In [73]:

# Warning-level check for the membership import template.
check = Check(spark, CheckLevel.Warning, "Review Check")

# Loose date-shape pattern: digits, '/', digits, '/', digits somewhere in the
# value (e.g. 8/2/2021). Written as a raw string so the backslashes reach the
# regex engine unchanged instead of being invalid Python escape sequences.
date_pattern = r"\d.*\/.*\d.*\/.*\d"

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check
        # Column names are spelled exactly as in the CSV header ('Legacy Id'),
        # so the check does not rely on case-insensitive column resolution.
        .isUnique('Legacy Id')
        .isComplete('Legacy Id')
        .isComplete('Account Legacy Id')
        .isUnique('Account Legacy Id')
        .isComplete('Contact Legacy Id')
        .isUnique('Contact Legacy Id')
        .hasMaxLength('Category Name', lambda x: x <= 255)
        .isComplete('Record Type Name')
        .isContainedIn('Record Type Name', ['Punch Pass Membership', 'Standard Membership'])
        .isContainedIn('Membership Status', [
            'On Hold', 'Active', 'Pending Active', 'Draft', 'Pending',
            'Cancel', 'Pending Hold', 'Pending Transfer', 'Complete',
        ])
        .isComplete('Membership Start Date')
        # The assertion receives the pattern-match ratio (see the pydeequ Check
        # docs) and requires it to equal 1.
        .hasPattern("Membership Start Date", date_pattern, lambda x: x == 1)
        .hasPattern("Membership End Date", date_pattern, lambda x: x == 1)
        .isComplete('Membership End Date')
    )
    .run()
)

# Flatten the verification result into a DataFrame and print it untruncated.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+------------+-----------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------------------------+
|check       |check_level|check_status|constraint                                                                                                                                                                                                                                                                                                                         |constraint_status|constraint_message                                  |
+------------+-----------+------------+-------------------------------------------------------------------------------------------

In [47]:
## facility visit

In [68]:
# Load the facility visit import template; header=True takes column names from the first row.
df = spark.read.csv(
    r'Templates/Facility Visit - Import Template.csv',
    header=True,
)

In [49]:
# Print the loaded rows; with the default settings long cell values are
# shortened with "..." in the rendered table.
df.show()

+------------------+---------------------------------+--------------------------------+-----------------+----------------------------+--------------+--------------+-----------------------+---------------+--------------------+-----------------+-----------------+----------------------+
|         Legacy Id|Entry Scanning Location Legacy Id|Exit Scanning Location Legacy Id|Contact Legacy Id|Entry Scanning Location Name|  Scan In Time| Scan Out Time|Legacy Course Option Id|Is Program Scan|Optional Staff Stamp|Requires Scan Out|Grade as of Visit|School as of Scan Date|
+------------------+---------------------------------+--------------------------------+-----------------+----------------------------+--------------+--------------+-----------------------+---------------+--------------------+-----------------+-----------------+----------------------+
| FacVisitLegacyId1|             ScanLocationLegac...|            ExitLocationLegac...|              113|           Sample Power Hour|5/19/2021 3

In [69]:

# Warning-level check for the facility visit import template.
check = Check(spark, CheckLevel.Warning, "Review Check")

# Loose date-shape pattern: digits, '/', digits, '/', digits somewhere in the
# value (e.g. 5/19/2021 ...). Written as a raw string so the backslashes reach
# the regex engine unchanged instead of being invalid Python escape sequences.
date_pattern = r"\d.*\/.*\d.*\/.*\d"

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check
        # Column names are spelled exactly as in the CSV header ('Legacy Id'),
        # so the check does not rely on case-insensitive column resolution.
        .isUnique('Legacy Id')
        .isComplete('Legacy Id')
        .isComplete('Entry Scanning Location Legacy Id')
        .isComplete('Exit Scanning Location Legacy Id')
        .isComplete('Contact Legacy Id')
        .hasMaxLength('Entry Scanning Location Name', lambda x: x <= 100)
        .isComplete('Scan In Time')
        # The assertion receives the pattern-match ratio (see the pydeequ Check
        # docs) and requires it to equal 1.
        .hasPattern("Scan In Time", date_pattern, lambda x: x == 1)
        .hasPattern("Scan Out Time", date_pattern, lambda x: x == 1)
        .hasMaxLength('Legacy Course Option Id', lambda x: x <= 255)
        .hasMaxLength('Optional Staff Stamp', lambda x: x <= 255)
        .hasMaxLength('Grade as of Visit', lambda x: x <= 255)
        .hasMaxLength('School as of Scan Date', lambda x: x <= 255)
        # Boolean-like columns are validated as the literal strings TRUE/FALSE.
        .isContainedIn('Requires Scan Out', ['TRUE', 'FALSE'])
        .isContainedIn('Is Program Scan', ['TRUE', 'FALSE'])
    )
    .run()
)

# Flatten the verification result into a DataFrame and print it untruncated.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check       |check_level|check_status|constraint                                                                                                                                             |constraint_status|constraint_message|
+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+



In [60]:
## address

In [61]:
# Load the address import template (column names come from the first row)
# and preview it.
df = spark.read.csv(
    r'Templates/Address - Import Template.csv',
    header=True,
)
df.show()

+-----------------+-----------------+------------+-----------+------------+---------------+-----------------------+----------------------+--------------------+---------------+
|Address Legacy ID|Account Legacy ID|Address Type|County Name|Mailing City|Mailing Country|Mailing Zip/Postal Code|Mailing State/Province|      Mailing Street|Mailing Street2|
+-----------------+-----------------+------------+-----------+------------+---------------+-----------------------+----------------------+--------------------+---------------+
|           ABC123|           123181|        Work|   McKinney|      Austin|  United States|                  78759|                 Texas|    1620 Windham Way|         Unit 5|
|           ABC124|           123185|        Work|     Dallas|      Austin|  United States|                  78756|                 Texas|     9570 W 35th St.|           null|
|           ABC125|           123188|        Work|     Dallas|       Elgin|  United States|                  78753|     

In [67]:

# Warning-level check for the address import template.
check = Check(spark, CheckLevel.Warning, "Review Check")

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check
        # Identifier columns: address id present and unique, account id present.
        .isComplete('Address Legacy ID')
        .isUnique('Address Legacy ID')
        .isComplete('Account Legacy ID')
        .hasMaxLength('County Name', lambda x: x <= 50)
        # Address type is restricted to the values listed here.
        .isContainedIn('Address Type', ['Work', 'Other', 'Vacation', 'Home'])
    )
    .run()
)

# Flatten the verification result into a DataFrame and print it untruncated.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+------------+-----------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check       |check_level|check_status|constraint                                                                                                                                                              |constraint_status|constraint_message|
+------------+-----------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
+------------+-----------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+

