In [25]:
import findspark
findspark.init()
from pyspark.sql import SparkSession, Row
import pydeequ, pyspark
from pyspark.sql.types import *
from pydeequ import Check,CheckLevel
from pydeequ.checks import *
from pydeequ.verification import *
from pydeequ.analyzers import *
from pyspark.sql.functions import lit,current_timestamp
from configparser import ConfigParser
import sys
import calendar,time

In [26]:
# Build (or reuse) the notebook-wide Spark session.
# It is configured with the PostgreSQL JDBC driver jar and the Deequ Maven
# package that PyDeequ needs (excluding the f2j artifact PyDeequ names).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # Raw string: the Windows path contains backslashes (\P, \p) that are
    # invalid escape sequences in a normal literal and trigger a
    # DeprecationWarning/SyntaxWarning on recent Python versions.
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    .config("spark.jars", r"C:\Program Files (x86)\PostgreSQL\pgJDBC\postgresql-42.2.18.jar")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [27]:
def getJdbcUrl():
    """Build the PostgreSQL JDBC connection URL.

    Connection settings come from environment variables so that the
    password is never stored in the notebook:

    * ``JDBC_USERNAME`` -- database user (default ``"vjadhav"``)
    * ``JDBC_PASSWORD`` -- database password; when unset or empty the
      user is prompted interactively via ``getpass``
    * ``JDBC_HOSTNAME`` -- server host (default: the Citus host below)
    * ``JDBC_PORT``     -- server port (default ``5432``)
    * ``JDBC_DATABASE`` -- database name (default ``"citus"``)

    The user name and password are percent-encoded so that characters
    such as ``&``, ``=``, ``@`` or ``/`` cannot corrupt the URL's query
    string.

    Returns:
        str: ``jdbc:postgresql://<host>:<port>/<db>?user=<u>&password=<p>``

    Raises:
        ValueError: if ``JDBC_PORT`` is set to a non-integer value.
    """
    # Local imports keep this cell self-contained.
    import os
    from getpass import getpass
    from urllib.parse import quote

    jdbcUsername = os.environ.get("JDBC_USERNAME", "vjadhav")

    # NOTE(review): a real password used to be hardcoded here; it should be
    # rotated, since it has been stored in the notebook in plain text.
    jdbcPassword = os.environ.get("JDBC_PASSWORD")
    if not jdbcPassword:
        jdbcPassword = getpass("PostgreSQL password (JDBC_PASSWORD is not set): ")

    jdbcHostname = os.environ.get(
        "JDBC_HOSTNAME", "c.fnfd5bptoazdphlg6wknuvzcwse.db.citusdata.com"
    )
    jdbcPort = int(os.environ.get("JDBC_PORT", "5432"))
    jdbcDatabase = os.environ.get("JDBC_DATABASE", "citus")

    # safe="" makes quote() encode every reserved character, including "/".
    encodedUser = quote(jdbcUsername, safe="")
    encodedPassword = quote(jdbcPassword, safe="")

    jdbcUrl = (
        f'jdbc:postgresql://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}'
        f'?user={encodedUser}&password={encodedPassword}'
    )
    return jdbcUrl

In [54]:
def stageFromJdbc(jdbcUrl=None,
                  query="select id, ordered_at, created_at, items_count, cancelled_at from orders limit 5000"):
    """Load the result of a SQL query from PostgreSQL into a Spark DataFrame.

    Args:
        jdbcUrl: JDBC connection URL. When ``None`` (the default) the URL
            is obtained from ``getJdbcUrl()`` at call time. The original
            ``jdbcUrl = getJdbcUrl()`` default was evaluated only once,
            when the function was defined, which froze the URL.
        query: SQL text passed to the JDBC reader's ``query`` option.
            Defaults to the original five-column sample of ``orders``
            limited to 5000 rows.

    Returns:
        The DataFrame produced by ``spark.read.format("jdbc")...load()``,
        read through the ``org.postgresql.Driver`` driver.
    """
    if jdbcUrl is None:
        jdbcUrl = getJdbcUrl()

    jdbcTable = (spark.read.format("jdbc")
                           .options(url=jdbcUrl, query=query, driver="org.postgresql.Driver")
                           .load())

    return jdbcTable

In [55]:
# Stage the orders sample with stageFromJdbc's default arguments and print
# the schema Spark derived for the loaded DataFrame.
orders = stageFromJdbc()
orders.printSchema()

root
 |-- id: integer (nullable = true)
 |-- ordered_at: timestamp (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- items_count: integer (nullable = true)
 |-- cancelled_at: timestamp (nullable = true)



In [56]:
# Preview the first 20 staged rows, with column truncation turned off.
orders.show(n=20, truncate=False)

+---------+-----------------------+--------------------------+-----------+------------+
|id       |ordered_at             |created_at                |items_count|cancelled_at|
+---------+-----------------------+--------------------------+-----------+------------+
|98637658 |2018-09-01 20:56:31    |2018-09-01 21:03:26.230637|1          |null        |
|252669562|2020-10-14 23:32:17.265|2020-10-15 00:57:43.115351|1          |null        |
|160334362|2019-09-11 20:57:02    |2019-09-11 20:59:09.093868|1          |null        |
|184036547|2020-01-14 00:54:31    |2020-01-22 02:23:11.04647 |1          |null        |
|266869465|2020-11-28 20:11:40.815|2020-11-28 20:23:21.807017|1          |null        |
|262444858|2020-11-16 21:11:24    |2020-11-16 21:22:27.736685|6          |null        |
|275319379|2020-12-16 02:32:52.824|2020-12-16 02:43:06.309432|1          |null        |
|98646879 |2018-09-01 23:08:41    |2018-09-01 23:19:00.643747|1          |null        |
|267831751|2020-11-30 23:58:26.8

In [58]:
# Register the staged orders as the temp view "tb" so they can be queried with Spark SQL.
orders.createOrReplaceTempView("tb")

# Show up to 10 rows whose ordered_at is later than created_at, untruncated.
(
    spark.sql("select * from tb where ordered_at > created_at")
    .show(n=10, truncate=False)
)

+---------+-------------------+--------------------------+-----------+------------+
|id       |ordered_at         |created_at                |items_count|cancelled_at|
+---------+-------------------+--------------------------+-----------+------------+
|262729346|2020-11-18 23:40:33|2020-11-17 18:08:06.650854|null       |null        |
|266873198|2020-11-29 20:31:18|2020-11-28 20:33:17.262946|null       |null        |
|266873242|2020-11-28 22:35:07|2020-11-28 20:33:24.204954|null       |null        |
+---------+-------------------+--------------------------+-----------+------------+



In [59]:
# Count the rows in "tb" where cancelled_at is later than ordered_at.
(
    spark.sql("select count(*) from tb where cancelled_at > ordered_at")
    .show()
)

+--------+
|count(1)|
+--------+
|     105|
+--------+



In [60]:
# Count the rows in "tb" whose items_count is greater than 1.
(
    spark.sql("select count(*) from tb where items_count > 1")
    .show()
)

+--------+
|count(1)|
+--------+
|     332|
+--------+



In [62]:
# Analyzers to run over the staged orders, in the order they are registered:
# row count, three Compliance predicates, and id / items_count column metrics.
orderAnalyzers = [
    Size(),
    Compliance("orderDate greater than createdDate", "ordered_at > created_at"),
    Compliance("items_count more than 1", "items_count > 1"),
    Compliance("cancelDate greater than createdDate", "cancelled_at > created_at"),
    Completeness("id"),
    Distinctness("id"),
    Minimum("items_count"),
    Maximum("items_count"),
    Completeness("items_count"),
    Histogram("items_count"),
    CountDistinct("items_count"),
]

# Register every analyzer on a single run builder, then execute the run.
analysisBuilder = AnalysisRunner(spark).onData(orders)
for orderAnalyzer in orderAnalyzers:
    analysisBuilder = analysisBuilder.addAnalyzer(orderAnalyzer)
analysisResult = analysisBuilder.run()

# Convert the successful metrics to a DataFrame and show up to 100 rows untruncated.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show(n=100, truncate=False)

+-------+-----------------------------------+-------------------------+------+
|entity |instance                           |name                     |value |
+-------+-----------------------------------+-------------------------+------+
|Column |items_count                        |Histogram.bins           |19.0  |
|Column |items_count                        |Histogram.abs.34         |1.0   |
|Column |items_count                        |Histogram.ratio.34       |2.0E-4|
|Column |items_count                        |Histogram.abs.12         |1.0   |
|Column |items_count                        |Histogram.ratio.12       |2.0E-4|
|Column |items_count                        |Histogram.abs.8          |6.0   |
|Column |items_count                        |Histogram.ratio.8        |0.0012|
|Column |items_count                        |Histogram.abs.19         |1.0   |
|Column |items_count                        |Histogram.ratio.19       |2.0E-4|
|Column |items_count                        |Histogr

In [28]:
# Print the schema of the `orders` DataFrame.
# NOTE(review): this cell's execution count (28) is lower than that of the cell
# assigning `orders` (55), and the recorded output lists columns such as
# merchant_id and billing_* that the five-column staging query does not select.
# This looks like stale output from an earlier, wider load; confirm with
# Restart Kernel -> Run All.
orders.printSchema()

root
 |-- id: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- channel_id: integer (nullable = true)
 |-- order_ref: string (nullable = true)
 |-- billing_address_1: string (nullable = true)
 |-- billing_address_2: string (nullable = true)
 |-- billing_address_3: string (nullable = true)
 |-- billing_postal_code: string (nullable = true)
 |-- billing_region: string (nullable = true)
 |-- billing_city: string (nullable = true)
 |-- billing_country: string (nullable = true)
 |-- billing_country_code: string (nullable = true)
 |-- billing_phone_number: string (nullable = true)
 |-- billing_contact: string (nullable = true)
 |-- billing_email: string (nullable = true)
 |-- billing_company_name: string (nullable = true)
 |-- shipping_address_1: string (nullable = true)
 |-- shipping_address_2: string (nullable = true)
 |-- shipping_address_3: string (nullable = true)
 |-- shipping_postal_code: string (nullable = true)
 |-- shipping_region: string (nullable = true)
