### Notebook demonstrating de-duplication (dedup) of records using a Spark window function

In [1]:
from datetime import datetime, timezone

from pyspark.sql import Row, Window
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import lit, row_number, asc, desc

In [2]:
# Capture the notebook's start instant as a timezone-aware UTC datetime and report it.
startTime = datetime.now(tz=timezone.utc)
print('Starting at:', f'{startTime:%a, %Y-%b-%d %H:%M:%S}')

Starting at: Thu, 2020-Jul-02 02:32:36


In [3]:
# Schema of Deals, written as a 'name:type' list (one field per line);
# passed as the `schema` argument when the deal DataFrames are created.
deal_schema = (
    'DEAL_ID:int, '
    'DEAL_NAME:string, '
    'DEAL_TIMESTAMP:string, '
    'ACTIVE:boolean'
)

In [4]:
# Previously loaded deals. Every prior deal is active; the timestamp is stored
# as a 'YYYY-MM-DD HH:MM:SS' string to match the string-typed DEAL_TIMESTAMP column.
prior_data = [
    (deal_id, deal_name, deal_ts.strftime('%Y-%m-%d %H:%M:%S'), True)
    for deal_id, deal_name, deal_ts in (
        (100, 'Deal_1', datetime(2019, 9, 13, 10, 11, 13)),
        (200, 'Deal_22', datetime(2020, 2, 29, 0, 0, 1)),
        (300, 'Deal 333', datetime(2003, 3, 3, 3, 33, 33)),
        (400, 'My deal 40404', datetime(2018, 7, 15, 7, 15, 0)),
    )
]

prior_data_df = spark.createDataFrame(prior_data, schema=deal_schema)
prior_data_df.printSchema()
print('Prior data set:')
prior_data_df.show()

root
 |-- DEAL_ID: integer (nullable = true)
 |-- DEAL_NAME: string (nullable = true)
 |-- DEAL_TIMESTAMP: string (nullable = true)
 |-- ACTIVE: boolean (nullable = true)

Prior data set:
+-------+-------------+-------------------+------+
|DEAL_ID|    DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|
+-------+-------------+-------------------+------+
|    100|       Deal_1|2019-09-13 10:11:13|  true|
|    200|      Deal_22|2020-02-29 00:00:01|  true|
|    300|     Deal 333|2003-03-03 03:33:33|  true|
|    400|My deal 40404|2018-07-15 07:15:00|  true|
+-------+-------------+-------------------+------+



In [5]:
# Tag every prior row as history by appending a constant column: age=1
existing_df = prior_data_df.select('*', lit(1).alias('age'))
existing_df.printSchema()
existing_df.show()

root
 |-- DEAL_ID: integer (nullable = true)
 |-- DEAL_NAME: string (nullable = true)
 |-- DEAL_TIMESTAMP: string (nullable = true)
 |-- ACTIVE: boolean (nullable = true)
 |-- age: integer (nullable = false)

+-------+-------------+-------------------+------+---+
|DEAL_ID|    DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|age|
+-------+-------------+-------------------+------+---+
|    100|       Deal_1|2019-09-13 10:11:13|  true|  1|
|    200|      Deal_22|2020-02-29 00:00:01|  true|  1|
|    300|     Deal 333|2003-03-03 03:33:33|  true|  1|
|    400|My deal 40404|2018-07-15 07:15:00|  true|  1|
+-------+-------------+-------------------+------+---+



In [6]:
# Incoming batch of deals; timestamps are rendered as 'YYYY-MM-DD HH:MM:SS' strings.
new_data = [
    (deal_id, deal_name, deal_ts.strftime('%Y-%m-%d %H:%M:%S'), is_active)
    for deal_id, deal_name, deal_ts, is_active in (
        # UPDATE: DEAL_ID 200 already exists, with a different name and timestamp
        (200, 'Deal AlphaBetaGamma', datetime(2020, 2, 29, 23, 59, 59), True),
        # DEACTIVATE: DEAL_ID 300 already exists; only ACTIVE flips to False
        (300, 'Deal 333', datetime(2003, 3, 3, 3, 33, 33), False),
        # NEW: DEAL_ID 555 is not in the prior data set
        (555, 'Cinqo the Dealo', datetime(2020, 5, 5, 0, 55, 55), True),
    )
]

new_data_df = spark.createDataFrame(new_data, schema=deal_schema)
new_data_df.printSchema()
print('New data set:')
new_data_df.show()


root
 |-- DEAL_ID: integer (nullable = true)
 |-- DEAL_NAME: string (nullable = true)
 |-- DEAL_TIMESTAMP: string (nullable = true)
 |-- ACTIVE: boolean (nullable = true)

New data set:
+-------+-------------------+-------------------+------+
|DEAL_ID|          DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|
+-------+-------------------+-------------------+------+
|    200|Deal AlphaBetaGamma|2020-02-29 23:59:59|  true|
|    300|           Deal 333|2003-03-03 03:33:33| false|
|    555|    Cinqo the Dealo|2020-05-05 00:55:55|  true|
+-------+-------------------+-------------------+------+



In [7]:
# Tag every incoming row as current by appending a constant column: age=0
new_df = new_data_df.select('*', lit(0).alias('age'))
new_df.printSchema()
new_df.show()

root
 |-- DEAL_ID: integer (nullable = true)
 |-- DEAL_NAME: string (nullable = true)
 |-- DEAL_TIMESTAMP: string (nullable = true)
 |-- ACTIVE: boolean (nullable = true)
 |-- age: integer (nullable = false)

+-------+-------------------+-------------------+------+---+
|DEAL_ID|          DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|age|
+-------+-------------------+-------------------+------+---+
|    200|Deal AlphaBetaGamma|2020-02-29 23:59:59|  true|  0|
|    300|           Deal 333|2003-03-03 03:33:33| false|  0|
|    555|    Cinqo the Dealo|2020-05-05 00:55:55|  true|  0|
+-------+-------------------+-------------------+------+---+



In [8]:
# Get ready for merge: union() matches columns by position, so both frames
# must share exactly the same schema.
assert existing_df.schema == new_df.schema

all_deals_df = existing_df.union(new_df)
print('Unified (merged) data set:')
# ORDER BY DEAL_ID ASC, AGE DESC
all_deals_df.orderBy(['DEAL_ID', 'age'], ascending=[True, False]).show()

Unified (merged) data set:
+-------+-------------------+-------------------+------+---+
|DEAL_ID|          DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|age|
+-------+-------------------+-------------------+------+---+
|    100|             Deal_1|2019-09-13 10:11:13|  true|  1|
|    200|            Deal_22|2020-02-29 00:00:01|  true|  1|
|    200|Deal AlphaBetaGamma|2020-02-29 23:59:59|  true|  0|
|    300|           Deal 333|2003-03-03 03:33:33|  true|  1|
|    300|           Deal 333|2003-03-03 03:33:33| false|  0|
|    400|      My deal 40404|2018-07-15 07:15:00|  true|  1|
|    555|    Cinqo the Dealo|2020-05-05 00:55:55|  true|  0|
+-------+-------------------+-------------------+------+---+



In [9]:
# Define the windowing operation: one partition per DEAL_ID, ordered so that the
# row to keep is ranked first.
#   1. age ascending            -> current (age=0) appears before previous (age=1)
#   2. DEAL_TIMESTAMP descending -> tie-breaker: if several rows of the same age share
#      a DEAL_ID (e.g. duplicates inside one batch), the latest one is ranked first.
#      The timestamps are 'YYYY-MM-DD HH:MM:SS' strings, which sort chronologically.
# Without the tie-breaker, row_number() would pick an arbitrary row among such ties.
window = Window.partitionBy('DEAL_ID').orderBy(asc('age'), desc('DEAL_TIMESTAMP'))

In [10]:
# Number the rows inside each window partition; row 1 is the one the window ordering ranks first.
row_rank = row_number().over(window)
dedup_deals_df = all_deals_df.withColumn("rownum", row_rank)
print('Before processing for dedup:')
dedup_deals_df.orderBy('DEAL_ID').show()
print()
print('After processing for dedup:')
# Keep only the first-ranked row per partition, then drop the two helper columns.
dedup_success_df = (
    dedup_deals_df
    .where('rownum == 1')
    .drop('rownum', 'age')
)
dedup_success_df.orderBy('DEAL_ID').show()

Before processing for dedup:
+-------+-------------------+-------------------+------+---+------+
|DEAL_ID|          DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|age|rownum|
+-------+-------------------+-------------------+------+---+------+
|    100|             Deal_1|2019-09-13 10:11:13|  true|  1|     1|
|    200|Deal AlphaBetaGamma|2020-02-29 23:59:59|  true|  0|     1|
|    200|            Deal_22|2020-02-29 00:00:01|  true|  1|     2|
|    300|           Deal 333|2003-03-03 03:33:33| false|  0|     1|
|    300|           Deal 333|2003-03-03 03:33:33|  true|  1|     2|
|    400|      My deal 40404|2018-07-15 07:15:00|  true|  1|     1|
|    555|    Cinqo the Dealo|2020-05-05 00:55:55|  true|  0|     1|
+-------+-------------------+-------------------+------+---+------+


After processing for dedup:
+-------+-------------------+-------------------+------+
|DEAL_ID|          DEAL_NAME|     DEAL_TIMESTAMP|ACTIVE|
+-------+-------------------+-------------------+------+
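|    100|             Deal_1|2019-09-13 10:11:13|  true|
|    200|Deal AlphaBetaGamma|2020-02-29 23:59:59|  true|
|    300|           Deal 333|2003-03-03 03:33:33| false|
|    400|      My deal 40404|2018-07-15 07:15:00|  true|
|    555|    Cinqo the Dealo|2020-05-05 00:55:55|  true|
+-------+-------------------+-------------------+------+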

In [11]:
# test that dedup left exactly one row per DEAL_ID
deduped_deal_ids = [row.DEAL_ID for row in dedup_success_df.select('DEAL_ID').collect()]
assert len(deduped_deal_ids) == len(set(deduped_deal_ids)), 'dedup left duplicate DEAL_IDs'

# test for dedup for DEAL_NAME: DEAL_ID 200 was updated, so the new name must win
test_row_deal_name_before = prior_data_df.where('DEAL_ID == 200').select('DEAL_NAME').head()
print('test_deal_name_before:', test_row_deal_name_before)
test_row_deal_name_after = dedup_success_df.where('DEAL_ID == 200').select('DEAL_NAME').head()
print('test_deal_name_after:', test_row_deal_name_after)
# head() returns None when no row matches; fail with a clear message instead of an AttributeError
assert test_row_deal_name_before is not None, 'DEAL_ID 200 missing from prior data'
assert test_row_deal_name_after is not None, 'DEAL_ID 200 missing after dedup'
assert (test_row_deal_name_before.DEAL_NAME != test_row_deal_name_after.DEAL_NAME)
assert test_row_deal_name_after.DEAL_NAME == 'Deal AlphaBetaGamma', 'updated DEAL_NAME did not win'

# test for dedup for ACTIVE: DEAL_ID 300 was deactivated, so ACTIVE must flip from True to False
test_list_active_flag_before = prior_data_df.where('DEAL_ID == 300').select('ACTIVE').collect()
print('test_active_flag_before:', test_list_active_flag_before)
test_list_active_flag_after = dedup_success_df.where('DEAL_ID == 300').select('ACTIVE').collect()
print('test_active_flag_after:', test_list_active_flag_after)
# exactly one row is expected on each side; this also guards the [0] indexing below
assert len(test_list_active_flag_before) == 1, 'expected one prior row for DEAL_ID 300'
assert len(test_list_active_flag_after) == 1, 'expected one deduplicated row for DEAL_ID 300'
assert (test_list_active_flag_before[0] != test_list_active_flag_after[0])
assert test_list_active_flag_before[0].ACTIVE is True
assert test_list_active_flag_after[0].ACTIVE is False, 'deactivation did not win'

test_deal_name_before: Row(DEAL_NAME='Deal_22')
test_deal_name_after: Row(DEAL_NAME='Deal AlphaBetaGamma')
test_active_flag_before: [Row(ACTIVE=True)]
test_active_flag_after: [Row(ACTIVE=False)]


In [12]:
# Capture the notebook's completion instant as a timezone-aware UTC datetime and report it.
stopTime = datetime.now(tz=timezone.utc)
print('Completed at:', f'{stopTime:%a, %Y-%b-%d %H:%M:%S}')

Completed at: Thu, 2020-Jul-02 02:32:42
