In [1]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
# Dataset handles first ...
CALCULATED_DRAW_DOWNS = dataiku.Dataset("CALCULATED_DRAW_DOWNS")
CALCULATED_DRAW_UPS = dataiku.Dataset("CALCULATED_DRAW_UPS")
COMMON_WORDS = dataiku.Dataset("NAFCUSTOMER_COMMON_WORDS_IN_NAMES")

# ... then each one loaded into a pandas DataFrame.
CALCULATED_DRAW_DOWNS_df = CALCULATED_DRAW_DOWNS.get_dataframe()
CALCULATED_DRAW_UPS_df = CALCULATED_DRAW_UPS.get_dataframe()
COMMON_WORDS_df = COMMON_WORDS.get_dataframe()

In [2]:
def date_tz_naive(pd_s):
    """Parse ``pd_s`` as datetimes and drop the timezone from every element.

    The values are parsed with ``pd.to_datetime`` and then each element has
    ``tz_localize(None)`` applied, so the wall-clock time is kept and only the
    timezone information is removed.

    Parameters
    ----------
    pd_s : pandas.Series
        Series of datetime-like values.

    Returns
    -------
    pandas.Series
        Series of timezone-naive timestamps.
    """
    def _strip_timezone(stamp):
        return stamp.tz_localize(None)

    parsed = pd.to_datetime(pd_s)
    return parsed.apply(_strip_timezone)

In [4]:
# Only customers whose ACTIVE_CARD_MAX is strictly greater than the threshold
# are considered (the comparison below is ">", not ">=").
# Set this too low and the running time will balloon.
# Recommended: 10 or higher.
card_cut_off_threshold = 5

down_mask = CALCULATED_DRAW_DOWNS_df['ACTIVE_CARD_MAX'] > card_cut_off_threshold
up_mask = CALCULATED_DRAW_UPS_df['ACTIVE_CARD_MAX'] > card_cut_off_threshold

# .copy() so the date columns can be replaced without touching the input frames
df_down_full = CALCULATED_DRAW_DOWNS_df[down_mask].copy()
df_up_full = CALCULATED_DRAW_UPS_df[up_mask].copy()

# strip timezones so draw-down and draw-up dates can be subtracted from each other
df_down_full['DRAW_DOWN_DATE'] = date_tz_naive(df_down_full['DRAW_DOWN_DATE'])
df_up_full['DRAW_UP_DATE'] = date_tz_naive(df_up_full['DRAW_UP_DATE'])

# draw-up rows without a date are dropped; draw-down rows are kept as they are
df_up_full = df_up_full.dropna(subset=['DRAW_UP_DATE'])

df_down_full = df_down_full.sort_values(['CUSTOMER'])
df_up_full = df_up_full.sort_values(['CUSTOMER'])

# df_common is an alias of COMMON_WORDS_df (no copy), so this in-place sort
# reorders COMMON_WORDS_df as well.
df_common = COMMON_WORDS_df
df_common.sort_values(['WORD'], inplace=True)

print(len(df_down_full), "draw downs full")
print(len(df_up_full), "draw ups full")
print(len(df_common), "common words")

print("Card Cut Off Threshold", card_cut_off_threshold)

34027 draw downs full
72496 draw ups full
2173 common words
Card Cut Off Threshold 5


In [5]:
import string

# Distinct values of the WORD column, used by Draw_Down_Customer to drop words
# from customer names before comparing them.
# Stored as a frozenset: the matcher runs `w not in _common_words` for every
# word of every candidate name, and a set answers that with a hash lookup
# instead of scanning the whole array returned by .unique().
_common_words = frozenset(df_common.WORD.unique())
print(len(_common_words), "screening against common words")

class Draw_Down_Customer:
    """A draw-down customer together with the draw-up customers matched to it.

    Names are compared word by word after punctuation is removed. Words found
    in the module-level ``_common_words`` collection are ignored.

    Attributes
    ----------
    CUSTOMER : str
        Name passed to the constructor.
    DRAW_DOWN_DATE
        Draw-down date passed to the constructor.
    ACTIVE_CARD_MAX
        Card count passed to the constructor.
    WORD_LIST : list of str
        Words of ``CUSTOMER`` (punctuation removed) that are not common words.
    MATCHING_CUSTOMERS, PERCENT_DIFFERENCE, DAYS_DIFFERENCE, DRAW_UP_DATE : list
        Parallel lists; entry ``i`` of each describes the i-th accepted match.
        Despite its name, ``DAYS_DIFFERENCE`` holds the gap in 30-day months.
    """

    # Deletes every character in string.punctuation. Built once for the class
    # instead of on every constructor / match call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # Acceptance limits applied by Match_Draw_Up_Customer.
    MAX_MONTHS_APART = 4          # rounded |draw_up - draw_down| in 30-day months
    MAX_PERCENT_DIFFERENCE = 0.5  # |a - b| / mean(a, b) of the two card counts

    def __init__(self, name, draw_down_date, active_card_max):
        self.CUSTOMER = name
        self.DRAW_DOWN_DATE = draw_down_date
        self.ACTIVE_CARD_MAX = active_card_max

        self.MATCHING_CUSTOMERS = []
        self.PERCENT_DIFFERENCE = []
        self.DAYS_DIFFERENCE = []
        self.DRAW_UP_DATE = []

        # Only common words are removed here; the stricter filters (length,
        # numeric) are applied to the candidate side in Match_Draw_Up_Customer.
        self.WORD_LIST = [w for w in self._split_name(name) if w not in _common_words]

    @classmethod
    def _split_name(cls, name):
        """Remove punctuation from ``name`` and split it on whitespace."""
        return name.translate(cls._PUNCTUATION_TABLE).split()

    def Match_Draw_Up_Customer(self, name, draw_up_date, active_card_max):
        """Record ``name`` as a match if it plausibly is the same customer.

        A candidate is appended to the match lists (at most once per name) when
        all of the following hold:

        * ``name`` differs from ``self.CUSTOMER`` (exact matches are skipped),
        * the gap between ``draw_up_date`` and the draw-down date, expressed in
          30-day months and rounded, is at most ``MAX_MONTHS_APART``,
        * the difference of the two card counts relative to their mean, rounded
          to two decimals, is at most ``MAX_PERCENT_DIFFERENCE``,
        * at least one word of ``name`` that is not a common word, is longer
          than one character and is not numeric also appears in ``WORD_LIST``.

        Parameters
        ----------
        name : str
            Draw-up customer name.
        draw_up_date
            Draw-up date; subtracting ``self.DRAW_DOWN_DATE`` from it must
            yield an object with a ``.days`` attribute.
        active_card_max
            Card count of the draw-up customer. The two card counts must not
            sum to zero, because the sum is used as a divisor.

        Returns
        -------
        None
        """
        if self.CUSTOMER == name:
            # exact match, already captured
            return

        percent_diff = round(
            abs(self.ACTIVE_CARD_MAX - active_card_max)
            / ((self.ACTIVE_CARD_MAX + active_card_max) / 2),
            2,
        )
        delta_between_drop_and_rise = round(
            abs((draw_up_date - self.DRAW_DOWN_DATE).days) / 30., 0)

        # Cheap numeric checks first, so rejected candidates never reach the
        # string work. Written as "not (a <= x and b <= y)" on purpose: a NaN
        # on either side must reject the candidate, which "a > x" would not do.
        within_limits = (delta_between_drop_and_rise <= self.MAX_MONTHS_APART
                         and percent_diff <= self.MAX_PERCENT_DIFFERENCE)
        if not within_limits or name in self.MATCHING_CUSTOMERS:
            return

        for w in self._split_name(name):
            if len(w) > 1 and not w.isnumeric() and w not in _common_words \
                    and w in self.WORD_LIST:
                self.MATCHING_CUSTOMERS.append(name)
                self.PERCENT_DIFFERENCE.append(percent_diff)
                self.DAYS_DIFFERENCE.append(delta_between_drop_and_rise)
                self.DRAW_UP_DATE.append(draw_up_date)
                # one shared word is enough; a name is recorded only once
                return

2173 screening against common words


In [6]:
import time

def do_save_log(_matching_process_log_time, _matching_process_log_event):
    """Write the process log to the ``MATCHING_PROCESS_LOG_V`` dataset.

    Parameters
    ----------
    _matching_process_log_time : list
        Log timestamps; becomes the ``LOG_TIME`` column.
    _matching_process_log_event : list
        Log messages, one per timestamp; becomes the ``LOG_EVENT`` column.

    Nothing is written (and nothing is printed) when there are no log entries.
    """
    log_frame = pd.DataFrame(_matching_process_log_time)
    if len(log_frame) == 0:
        return

    log_frame.columns = ['LOG_TIME']
    log_frame = log_frame.assign(LOG_EVENT=_matching_process_log_event)

    dataiku.Dataset("MATCHING_PROCESS_LOG_V").write_with_schema(log_frame)

    print()

def do_save_direct_matches(_direct_customer, _direct_match, _direct_draw_up_date):
    """Write the 1-to-1 matches to the ``MATCHES_1_TO_1_STAGING_V`` dataset.

    Parameters
    ----------
    _direct_customer : list
        Draw-down customer names; becomes the ``CUSTOMER`` column.
    _direct_match : list
        Matched customer name per customer; becomes ``MATCH_CUSTOMER``.
    _direct_draw_up_date : list
        Draw-up date of each match; becomes ``DRAW_UP_DATE``.

    Nothing is written (and nothing is printed) when ``_direct_customer`` is
    empty.
    """
    matches_frame = pd.DataFrame(_direct_customer)
    if len(matches_frame) == 0:
        return

    print()
    print("saving", len(matches_frame), "1-1 matching records")
    print()

    matches_frame.columns = ['CUSTOMER']
    matches_frame = matches_frame.assign(
        MATCH_CUSTOMER=_direct_match,
        DRAW_UP_DATE=_direct_draw_up_date,
    )

    dataiku.Dataset("MATCHES_1_TO_1_STAGING_V").write_with_schema(matches_frame)

    print()

def do_save_multiple_matches(_multiple_customer, _multiple_matches, _multiple_drop_dates):
    """Write the 1-to-n matches to the ``MATCHES_1_TO_N_STAGING_V`` dataset.

    Parameters
    ----------
    _multiple_customer : list
        Draw-down customer names; becomes the ``CUSTOMER`` column.
    _multiple_matches : list
        One entry per customer holding its matches; becomes ``MATCH_CUSTOMER``.
    _multiple_drop_dates : list
        One entry per customer holding the matches' dates; stored in the
        ``DRAW_UP_DATE`` column.

    Nothing is written (and nothing is printed) when ``_multiple_customer`` is
    empty.
    """
    multi_frame = pd.DataFrame(_multiple_customer)
    if len(multi_frame) == 0:
        return

    print()
    print("saving", len(multi_frame), "1-n matching records")
    print()

    multi_frame.columns = ['CUSTOMER']
    multi_frame = multi_frame.assign(
        MATCH_CUSTOMER=_multiple_matches,
        DRAW_UP_DATE=_multiple_drop_dates,
    )

    dataiku.Dataset("MATCHES_1_TO_N_STAGING_V").write_with_schema(multi_frame)

    print()

In [0]:
from datetime import timedelta

# Working frames for this run (aliases of the filtered frames, not copies).
df_down, df_up = df_down_full, df_up_full

# Run options.
verbose = False        # True prints every direct / multiple match as it is found
save_every_n = 50      # write the result datasets after this many new match records
print_every_n = 25     # print a progress line every N draw-down customers

# Names already assigned to a result bucket (or consumed as a direct match).
_processed_customers = []

# Process log: parallel lists of timestamps and messages.
_matching_process_log_time, _matching_process_log_event = [], []

# 1-to-1 matches: parallel lists.
_direct_customer, _direct_match, _direct_draw_up_date = [], [], []

# 1-to-n matches: parallel lists (the last two hold one list per customer).
_multiple_customer, _multiple_matches, _multiple_drop_dates = [], [], []

# Draw-down customers for which no candidate was accepted.
_no_match_customer = []

# Match records collected since the last save.
to_save_counter = 0

print(len(df_down), "filtered down rows")
print(len(df_up), "filtered up rows")

_customers = []

# Start of the timing used for the progress estimate.
t0 = time.time()

# Build one Draw_Down_Customer per draw-down row.
# The three needed columns are zipped instead of using iterrows(), which
# constructs a full Series for every row only to read three values from it.
_customers = [
    Draw_Down_Customer(customer, draw_down_date, active_card_max)
    for customer, draw_down_date, active_card_max in zip(
        df_down['CUSTOMER'], df_down['DRAW_DOWN_DATE'], df_down['ACTIVE_CARD_MAX'])
]

idx = 0

# Log how many draw-down customers this run is going to process.
_matching_process_log_time.append(str(pd.Timestamp.now()))
_matching_process_log_event.append(" processing range " + str(len(_customers)) + " Draw Down Customers")
do_save_log(_matching_process_log_time, _matching_process_log_event)

for idx, c in enumerate(_customers, start=1):

    # Pre-filter the draw-up rows to those within +/-120 days of the draw-down
    # date and within +/-50% of the draw-down card count (bounds inclusive),
    # so the per-row name matching only sees plausible candidates.
    draw_down_ts = pd.to_datetime(c.DRAW_DOWN_DATE)
    date_start = draw_down_ts + timedelta(days=-120)
    date_end = draw_down_ts + timedelta(days=120)

    card_delta = c.ACTIVE_CARD_MAX * 0.5
    card_start = c.ACTIVE_CARD_MAX - card_delta
    card_end = c.ACTIVE_CARD_MAX + card_delta

    df_up = df_up_full[
        df_up_full.ACTIVE_CARD_MAX.between(card_start, card_end)
        & df_up_full.DRAW_UP_DATE.between(date_start, date_end)
    ]

    # Hot loop: zip the three needed columns rather than iterrows(), which
    # builds a Series per candidate row.
    for customer, draw_up_date, active_card_max in zip(
            df_up['CUSTOMER'], df_up['DRAW_UP_DATE'], df_up['ACTIVE_CARD_MAX']):
        c.Match_Draw_Up_Customer(customer, draw_up_date, active_card_max)

    match_count = len(c.MATCHING_CUSTOMERS)

    if match_count == 1:

        if c.CUSTOMER not in _processed_customers:

            to_save_counter += 1

            # Both sides of a 1-to-1 match are marked as processed.
            _direct_customer.append(c.CUSTOMER)
            _processed_customers.append(c.CUSTOMER)
            _direct_match.append(c.MATCHING_CUSTOMERS[0])
            _processed_customers.append(c.MATCHING_CUSTOMERS[0])
            _direct_draw_up_date.append(c.DRAW_UP_DATE[0])

            if verbose:
                print()
                print("DIRECT")
                print(c.CUSTOMER, c.WORD_LIST)
                print(c.MATCHING_CUSTOMERS)
                print(c.PERCENT_DIFFERENCE)
                print(c.DAYS_DIFFERENCE)
                print("=====")
                print()

    elif match_count > 1:

        if c.CUSTOMER not in _processed_customers:

            to_save_counter += 1

            # Only the draw-down side is marked as processed here.
            _multiple_customer.append(c.CUSTOMER)
            _processed_customers.append(c.CUSTOMER)
            _multiple_matches.append(c.MATCHING_CUSTOMERS)
            _multiple_drop_dates.append(c.DRAW_UP_DATE)

        # NOTE(review): unlike the DIRECT branch, this dump is printed even when
        # the customer was already processed; kept as in the original.
        if verbose:
            print()
            print("MULTIPLE")
            print(c.CUSTOMER, c.WORD_LIST)
            print(c.MATCHING_CUSTOMERS)
            print(c.PERCENT_DIFFERENCE)
            print(c.DAYS_DIFFERENCE)
            print("=====")
            print()

    else:

        # could not find a match, remove it from future processing
        _no_match_customer.append(c.CUSTOMER)
        _processed_customers.append(c.CUSTOMER)

    if to_save_counter >= save_every_n:

        # Checkpoint: the full accumulated lists are handed to the save helpers,
        # not just the records added since the previous checkpoint.
        _matching_process_log_time.append(str(pd.Timestamp.now()))
        _matching_process_log_event.append("writing datasets to snowflake")
        do_save_log(_matching_process_log_time, _matching_process_log_event)

        do_save_direct_matches(_direct_customer, _direct_match, _direct_draw_up_date)
        do_save_multiple_matches(_multiple_customer, _multiple_matches, _multiple_drop_dates)

        _matching_process_log_time.append(str(pd.Timestamp.now()))
        _matching_process_log_event.append("saved " + str(to_save_counter) + " records to snowflake.")
        do_save_log(_matching_process_log_time, _matching_process_log_event)

        to_save_counter = 0

    t1 = time.time()

    # average minutes per processed customer since t0
    avg_duration = (((t1-t0)/idx)/60.0)

    if idx % print_every_n == 0:
        idx_remaining = len(_customers)-idx
        print("processing", idx, "current record:", c.CUSTOMER, ",", idx_remaining, "remaining")
        print(round(avg_duration,2), "avg mins per iteration",  round((avg_duration*idx_remaining)/60,2), "estimated hrs remaining")
        print(len(_direct_customer), "direct match records", len(_multiple_customer), "multiple match records", len(_no_match_customer), "no match records")
        print()

# Final flush after the loop: log the start of the write, save both result
# sets, then log how many records were pending since the last checkpoint.
_flush_started_at = str(pd.Timestamp.now())
_matching_process_log_time.append(_flush_started_at)
_matching_process_log_event.append("writing datasets to snowflake")
do_save_log(_matching_process_log_time, _matching_process_log_event)

do_save_direct_matches(_direct_customer, _direct_match, _direct_draw_up_date)
do_save_multiple_matches(_multiple_customer, _multiple_matches, _multiple_drop_dates)

_flush_finished_at = str(pd.Timestamp.now())
_matching_process_log_time.append(_flush_finished_at)
_matching_process_log_event.append(f"saved {to_save_counter} records to snowflake.")
do_save_log(_matching_process_log_time, _matching_process_log_event)

34027 filtered down rows
72496 filtered up rows
1 rows successfully written (vW7Nhuhqit)

processing 25 current record: 1 SOURCE LLC , 34002 remaining
0.03 avg mins per iteration 18.65 estimated hrs remaining
4 direct match records 0 multiple match records 21 no match records

processing 50 current record: 1000198 SPM OIL  GAS INC V , 33977 remaining
0.03 avg mins per iteration 15.04 estimated hrs remaining
7 direct match records 1 multiple match records 42 no match records

processing 75 current record: 1089443 ONT LTD DBA ERAC , 33952 remaining
0.02 avg mins per iteration 13.09 estimated hrs remaining
8 direct match records 1 multiple match records 66 no match records

processing 100 current record: 14 HRS PROD INC  IN THE DARK , 33927 remaining
0.02 avg mins per iteration 14.02 estimated hrs remaining
9 direct match records 2 multiple match records 89 no match records

processing 125 current record: 1656908 ONTARIO LTD , 33902 remaining
0.03 avg mins per iteration 14.23 estimated hr

processing 1025 current record: A E S LIMOUSINE SERVICE INC , 33002 remaining
0.03 avg mins per iteration 14.82 estimated hrs remaining
68 direct match records 29 multiple match records 928 no match records

processing 1050 current record: A HAK INDUSTRIAL SERVICES , 32977 remaining
0.03 avg mins per iteration 14.79 estimated hrs remaining
69 direct match records 29 multiple match records 952 no match records

processing 1075 current record: A LA CARTE INC , 32952 remaining
0.03 avg mins per iteration 14.8 estimated hrs remaining
70 direct match records 29 multiple match records 976 no match records

4 rows successfully written (WjwoFlINYg)


saving 71 1-1 matching records

71 rows successfully written (HyknQRpSTz)


saving 29 1-n matching records

29 rows successfully written (sHEt8SCtcm)

5 rows successfully written (m3NVFjm75M)

processing 1100 current record: A MOMENT CAPTURED , 32927 remaining
0.03 avg mins per iteration 14.93 estimated hrs remaining
71 direct match records 29 mul

processing 1950 current record: AIR DOCTOR COMMERCIAL REFRIGER , 32077 remaining
0.03 avg mins per iteration 15.54 estimated hrs remaining
159 direct match records 50 multiple match records 1741 no match records

processing 1975 current record: AIR ONE INDUSTRIES INC , 32052 remaining
0.03 avg mins per iteration 15.58 estimated hrs remaining
160 direct match records 50 multiple match records 1765 no match records

processing 2000 current record: AIRCOOL TECH A C T CORP , 32027 remaining
0.03 avg mins per iteration 15.54 estimated hrs remaining
161 direct match records 51 multiple match records 1788 no match records

processing 2025 current record: AIRTON HEATING  AIR CONDITIONING CO , 32002 remaining
0.03 avg mins per iteration 15.54 estimated hrs remaining
162 direct match records 52 multiple match records 1811 no match records

processing 2050 current record: AJT TECHNOLOGY DESIGNS INC , 31977 remaining
0.03 avg mins per iteration 15.54 estimated hrs remaining
166 direct match record

processing 2850 current record: ANADARKO PETROLEUM CORP (0G72) , 31177 remaining
0.03 avg mins per iteration 15.83 estimated hrs remaining
273 direct match records 88 multiple match records 2488 no match records

processing 2875 current record: ANDERSON BOGERT ENGINEERS & SU , 31152 remaining
0.03 avg mins per iteration 15.81 estimated hrs remaining
276 direct match records 89 multiple match records 2509 no match records

processing 2900 current record: ANDREAS PROSTHESES INC , 31127 remaining
0.03 avg mins per iteration 15.84 estimated hrs remaining
279 direct match records 89 multiple match records 2531 no match records

processing 2925 current record: ANGEL RIDGE PROPERTIES , 31102 remaining
0.03 avg mins per iteration 15.89 estimated hrs remaining
281 direct match records 90 multiple match records 2553 no match records

processing 2950 current record: ANN ARBOR WELDING SUPPLY COMPANY , 31077 remaining
0.03 avg mins per iteration 15.89 estimated hrs remaining
283 direct match record

processing 3750 current record: AUTOMATIC ENTRANCES INC , 30277 remaining
0.03 avg mins per iteration 15.63 estimated hrs remaining
390 direct match records 128 multiple match records 3231 no match records

processing 3775 current record: AUTUMN TRANSPORTATION 0496 , 30252 remaining
0.03 avg mins per iteration 15.65 estimated hrs remaining
393 direct match records 130 multiple match records 3251 no match records

processing 3800 current record: AVDS INC , 30227 remaining
0.03 avg mins per iteration 15.68 estimated hrs remaining
396 direct match records 135 multiple match records 3268 no match records

processing 3825 current record: AWHAP ACQUISITION CORP , 30202 remaining
0.03 avg mins per iteration 15.68 estimated hrs remaining
400 direct match records 137 multiple match records 3287 no match records

processing 3850 current record: AYERS DRYWALL INSULATION & SUP , 30177 remaining
0.03 avg mins per iteration 15.64 estimated hrs remaining
404 direct match records 137 multiple match re

processing 4675 current record: BEST ONE TIRE  SERVICE , 29352 remaining
0.03 avg mins per iteration 15.32 estimated hrs remaining
519 direct match records 170 multiple match records 3985 no match records

processing 4700 current record: BETTS CONCRETE , 29327 remaining
0.03 avg mins per iteration 15.31 estimated hrs remaining
522 direct match records 173 multiple match records 4004 no match records

28 rows successfully written (BA5pUAkAc7)


saving 526 1-1 matching records

526 rows successfully written (eRVHzQSWzC)


saving 174 1-n matching records

174 rows successfully written (YMuXLPYFnT)

29 rows successfully written (K5odenC0sS)

processing 4725 current record: BHARWANI PROPERTIES LLC , 29302 remaining
0.03 avg mins per iteration 15.31 estimated hrs remaining
529 direct match records 174 multiple match records 4021 no match records

processing 4750 current record: BIG BLADE SHARPENING SERVICE LLC , 29277 remaining
0.03 avg mins per iteration 15.32 estimated hrs remaining
535 di

processing 5550 current record: BROWNSVILLE CONSULTING SERVICES LLC , 28477 remaining
0.03 avg mins per iteration 14.95 estimated hrs remaining
677 direct match records 212 multiple match records 4659 no match records

processing 5575 current record: BRUNSWICK TRANSIT SYSTEM INC , 28452 remaining
0.03 avg mins per iteration 14.94 estimated hrs remaining
684 direct match records 214 multiple match records 4675 no match records

36 rows successfully written (RV1KPIdapB)


saving 686 1-1 matching records

686 rows successfully written (6j8hIjhN8c)


saving 214 1-n matching records

214 rows successfully written (Dq13Ki20oY)

37 rows successfully written (NdQCZqjQnE)

processing 5600 current record: BTG AMERICAS INC 0T25 , 28427 remaining
0.03 avg mins per iteration 14.94 estimated hrs remaining
686 direct match records 215 multiple match records 4697 no match records

processing 5625 current record: BUDDYS TOWING  RECOVERY , 28402 remaining
0.03 avg mins per iteration 14.93 estimated hrs 

processing 6450 current record: CARIBE FLOW HVAC ENGINEERS INC , 27577 remaining
0.03 avg mins per iteration 14.61 estimated hrs remaining
767 direct match records 247 multiple match records 5433 no match records

processing 6475 current record: CARLO LIZZASONS PAVING INC , 27552 remaining
0.03 avg mins per iteration 14.59 estimated hrs remaining
770 direct match records 248 multiple match records 5454 no match records

processing 6500 current record: CAROLINA COASTAL RAILWAY INC , 27527 remaining
0.03 avg mins per iteration 14.59 estimated hrs remaining
773 direct match records 249 multiple match records 5475 no match records

processing 6525 current record: CAROLINAS RESURFACING , 27502 remaining
0.03 avg mins per iteration 14.57 estimated hrs remaining
775 direct match records 250 multiple match records 5497 no match records

processing 6550 current record: CARRIAGE HILLS VACATION OWNERS ASSOC , 27477 remaining
0.03 avg mins per iteration 14.56 estimated hrs remaining
780 direct mat

processing 7350 current record: CITY OF HEMET , 26677 remaining
0.03 avg mins per iteration 14.1 estimated hrs remaining
893 direct match records 286 multiple match records 6166 no match records

processing 7375 current record: CITY OF MARTINSVILLE , 26652 remaining
0.03 avg mins per iteration 14.08 estimated hrs remaining
896 direct match records 286 multiple match records 6188 no match records

processing 7400 current record: CITY OF SAN ANTONIO , 26627 remaining
0.03 avg mins per iteration 14.06 estimated hrs remaining
902 direct match records 287 multiple match records 6206 no match records

processing 7425 current record: CITY OF WICHITA , 26602 remaining
0.03 avg mins per iteration 14.05 estimated hrs remaining
904 direct match records 287 multiple match records 6229 no match records

processing 7450 current record: CIVILTECH ENGINEERING INC , 26577 remaining
0.03 avg mins per iteration 14.03 estimated hrs remaining
905 direct match records 287 multiple match records 6253 no matc

processing 8250 current record: CONSOLIDATED GRAIN AND BARG 2 , 25777 remaining
0.03 avg mins per iteration 13.61 estimated hrs remaining
997 direct match records 311 multiple match records 6937 no match records

processing 8275 current record: CONSTRUCTION DUROY INC , 25752 remaining
0.03 avg mins per iteration 13.6 estimated hrs remaining
999 direct match records 314 multiple match records 6957 no match records

processing 8300 current record: CONSUMERS PETROLEUM OF CONNECTICUT INCORPORATED , 25727 remaining
0.03 avg mins per iteration 13.57 estimated hrs remaining
1003 direct match records 315 multiple match records 6977 no match records

processing 8325 current record: CONTINENTAL STEEL FABRICATION LLC , 25702 remaining
0.03 avg mins per iteration 13.56 estimated hrs remaining
1005 direct match records 315 multiple match records 7000 no match records

processing 8350 current record: CONTROLLED CLIMATE SYSTEMS INC , 25677 remaining
0.03 avg mins per iteration 13.54 estimated hrs rem

processing 9175 current record: D2W ELECTRIC INC , 24852 remaining
0.03 avg mins per iteration 13.15 estimated hrs remaining
1107 direct match records 336 multiple match records 7726 no match records

processing 9200 current record: DAIICHI SANKYO COMPANY LIMITED , 24827 remaining
0.03 avg mins per iteration 13.14 estimated hrs remaining
1110 direct match records 338 multiple match records 7746 no match records

58 rows successfully written (MdmBrK4aFI)


saving 1112 1-1 matching records

1112 rows successfully written (PTExkvZCja)


saving 338 1-n matching records

338 rows successfully written (w1OUQnGN2l)

59 rows successfully written (qPWlL0f9nu)

processing 9225 current record: DALE HAYES MASONRY , 24802 remaining
0.03 avg mins per iteration 13.13 estimated hrs remaining
1114 direct match records 338 multiple match records 7767 no match records

processing 9250 current record: DAMIANS FLEET LOGISTICS INC , 24777 remaining
0.03 avg mins per iteration 13.13 estimated hrs remaining
1

64 rows successfully written (F6uxKGjla5)


saving 1225 1-1 matching records

1225 rows successfully written (dWvGjL014f)


saving 375 1-n matching records

375 rows successfully written (zLsDxXNnXk)

65 rows successfully written (mPHJxisRaA)

