In [11]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
def _load_input(dataset_name):
    """Open the named Dataiku dataset and return (dataset handle, DataFrame)."""
    handle = dataiku.Dataset(dataset_name)
    return handle, handle.get_dataframe()


# Each input keeps both its dataset handle and its DataFrame under the
# names the rest of the notebook expects.
CALCULATED_CARD_DRAW_UPS, CALCULATED_CARD_DRAW_UPS_df = _load_input("CALCULATED_CARD_DRAW_UPS")

CALCULATED_CARD_DRAW_UPS_FULL, CALCULATED_CARD_DRAW_UPS_FULL_df = _load_input("CALCULATED_CARD_DRAW_UPS_FULL")

CALCULATED_CARD_DRAW_DOWNS, CALCULATED_CARD_DRAW_DOWNS_df = _load_input("CALCULATED_CARD_DRAW_DOWNS")

CALCULATED_CARD_DRAW_DOWNS_FULL, CALCULATED_CARD_DRAW_DOWNS_FULL_df = _load_input("CALCULATED_CARD_DRAW_DOWNS_FULL")

COMMON_WORDS, COMMON_WORDS_df = _load_input("NAFCUSTOMER_COMMON_WORDS_IN_NAMES")

In [12]:
# Short working aliases. These are the same objects as the loaded frames,
# not copies, so the in-place sorts below also reorder the originals.
df_down = CALCULATED_CARD_DRAW_DOWNS_df
df_down_full = CALCULATED_CARD_DRAW_DOWNS_FULL_df
df_up = CALCULATED_CARD_DRAW_UPS_df
df_up_full = CALCULATED_CARD_DRAW_UPS_FULL_df
df_common = COMMON_WORDS_df

# Sort every frame by its key column: CUSTOMER for the card frames,
# WORD for the common-word list.
for _frame, _sort_key in (
    (df_down, 'CUSTOMER'),
    (df_down_full, 'CUSTOMER'),
    (df_up, 'CUSTOMER'),
    (df_up_full, 'CUSTOMER'),
    (df_common, 'WORD'),
):
    _frame.sort_values([_sort_key], inplace=True)

# Row count of each input, one per line.
for _frame, _label in (
    (df_down, "draw downs"),
    (df_down_full, "draw downs full"),
    (df_up, "draw ups"),
    (df_up_full, "draw ups full"),
    (df_common, "common words"),
):
    print(len(_frame), _label)

194 draw downs
182341 draw downs full
322 draw ups
351542 draw ups full
2173 common words


In [24]:
# Preview the first five draw-down rows.
df_down.head(5)

Unnamed: 0,CUSTOMER,DRAW_DOWN_DATE,MEAN_DD,STD_DD,ACTIVE_CARD_MAX
20662,C A AUTO GLASS,2019-06-01 00:00:00+00:00,,,1.0
20661,C A PAINT REMODEL INC,2021-08-01 00:00:00+00:00,7.333333,1.61433,10.0
20660,C A PAINTING LLC,2021-07-01 00:00:00+00:00,2.25,0.965307,5.0
20659,C A PLUMBING HEATING AND COOLING,2021-02-01 00:00:00+00:00,3.916667,0.288675,4.0
20658,C A RESIDENTIAL SOLUTIONS LLC,2021-07-01 00:00:00+00:00,,,2.0


In [37]:
# Narrow both full frames to customers whose name starts with 'A' and whose
# event date falls in 2022, printing the row count before and after.
print(len(df_down_full))
_down_prefix_a = df_down_full.loc[df_down_full['CUSTOMER'].str.startswith('A')]
df_down = _down_prefix_a.loc[_down_prefix_a['DRAW_DOWN_DATE'].dt.year == 2022]
print(len(df_down))

print(len(df_up_full))
_up_prefix_a = df_up_full.loc[df_up_full['CUSTOMER'].str.startswith('A')]
df_up = _up_prefix_a.loc[_up_prefix_a['DRAW_UP_DATE'].dt.year == 2022]
print(len(df_up))

182341
1252
351542
4706


In [22]:
# Keep only the 2022 draw ups, reporting the row count before and after.
_rows_before = len(df_up)
print(_rows_before)
df_up = df_up.loc[df_up['DRAW_UP_DATE'].dt.year == 2022]
print(len(df_up))

30756
4804


In [13]:
# NOTE: a notebook cell renders only its last expression, so the draw-down
# preview on the first line is evaluated but not shown.
df_down.head(5)
df_common.head(5)

Unnamed: 0,WORD,COUNTS
1459,1ST,337
923,5R82,192
1725,A1,517
1194,AA,244
1372,AAA,298


In [14]:
import string

# Words that are screened out of customer names before names are compared.
# Stored as a frozenset so every `w not in _common_words` test is a hash
# lookup; testing membership against the bare unique() array compares the
# word with every element on each test, and the matcher runs that test for
# every word of every name pair.
_common_words = frozenset(df_common.WORD.unique())
print(len(_common_words), "screening against common words")

class Draw_Down_Customer:
    """A draw-down customer plus the draw-up customers it has been matched to.

    A draw-up customer is recorded as a match when its name differs from this
    customer's name, the two names share at least one distinctive word, the
    two ACTIVE_CARD_MAX values are close enough and the two dates are close
    enough (see the class-level thresholds).

    Attributes:
        CUSTOMER            -- name passed to the constructor
        DRAW_DOWN_DATE      -- date passed to the constructor
        ACTIVE_CARD_MAX     -- card count passed to the constructor
        WORD_LIST           -- words of CUSTOMER (punctuation stripped) that
                               are not in the module-level `_common_words`
        MATCHING_CUSTOMERS  -- names of the matched draw-up customers
        PERCENT_DIFFERENCE  -- per match: card-count difference divided by the
                               mean of the two counts, rounded to 2 decimals
        DAYS_DIFFERENCE     -- per match: absolute date gap in days divided by
                               30 and rounded (i.e. approximate months,
                               despite the attribute name)
        DRAW_UP_DATE        -- per match: the draw-up date

    The four per-match lists are parallel: index i of each describes the same
    matched customer.
    """

    # Built once; the table removes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # A pair only matches when the rounded date gap (in 30-day units) and the
    # relative card-count difference are at or below these limits.
    MAX_MONTHS_APART = 4
    MAX_PERCENT_DIFFERENCE = 0.5

    def __init__(self, name, draw_down_date, mean_dd, std_dd, active_card_max):
        """Store the customer's details and pre-compute its distinctive words.

        mean_dd and std_dd are accepted for interface compatibility but are
        not used.
        """
        self.CUSTOMER = name
        self.DRAW_DOWN_DATE = draw_down_date
        self.ACTIVE_CARD_MAX = active_card_max

        self.MATCHING_CUSTOMERS = []
        self.PERCENT_DIFFERENCE = []
        self.DAYS_DIFFERENCE = []
        self.DRAW_UP_DATE = []

        self.WORD_LIST = [w for w in self._split_words(name) if w not in _common_words]

    @classmethod
    def _split_words(cls, name):
        """Return the whitespace-separated words of `name`, punctuation removed.

        A non-string name (e.g. None or NaN) yields no words instead of raising.
        """
        if not isinstance(name, str):
            return []
        return name.translate(cls._PUNCTUATION_TABLE).split()

    def Match_Draw_Up_Customer(self, name, draw_up_date, mean_du, std_du, active_card_max):
        """Record `name` as a match if it passes every matching rule.

        Parameters:
            name            -- draw-up customer name
            draw_up_date    -- draw-up date; subtracting DRAW_DOWN_DATE from it
                               must give an object with a `.days` attribute
            mean_du, std_du -- accepted for interface compatibility, unused
            active_card_max -- draw-up customer's card count

        Returns None. On a match, appends to MATCHING_CUSTOMERS,
        PERCENT_DIFFERENCE, DAYS_DIFFERENCE and DRAW_UP_DATE. A given name is
        recorded at most once; later calls with the same name are ignored.
        """
        if self.CUSTOMER == name:
            # exact match, already captured
            return

        if name in self.MATCHING_CUSTOMERS:
            # only the first qualifying row per name is kept
            return

        # The numeric gates are cheap and reject most pairs, so they run
        # before any string processing.
        card_midpoint = (self.ACTIVE_CARD_MAX + active_card_max) / 2
        if card_midpoint == 0:
            # relative difference is undefined (would divide by zero)
            return
        percent_diff = round(abs(self.ACTIVE_CARD_MAX - active_card_max) / card_midpoint, 2)
        # `not (x <= limit)` rather than `x > limit` so a NaN is rejected
        if not percent_diff <= self.MAX_PERCENT_DIFFERENCE:
            return

        delta_between_drop_and_rise = round(abs((draw_up_date - self.DRAW_DOWN_DATE).days) / 30., 0)
        if not delta_between_drop_and_rise <= self.MAX_MONTHS_APART:
            return

        # Words of the candidate name that may count as evidence: longer than
        # one character, not purely numeric, and not a common word.
        candidate_words = (
            w for w in self._split_words(name)
            if len(w) > 1 and not w.isnumeric() and w not in _common_words
        )

        if any(w in self.WORD_LIST for w in candidate_words):
            self.MATCHING_CUSTOMERS.append(name)
            self.PERCENT_DIFFERENCE.append(percent_diff)
            self.DAYS_DIFFERENCE.append(delta_between_drop_and_rise)
            self.DRAW_UP_DATE.append(draw_up_date)

2173 screening against common words


In [0]:
import time

def do_save_log(_matching_process_log_time, _matching_process_log_event):
    """Write the process log to the MATCHING_PROCESS_LOG dataset.

    _matching_process_log_time  -- list of log timestamps, one per event
    _matching_process_log_event -- list of event messages, parallel to the
                                   timestamps

    The two lists become the LOG_TIME and LOG_EVENT columns. Nothing is
    written when there are no timestamps.
    """
    log_frame = pd.DataFrame(_matching_process_log_time, columns=['LOG_TIME'])
    if len(log_frame) == 0:
        return

    log_frame['LOG_EVENT'] = _matching_process_log_event

    dataiku.Dataset("MATCHING_PROCESS_LOG").write_with_schema(log_frame)

    print()

def do_save_direct_matches(_direct_customer, _direct_match, _direct_draw_up_date):
    """Write the one-to-one matches to the MATCHES_1_TO_1 dataset.

    _direct_customer     -- list of customer names
    _direct_match        -- list of the single matched name for each customer
    _direct_draw_up_date -- list of the matched draw-up date for each customer

    The three parallel lists become the CUSTOMER, MATCH_CUSTOMER and
    DRAW_UP_DATE columns. Nothing is written or printed when there are no
    customers.
    """
    matches = pd.DataFrame(_direct_customer, columns=['CUSTOMER'])
    if len(matches) == 0:
        return

    print()
    print("saving", len(matches), "1-1 matching records")
    print()

    matches["MATCH_CUSTOMER"] = _direct_match
    matches["DRAW_UP_DATE"] = _direct_draw_up_date

    dataiku.Dataset("MATCHES_1_TO_1").write_with_schema(matches)

    print()

def do_save_multiple_matches(_multiple_customer, _multiple_matches, _multiple_drop_dates):
    """Write the one-to-many matches to MATCHES_1_TO_N_FOR_MANUAL_REVIEW.

    _multiple_customer   -- list of customer names
    _multiple_matches    -- per customer, the value stored in MATCH_CUSTOMER
    _multiple_drop_dates -- per customer, the value stored in DRAW_UP_DATE

    The three parallel lists become the CUSTOMER, MATCH_CUSTOMER and
    DRAW_UP_DATE columns. Nothing is written or printed when there are no
    customers.
    """
    review_rows = pd.DataFrame(_multiple_customer, columns=['CUSTOMER'])
    if len(review_rows) == 0:
        return

    print()
    print("saving", len(review_rows), "1-n matching records")
    print()

    review_rows["MATCH_CUSTOMER"] = _multiple_matches
    review_rows["DRAW_UP_DATE"] = _multiple_drop_dates

    dataiku.Dataset("MATCHES_1_TO_N_FOR_MANUAL_REVIEW").write_with_schema(review_rows)

    print()


# ---------------------------------------------------------------------------
# Matching run: compare every selected draw-down customer with every selected
# draw-up customer and save the 1-1 and 1-n matches.
# ---------------------------------------------------------------------------

_processed_customers = []
verbose = True

# Kept for reference only; nothing below iterates over it.
process_ranges = ['A','B','C']

# The run is restricted to one customer-name prefix and one calendar year.
# These replace `r_max = r[0]` / `r_min = r[1]`: `r` was never defined in
# this cell (NameError on a fresh kernel), and the card-count range it fed
# into the log message was not the filter actually applied.
CUSTOMER_PREFIX = 'B'
MATCH_YEAR = 2022

_matching_process_log_time = []
_matching_process_log_event = []

_direct_customer = []
_direct_match = []
_direct_draw_up_date = []

_multiple_customer = []
_multiple_matches = []
_multiple_drop_dates = []

save_every_n = 50
to_save_counter = 0
print_every_n = 25


def _save_progress(n_new_records):
    """Write every match collected so far, logging before and after the write."""
    _matching_process_log_time.append(str(pd.Timestamp.now()))
    _matching_process_log_event.append("writing datasets to snowflake")
    do_save_log(_matching_process_log_time, _matching_process_log_event)

    do_save_direct_matches(_direct_customer, _direct_match, _direct_draw_up_date)
    do_save_multiple_matches(_multiple_customer, _multiple_matches, _multiple_drop_dates)

    _matching_process_log_time.append(str(pd.Timestamp.now()))
    _matching_process_log_event.append("saved " + str(n_new_records) + " records to snowflake.")
    do_save_log(_matching_process_log_time, _matching_process_log_event)


def _print_match(kind, matched):
    """Print one DIRECT / MULTIPLE result block for a Draw_Down_Customer."""
    print()
    print(kind)
    print(matched.CUSTOMER, matched.WORD_LIST)
    print(matched.MATCHING_CUSTOMERS)
    print(matched.PERCENT_DIFFERENCE)
    print(matched.DAYS_DIFFERENCE)
    print("=====")
    print()


df_down = df_down_full[df_down_full.CUSTOMER.str.startswith(CUSTOMER_PREFIX)]
df_down = df_down[df_down['DRAW_DOWN_DATE'].dt.year==MATCH_YEAR]

df_up = df_up_full[df_up_full.CUSTOMER.str.startswith(CUSTOMER_PREFIX)]
df_up = df_up[df_up['DRAW_UP_DATE'].dt.year==MATCH_YEAR]

print(len(df_down), "filtered down rows")
print(len(df_up), "filtered up rows")

t0 = time.time()

# Prepare Customer Set for processing
_customers = [
    Draw_Down_Customer(customer, draw_down_date, mean_dd, std_dd, active_card_max)
    for customer, draw_down_date, mean_dd, std_dd, active_card_max in df_down[
        ['CUSTOMER', 'DRAW_DOWN_DATE', 'MEAN_DD', 'STD_DD', 'ACTIVE_CARD_MAX']
    ].itertuples(index=False, name=None)
]

# The draw-up rows are the same for every draw-down customer, so they are
# materialised once, in Match_Draw_Up_Customer's argument order, instead of
# re-running DataFrame.iterrows() inside the customer loop.
_draw_up_rows = list(
    df_up[['CUSTOMER', 'DRAW_UP_DATE', 'MEAN_DU', 'STD_DU', 'ACTIVE_CARD_MAX']]
    .itertuples(index=False, name=None)
)

_run_description = (" processing customers starting with '" + CUSTOMER_PREFIX + "' in " + str(MATCH_YEAR)
                    + " processing " + str(len(_customers)) + " Draw Down Customers")

_matching_process_log_time.append(str(pd.Timestamp.now()))
_matching_process_log_event.append(_run_description)
do_save_log(_matching_process_log_time, _matching_process_log_event)

if verbose:
    print(_run_description)

idx = 0

for idx, c in enumerate(_customers, start=1):

    # A customer that was already recorded (as a draw-down customer or as
    # somebody's 1-1 match) is never saved again, so it is skipped before the
    # expensive comparison instead of after it.
    if c.CUSTOMER not in _processed_customers:

        for draw_up_row in _draw_up_rows:
            c.Match_Draw_Up_Customer(*draw_up_row)

        if len(c.MATCHING_CUSTOMERS)==1:

            to_save_counter += 1

            _direct_customer.append(c.CUSTOMER)
            _processed_customers.append(c.CUSTOMER)
            _direct_match.append(c.MATCHING_CUSTOMERS[0])
            # the matched customer is also removed from future processing
            _processed_customers.append(c.MATCHING_CUSTOMERS[0])
            _direct_draw_up_date.append(c.DRAW_UP_DATE[0])

            if verbose:
                _print_match("DIRECT", c)

        elif len(c.MATCHING_CUSTOMERS)>1:

            to_save_counter += 1

            _multiple_customer.append(c.CUSTOMER)
            _processed_customers.append(c.CUSTOMER)
            _multiple_matches.append(c.MATCHING_CUSTOMERS)
            _multiple_drop_dates.append(c.DRAW_UP_DATE)

            if verbose:
                _print_match("MULTIPLE", c)

        else:

            # could not find a match, remove it from future processing
            _processed_customers.append(c.CUSTOMER)

    if to_save_counter>=save_every_n:
        _save_progress(to_save_counter)
        to_save_counter = 0

    t1 = time.time()

    avg_duration = (((t1-t0)/idx)/60.0)

    if idx % print_every_n == 0:
        idx_remaining = len(_customers)-idx
        print("processing", idx, "current record:", c.CUSTOMER, ",", idx_remaining, "remaining")
        print(round(avg_duration,2), "avg mins per iteration",  round(avg_duration*idx_remaining,2), "estimated minutes remaining")
        print()

# Write whatever was matched since the last periodic save; without this the
# final partial batch (fewer than save_every_n records) is never written.
if to_save_counter > 0:
    _save_progress(to_save_counter)
    to_save_counter = 0

1064 filtered down rows
3634 filtered up rows
1 rows successfully written (PW61EN7iaX)

 processing range from 55 to 45 processing 1064 Draw Down Customers

DIRECT
B & T VIVIAN INVESTMENT CO LLC ['B', 'T', 'VIVIAN']
['B  T VIVIAN INVESTMENT CO LLC']
[0.15]
[1.0]
=====


MULTIPLE
B 3 BACKFLOW INC ['B', '3', 'BACKFLOW']
['B3 BACKFLOW', 'BAILEY PLUMBING AND BACKFLOW SERVICE']
[0.5, 0.0]
[1.0, 2.0]
=====

processing 25 current record: B A BLACKTOP LTD , 1039 remaining
0.02 avg mins per iteration 24.26 estimated minutes remaining

processing 50 current record: B L LAW CONSTRUCTION CO INC , 1014 remaining
0.02 avg mins per iteration 22.85 estimated minutes remaining

processing 75 current record: BA RESOURCE DEVELOPMENT CORP , 989 remaining
0.02 avg mins per iteration 22.02 estimated minutes remaining


DIRECT
BABY BOSTAS ['BABY', 'BOSTAS']
['BABY PLZ LLC']
[0.4]
[1.0]
=====


MULTIPLE
BABY JEWEL LLC ['BABY', 'JEWEL']
['BABY BEARS DAYCARE LLC', 'BAG BABY LLC']
[0.0, 0.0]
[2.0, 4.0]
=====


M

MULTIPLE
BEL FANM BOUTIQUE INC ['BEL', 'FANM']
['BEL AIR CONDITIONING  HE', 'BEL AIR LOGISTICS LLC']
[0.4, 0.0]
[2.0, 0.0]
=====


MULTIPLE
BEL RIVE CONST INC ['BEL', 'RIVE']
['BEL AIR CONDITIONING  HE', 'BEL AIR LOGISTICS LLC']
[0.4, 0.0]
[0.0, 2.0]
=====

processing 300 current record: BELL AIR CONDITIONING LLC , 764 remaining
0.02 avg mins per iteration 17.23 estimated minutes remaining


DIRECT
BELL'S FAMILY MULTISERVICE LLC ['BELLS', 'MULTISERVICE']
['BURNEY MULTISERVICE LLC']
[0.0]
[3.0]
=====


DIRECT
BELLAS LITTLE PRINCESS PARTY BOUTIQUE LLC ['BELLAS', 'PRINCESS']
["BELLA'S ICE CREAM LLC"]
[0.0]
[1.0]
=====


MULTIPLE
BELLE EN ROSE BOUTIQUE LLC ['BELLE', 'EN']
['BALADAIR ENVOLEE EN MONTGOLFIERES INC', 'BELLE MEADOWS BAPTIST CHURCH INC']
[0.0, 0.0]
[3.0, 4.0]
=====


DIRECT
BELLE MA FORME LLC ['BELLE', 'FORME']
['BELLE LIFESTYLE SERVICE LLC']
[0.0]
[2.0]
=====


DIRECT
BELLE SANTE FAMILY PRACTICE PLLC ['BELLE', 'SANTE', 'PRACTICE']
['BELLE LIFESTYLE SERVICE LLC']
[0.0]
[1.0]
===


DIRECT
BLEDSOE COUNTY SOLID WASTE ['BLEDSOE']
['BLEDSOE ENGINEERING  SURVEYING LLC']
[0.0]
[4.0]
=====


DIRECT
BLESS MY TRANSIT LLC ['BLESS']
['BLESS CARE TRANSPORTATION']
[0.4]
[3.0]
=====


MULTIPLE
BLESSED BEING LLC ['BLESSED', 'BEING']
['BLESSED ASSURANCE LLC', 'BLESSED BETS LLC', 'BLESSED HANDS ASSISTED LIVING LLC', 'BLESSED HANDS PAINTING LLC', 'BLESSED HANDS TRANSPORTATION LLC', 'BLESSED LANDINGS LLC']
[0.29, 0.22, 0.29, 0.29, 0.22, 0.29]
[1.0, 2.0, 1.0, 1.0, 1.0, 1.0]
=====


DIRECT
BLEU DETAILING ['BLEU']
['BLEU LOTUS LLC']
[0.0]
[1.0]
=====


DIRECT
BLEU MOON INVESTMENT GROUP LLC ['BLEU']
['BLEU DOOR DESIGN']
[0.0]
[2.0]
=====


MULTIPLE
BLEU ORGANIC LLC ['BLEU', 'ORGANIC']
['BLEU LOTUS LLC', 'BLEU NATURAL LLC']
[0.4, 0.29]
[1.0, 2.0]
=====


DIRECT
BLEU REALTY LLC ['BLEU']
['BLEU DOOR DESIGN']
[0.0]
[2.0]
=====

processing 550 current record: BLEU REALTY LLC , 514 remaining
0.02 avg mins per iteration 11.64 estimated minutes remaining


MULTIPLE
BLISS BLOOM LLC ['BLISS', '

processing 725 current record: BOTANICA FLOWERS AND GIFTS LLC , 339 remaining
0.02 avg mins per iteration 7.7 estimated minutes remaining


DIRECT
BOTTICELLI PLUMBING  HEATING ['BOTTICELLI']
['BOTTICELLI PLUMBING  HEATING LLC']
[0.4]
[0.0]
=====


DIRECT
BOUJEE BESTIES BEAUTY LLC ['BOUJEE', 'BESTIES']
['BAGZ N BOUJEE LLC']
[0.0]
[0.0]
=====


MULTIPLE
BOUJEE BOXX LLC ['BOUJEE', 'BOXX']
['BAD  BOUJEE BEAUTY BAR', 'BAD AND BOUJEE APPAREL FOR PETS LLC', 'BIG BOXX TRUCKING']
[0.0, 0.0, 0.4]
[0.0, 1.0, 1.0]
=====


DIRECT
BOWEN LANDSCAPE SERVICES ['BOWEN']
['BOWEN PAINTING  REMOLDING']
[0.18]
[1.0]
=====


MULTIPLE
BOWERS AUTO LLC ['BOWERS']
['BOWERS COLLISION  REFINISHING', 'BOWERS SALES  RENTALS LLC']
[0.5, 0.4]
[0.0, 1.0]
=====


DIRECT
BOYDS LANDSCAPE CREATIONS INC ['BOYDS']
['BOYDS AUTOMOTIVE LLC']
[0.0]
[4.0]
=====


DIRECT
BOYER FORD LINCOLN (BOBCAYGEON) LTD ['BOYER', 'BOBCAYGEON']
['BOYER FORD LINCOLN BOBCAYGEON LTD']
[0.22]
[1.0]
=====


DIRECT
BOYKINS LOGISTICS AND FREIGHT BROKERI