In [83]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
# Each input keeps two module-level names: the Dataiku dataset handle and the
# pandas DataFrame loaded from it (suffix ``_df``).
def _read_input(dataset_name):
    """Open the named Dataiku dataset and return ``(handle, dataframe)``."""
    handle = dataiku.Dataset(dataset_name)
    return handle, handle.get_dataframe()


CALCULATED_CARD_DRAW_UPS, CALCULATED_CARD_DRAW_UPS_df = _read_input("CALCULATED_CARD_DRAW_UPS")

CALCULATED_CARD_DRAW_UPS_FULL, CALCULATED_CARD_DRAW_UPS_FULL_df = _read_input("CALCULATED_CARD_DRAW_UPS_FULL")

CALCULATED_CARD_DRAW_DOWNS, CALCULATED_CARD_DRAW_DOWNS_df = _read_input("CALCULATED_CARD_DRAW_DOWNS")

CALCULATED_CARD_DRAW_DOWNS_FULL, CALCULATED_CARD_DRAW_DOWNS_FULL_df = _read_input("CALCULATED_CARD_DRAW_DOWNS_FULL")

COMMON_WORDS, COMMON_WORDS_df = _read_input("NAFCUSTOMER_COMMON_WORDS_IN_NAMES")

In [84]:
# Short working aliases for the loaded frames. These are the same objects as
# the *_df names, not copies.
df_down, df_down_full = CALCULATED_CARD_DRAW_DOWNS_df, CALCULATED_CARD_DRAW_DOWNS_FULL_df
df_up, df_up_full = CALCULATED_CARD_DRAW_UPS_df, CALCULATED_CARD_DRAW_UPS_FULL_df
df_common = COMMON_WORDS_df

# Order every frame by its key column. The sort is done in place, so the
# original *_df names observe the new row order as well.
for _frame, _sort_key in (
    (df_down, 'CUSTOMER'),
    (df_down_full, 'CUSTOMER'),
    (df_up, 'CUSTOMER'),
    (df_up_full, 'CUSTOMER'),
    (df_common, 'WORD'),
):
    _frame.sort_values([_sort_key], inplace=True)

# Row counts of each input, printed as a quick sanity check.
for _frame, _label in (
    (df_down, "draw downs"),
    (df_down_full, "draw downs full"),
    (df_up, "draw ups"),
    (df_up_full, "draw ups full"),
    (df_common, "common words"),
):
    print(len(_frame), _label)

194 draw downs
182341 draw downs full
322 draw ups
351542 draw ups full
2173 common words


In [85]:
# Preview the first rows of the draw-down frame and of the common-word frame.
# NOTE(review): a notebook cell renders only the value of its last expression,
# so the df_down preview is evaluated and then discarded; only the df_common
# preview is shown. Confirm whether the first line is still wanted.
df_down.head()
df_common.head()

Unnamed: 0,WORD,COUNTS
1459,1ST,337
923,5R82,192
1725,A1,517
1194,AA,244
1372,AAA,298


In [86]:
import string

# Distinct values of the WORD column; Draw_Down_Customer skips any name token
# that appears in this collection.
_common_words = df_common["WORD"].unique()
print(len(_common_words), "screening against common words")

class Draw_Down_Customer:
    """One draw-down customer plus the draw-up customers matched against it.

    A draw-up customer is recorded as a match when all of the following hold:

    * its name differs from this customer's name,
    * the two ACTIVE_CARD_MAX values differ by at most MAX_PERCENT_DIFFERENCE
      (absolute difference divided by their mean, rounded to 2 decimals),
    * the draw-up date is at most MAX_MONTHS_APART 30-day months away from
      the draw-down date (rounded to a whole number of months),
    * the two names share at least one token that is not a common word, is
      longer than one character and is not purely numeric.

    MATCHING_CUSTOMERS, PERCENT_DIFFERENCE, DAYS_DIFFERENCE and DRAW_UP_DATE
    are parallel lists: position ``i`` of each describes the same match.
    Despite its name, DAYS_DIFFERENCE holds the rounded number of 30-day
    months, not days.

    Tokens are screened against the module-level ``_common_words``
    collection, which must exist when instances are created or matched.
    """

    # Matching thresholds; override on a subclass or instance to tune them.
    MAX_MONTHS_APART = 4
    MAX_PERCENT_DIFFERENCE = 0.5

    # Deletes every ASCII punctuation character. Built once instead of on
    # every call.
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, name, draw_down_date, mean_dd, std_dd, active_card_max):
        """Store the customer's fields and derive its distinctive name tokens.

        Args:
            name: customer name; a non-string value (e.g. NaN from a null)
                yields an empty WORD_LIST instead of raising.
            draw_down_date: date of the draw down; must support subtraction
                with the draw-up dates passed to Match_Draw_Up_Customer,
                giving an object with a ``days`` attribute.
            mean_dd: accepted for interface compatibility; not used.
            std_dd: accepted for interface compatibility; not used.
            active_card_max: card count compared against draw-up customers.
        """
        self.CUSTOMER = name
        self.DRAW_DOWN_DATE = draw_down_date
        self.ACTIVE_CARD_MAX = active_card_max

        self.MATCHING_CUSTOMERS = []
        self.PERCENT_DIFFERENCE = []
        self.DAYS_DIFFERENCE = []
        self.DRAW_UP_DATE = []

        # Name tokens, in order, with common words removed. One-character
        # and numeric tokens are kept here but can never produce a match,
        # because Match_Draw_Up_Customer rejects them on the draw-up side.
        self.WORD_LIST = [w for w in self._split_words(name) if w not in _common_words]

    @classmethod
    def _split_words(cls, name):
        """Return the whitespace-separated tokens of ``name`` without punctuation.

        NOTE(review): punctuation is deleted rather than replaced by a space,
        so text such as ``SRVCS(3SCR)`` becomes the single token
        ``SRVCS3SCR``. Left unchanged because altering it changes which
        customers match; confirm whether that is intended.
        """
        if not isinstance(name, str):
            return []
        return name.translate(cls._STRIP_PUNCTUATION).split()

    def Match_Draw_Up_Customer(self, name, draw_up_date, mean_du, std_du, active_card_max):
        """Record ``name`` as a match if it satisfies every matching rule.

        The numeric rules are evaluated before the name is tokenised, so
        pairs that fail them cost no string work. A given name is recorded
        at most once.

        Args:
            name: draw-up customer name; non-string values never match.
            draw_up_date: date of the draw up.
            mean_du: accepted for interface compatibility; not used.
            std_du: accepted for interface compatibility; not used.
            active_card_max: draw-up customer's card count.

        Returns:
            None. Matches are appended to the instance's parallel lists.
        """
        if self.CUSTOMER == name:
            # exact match, already captured
            return

        if name in self.MATCHING_CUSTOMERS:
            return

        card_sum = self.ACTIVE_CARD_MAX + active_card_max
        if card_sum == 0:
            # The relative difference is undefined. numpy inputs produced NaN
            # here (which fails the threshold); plain Python numbers raised
            # ZeroDivisionError. Both are now treated as "no match".
            return

        percent_diff = round(abs(self.ACTIVE_CARD_MAX - active_card_max) / (card_sum / 2), 2)
        # Written as "not <=" so that a NaN difference is rejected as well.
        if not percent_diff <= self.MAX_PERCENT_DIFFERENCE:
            return

        delta_between_drop_and_rise = round(abs((draw_up_date - self.DRAW_DOWN_DATE).days) / 30., 0)
        if not delta_between_drop_and_rise <= self.MAX_MONTHS_APART:
            return

        for w in self._split_words(name):
            # The cheap tests run first; the scan of _common_words is only
            # reached for tokens this customer actually shares.
            if w in self.WORD_LIST and len(w) > 1 and not w.isnumeric() and w not in _common_words:
                self.MATCHING_CUSTOMERS.append(name)
                self.PERCENT_DIFFERENCE.append(percent_diff)
                self.DAYS_DIFFERENCE.append(delta_between_drop_and_rise)
                self.DRAW_UP_DATE.append(draw_up_date)
                return

2173 screening against common words


In [0]:
idx = 0
max_idx = 0  # NOTE(review): assigned here and in the loop but never read in this cell
_processed_customers = []
verbose = True

# [upper, lower] bounds on ACTIVE_CARD_MAX, both inclusive, processed in this
# order. Neighbouring ranges overlap; _processed_customers keeps a customer
# from being reported as a direct or multiple match more than once.
process_ranges = [[100000,1000],[1100,900],[1000,600],[700,300],[400,100],[150,70],[100,40]]


def _print_match_report(label, dd_customer):
    """Print the verbose report block for one draw-down customer."""
    print()
    print(label)
    print(dd_customer.CUSTOMER, dd_customer.WORD_LIST)
    print(dd_customer.MATCHING_CUSTOMERS)
    print(dd_customer.PERCENT_DIFFERENCE)
    print(dd_customer.DAYS_DIFFERENCE)
    print("=====")
    print()


for r in process_ranges:

    r_max = r[0]
    r_min = r[1]

    # Restart the counter for every range. Previously it still held the
    # pair count of the preceding range, so the figure printed after the
    # preparation loop was wrong from the second range onwards.
    idx = 0

    df_down = df_down_full[(df_down_full.ACTIVE_CARD_MAX<=r_max)&(df_down_full.ACTIVE_CARD_MAX>=r_min)]
    df_up = df_up_full[(df_up_full.ACTIVE_CARD_MAX<=r_max)&(df_up_full.ACTIVE_CARD_MAX>=r_min)]

    print(len(df_down), "filtered down rows")
    print(len(df_up), "filtered up rows")

    max_idx = 1000

    _customers = []
    # Prepare Customer Set
    for index, row in df_down.iterrows():

        idx+=1

        c = Draw_Down_Customer(row['CUSTOMER'], row['DRAW_DOWN_DATE'], row['MEAN_DD'], row['STD_DD'], row['ACTIVE_CARD_MAX'])

        _customers.append(c)

    print(idx)

    ##
    _range_event = " processing range from " + str(r_max) + " to " + str(r_min) + " " + str(len(_customers)) + " Draw Down Customers"
    if verbose:
        print(_range_event)

    # From here on idx counts (draw-down, draw-up) pairs compared in this range.
    idx = 0

    _matching_process_log_time = []
    _matching_process_log_event = []
    _matching_process_log_time.append(str(pd.Timestamp.now()))
    _matching_process_log_event.append(_range_event)

    _direct_customer = []
    _direct_match = []
    _direct_draw_up_date = []

    _multiple_customer = []
    _multiple_matches = []
    _multiple_drop_dates = []

    _no_match_customer = []
    _no_match_draw_down_date = []

    # Extract the draw-up fields once per range. Re-running iterrows() for
    # every draw-down customer rebuilt the same rows each time.
    _up_rows = [
        (row_up['CUSTOMER'], row_up['DRAW_UP_DATE'], row_up['MEAN_DU'], row_up['STD_DU'], row_up['ACTIVE_CARD_MAX'])
        for index_up, row_up in df_up.iterrows()
    ]

    for c in _customers:

        for customer, draw_up_date, mean_du, std_du, active_card_max in _up_rows:

            idx+=1

            c.Match_Draw_Up_Customer(customer, draw_up_date, mean_du, std_du, active_card_max)

        if len(c.MATCHING_CUSTOMERS)==1:

            if not c.CUSTOMER in (_processed_customers):
                _direct_customer.append(c.CUSTOMER)
                _processed_customers.append(c.CUSTOMER)
                _direct_match.append(c.MATCHING_CUSTOMERS[0])
                # The matched draw-up name is marked as processed too.
                _processed_customers.append(c.MATCHING_CUSTOMERS[0])
                _direct_draw_up_date.append(c.DRAW_UP_DATE[0])

                if verbose:
                    _print_match_report("DIRECT", c)

        elif len(c.MATCHING_CUSTOMERS)>1:

            if not c.CUSTOMER in (_processed_customers):
                _multiple_customer.append(c.CUSTOMER)
                _processed_customers.append(c.CUSTOMER)
                _multiple_matches.append(c.MATCHING_CUSTOMERS)
                _multiple_drop_dates.append(c.DRAW_UP_DATE)

            # NOTE(review): unlike the DIRECT branch, this report is printed
            # even when the customer had already been processed. Confirm
            # which of the two behaviours is intended.
            if verbose:
                _print_match_report("MULTIPLE", c)
        else:
            _no_match_customer.append(c.CUSTOMER)
            _no_match_draw_down_date.append(c.DRAW_DOWN_DATE)

    print(idx)
    print()

    _matching_process_log_time.append(str(pd.Timestamp.now()))
    _matching_process_log_event.append("writing datasets to snowflake")

    df_matches = pd.DataFrame(_direct_customer)
    if len(df_matches)>0:
        df_matches.columns = ['CUSTOMER']
        df_matches["MATCH_CUSTOMER"] = _direct_match
        df_matches["DRAW_UP_DATE"] = _direct_draw_up_date

    df_multiple_matches = pd.DataFrame(_multiple_customer)
    if len(df_multiple_matches)>0:
        df_multiple_matches.columns = ['CUSTOMER']
        df_multiple_matches["MATCH_CUSTOMER"] = _multiple_matches
        df_multiple_matches["DRAW_UP_DATE"] = _multiple_drop_dates

    # Guarded like the two frames above: a frame built from an empty list has
    # no columns, so naming one raised ValueError whenever a range produced
    # no unmatched customers (for example when the range held no rows).
    df_no_matches = pd.DataFrame(_no_match_customer)
    if len(df_no_matches)>0:
        df_no_matches.columns = ['CUSTOMER']
        df_no_matches['DRAW_DOWN_DATE'] = _no_match_draw_down_date

    df_matching_log = pd.DataFrame(_matching_process_log_time)
    df_matching_log.columns = ['LOG_TIME']
    df_matching_log['LOG_EVENT'] = _matching_process_log_event

    # NOTE(review): the writes below run once per range, and the result lists
    # are rebuilt for every range. Unless the output datasets are configured
    # to append, each range presumably replaces what the previous one wrote.
    # Confirm against the dataset settings.
    if len(df_multiple_matches)>0:
        MATCHES_1_TO_N_FOR_MANUAL_REVIEW_df = df_multiple_matches
        MATCHES_1_TO_N_FOR_MANUAL_REVIEW = dataiku.Dataset("MATCHES_1_TO_N_FOR_MANUAL_REVIEW")
        MATCHES_1_TO_N_FOR_MANUAL_REVIEW.write_with_schema(MATCHES_1_TO_N_FOR_MANUAL_REVIEW_df)

    if len(df_matches)>0:
        MATCHES_1_TO_1_df = df_matches
        MATCHES_1_TO_1 = dataiku.Dataset("MATCHES_1_TO_1")
        MATCHES_1_TO_1.write_with_schema(MATCHES_1_TO_1_df)

    if len(df_no_matches)>0:
        MATCHES_1_TO_NONE_df = df_no_matches
        MATCHES_1_TO_NONE = dataiku.Dataset("MATCHES_1_TO_NONE")
        MATCHES_1_TO_NONE.write_with_schema(MATCHES_1_TO_NONE_df)

    MATCHING_PROCESS_LOG_df = df_matching_log
    MATCHING_PROCESS_LOG = dataiku.Dataset("MATCHING_PROCESS_LOG")
    MATCHING_PROCESS_LOG.write_with_schema(MATCHING_PROCESS_LOG_df)

84 filtered down rows
122 filtered up rows
84
 processing range from 100000 to 1000 84 Draw Down Customers

DIRECT
AMGEN USA INC ['AMGEN']
['AMGEN USA']
[0.19]
[2.0]
=====


DIRECT
ANADARKO PETROLEUM CORP (0G72) ['ANADARKO', '0G72']
['ANADARKO PETROLEUM 0G72']
[0.18]
[1.0]
=====


DIRECT
BELFOR (2M22) ['BELFOR', '2M22']
['BELFOR USA GROUP 2M22']
[0.0]
[2.0]
=====


DIRECT
BLACK HILLS ENERGY (5BB5) ['5BB5']
['BH EXPLORATION  PRO 5BB5']
[0.23]
[1.0]
=====


DIRECT
CONSUMERS ENERGY ['CONSUMERS']
['CONSUMERS ENERGY PO 4400092713']
[0.03]
[1.0]
=====


DIRECT
CSX ['CSX']
['CSX TRANSPORTATION 5AK0']
[0.01]
[2.0]
=====


DIRECT
NATIONAL MENTOR HOLD 5K09 ['MENTOR', 'HOLD', '5K09']
['0478006606370 - MENTOR NETWORK']
[0.45]
[1.0]
=====


DIRECT
SECURITAS SECURITY SRVCS(3SCR) ['SECURITAS', 'SRVCS3SCR']
['SECURITAS SECURITY SERVICES USA INC']
[0.05]
[1.0]
=====


DIRECT
SUNRUN SOUTH LLC (5EF5) ['SUNRUN', '5EF5']
['SUNRUN SOUTH']
[0.36]
[2.0]
=====


DIRECT
TAK REPORTING ['TAK', 'REPORTING']
['TAK 

LOUISIANA MACHINERY (5G85) ['5G85']
['LOUISIANA MACHINERY 5G85']
[0.16]
[1.0]
=====


DIRECT
MACDONALD-MILLER FS (2N69) ['MACDONALDMILLER', 'FS', '2N69']
['MACDONALDMILLER FAC 2N69']
[0.13]
[1.0]
=====


DIRECT
MARCO HOLDINGS (03) ['MARCO', '03']
['MARCO HOLDINGS 03']
[0.02]
[2.0]
=====


DIRECT
MATRIX (5AE7) ['5AE7']
['MATRIX NAC INC 5AE7']
[0.27]
[1.0]
=====


DIRECT
MCCOY CORPORATION ['MCCOY']
['MCCOY CORP']
[0.27]
[1.0]
=====


DIRECT
MCKINSTRY COMPANY(2A63) ['MCKINSTRY', 'COMPANY2A63']
['MCKINSTRY COMPANY 2A63']
[0.1]
[1.0]
=====


DIRECT
MEREDITH CORP. (5095) ['MEREDITH', '5095']
['MEREDITH CORPORATION 5095']
[0.05]
[1.0]
=====


DIRECT
MISSION LINEN SUPPLY (5AK4) ['5AK4']
['MISSION LINEN SUPPLY 5AK4']
[0.13]
[1.0]
=====


DIRECT
MOMENTUM SOLAR 0AZ7 ['MOMENTUM', '0AZ7']
['MOMENTUM SOLAR LLC']
[0.04]
[1.0]
=====


DIRECT
NYSARC NASSAU CHAPTER ['NYSARC', 'NASSAU', 'CHAPTER']
['AHRC NASSAU']
[0.11]
[3.0]
=====


DIRECT
OCONNELL ELECTRIC CO INC ['OCONNELL']
['OCONNELL ELECTRIC COMPAN