In [13]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Recipe inputs: obtain both Dataiku dataset handles first, then load each
# dataset into a pandas DataFrame.
CALCULATED_CARD_DRAW_UPS = dataiku.Dataset("CALCULATED_CARD_DRAW_UPS")
CALCULATED_CARD_DRAW_DOWNS = dataiku.Dataset("CALCULATED_CARD_DRAW_DOWNS")

CALCULATED_CARD_DRAW_UPS_df = CALCULATED_CARD_DRAW_UPS.get_dataframe()
CALCULATED_CARD_DRAW_DOWNS_df = CALCULATED_CARD_DRAW_DOWNS.get_dataframe()


In [15]:
# Work on sorted copies: sort_values() without inplace=True returns a new
# frame, so the recipe input frames stay exactly as they were loaded and this
# cell can be re-run without touching them. The original row index is kept.
df_down = CALCULATED_CARD_DRAW_DOWNS_df.sort_values('CUSTOMER')
df_up = CALCULATED_CARD_DRAW_UPS_df.sort_values('CUSTOMER')

print(len(df_down), "draw downs")
print(len(df_up), "draw ups")

194 draw downs
322 draw ups


In [11]:
# Preview the first five draw-down rows (after sorting by CUSTOMER).
df_down.head(n=5)

Unnamed: 0,CUSTOMER,DRAW_DOWN_DATE,MEAN_DD,STD_DD,ACTIVE_CARD_MAX
0,3LUM,2019-12-01 00:00:00+00:00,13.0,0.57735,14.0
193,3US6 (EQUIPMENT),2021-08-01 00:00:00+00:00,36.6,27.673092,104.0
192,ACORDA THERAPEUTICS,2019-09-01 00:00:00+00:00,129.6,2.19089,133.0
191,ADF PIZZA MANAGEMENT,2020-12-01 00:00:00+00:00,7.5,1.623688,12.0
190,ALENT,2020-04-01 00:00:00+00:00,1.333333,0.492366,2.0


In [56]:
import string

class Draw_Down_Customer:
    """A draw-down customer and the draw-up customers fuzzily matched to it.

    A draw-up customer is recorded as a match when all of the following hold:

    * its name is not identical to this customer's name,
    * the two names share at least one word that is not in ``_common_words``
      (punctuation is stripped before the names are split on whitespace),
    * the draw-up and draw-down dates are at most ``MAX_MONTHS_APART``
      30-day months apart, and
    * the symmetric percent difference of the two ``active_card_max`` values
      is at most ``MAX_PERCENT_DIFFERENCE``.

    Attributes:
        CUSTOMER: name passed to the constructor, unmodified.
        DRAW_DOWN_DATE: draw-down date passed to the constructor.
        ACTIVE_CARD_MAX: active-card maximum passed to the constructor.
        WORD_LIST: words of ``CUSTOMER`` left after removing punctuation and
            common words, in their original order.
        MATCHING_CUSTOMERS: names of the matched draw-up customers.
        PERCENT_DIFFERENCE: percent difference for each match (parallel to
            ``MATCHING_CUSTOMERS``), rounded to two decimals.
        DAYS_DIFFERENCE: date gap for each match (parallel to
            ``MATCHING_CUSTOMERS``). Despite the name, the value is the gap
            in days divided by 30 and rounded, i.e. approximate months.
    """

    # Generic words that carry no identifying information; they are ignored
    # when comparing names. Comparison is case-sensitive, exactly like the
    # names themselves. A frozenset gives constant-time lookups and is built
    # once for the class instead of once per instance.
    _common_words = frozenset({
        'PIZZA', 'MANAGEMENT', 'THERAPEUTICS', 'USA', 'INC', 'US', 'EQUIPMENT',
        'MEDICAL', 'SYSTEMS', 'ANIMAL', 'HEALTH', 'LLC', 'CORPORATION',
        'BRANDS', 'TIRE', 'RUBBER', 'COUNTRY', 'CORP', 'PHARMACY',
        'RESTAURANTS', 'CONTAINER', 'AMERICA', 'APPLICATIONS', 'TECHNOLOGY',
        'INSURANCE', 'FARM', 'CREDIT', 'SERVICES', 'SERVICE', 'ACCOUNT',
        'GENERAL', 'PARTS', 'INTL', 'FLAVORS', 'HOLDINGS', 'FOOD',
        'INDUSTRIES', 'LP', 'FLEET', 'PHARMA', 'GLOBAL', 'PIPELINE', 'WHEELS',
        'BIOSCIENCES', 'SSI', 'SPRINGS', 'NORTH', 'MARINE', 'HOLDING',
        'TECHNOLOGIES', 'GROUP', 'PHARMACEUTICAL', 'NA', 'COMPANY', 'RAIL',
        'PARTNERS', 'BROS', 'CO', 'PHARMACEUTICALS', 'ENERGY', 'DISTRIBUTION',
        'DENTAL', 'SPECIALTIES', 'OPERATIONS', 'THE', 'MOUNTAIN', 'TRANS',
        'FUEL', 'AMERICAN', 'HOMES', 'GAS',
    })

    # Translation table that deletes every ASCII punctuation character.
    _punctuation_table = str.maketrans('', '', string.punctuation)

    # Matching thresholds (inclusive upper bounds).
    MAX_MONTHS_APART = 4
    MAX_PERCENT_DIFFERENCE = 0.5

    def __init__(self, name, draw_down_date, mean_dd, std_dd, active_card_max):
        """Store the draw-down record and pre-compute its significant words.

        Args:
            name: customer name (a string).
            draw_down_date: date of the draw down; it must support
                subtraction from a draw-up date yielding an object with a
                ``.days`` attribute.
            mean_dd: accepted for interface compatibility; not used.
            std_dd: accepted for interface compatibility; not used.
            active_card_max: active-card maximum used for the percent
                difference check.
        """
        self.CUSTOMER = name
        self.DRAW_DOWN_DATE = draw_down_date
        self.ACTIVE_CARD_MAX = active_card_max

        self.MATCHING_CUSTOMERS = []
        self.PERCENT_DIFFERENCE = []
        self.DAYS_DIFFERENCE = []

        self.WORD_LIST = self._significant_words(name)

    @classmethod
    def _significant_words(cls, name):
        """Return the words of ``name`` that are not common words, in order."""
        words = name.translate(cls._punctuation_table).split()
        return [word for word in words if word not in cls._common_words]

    @staticmethod
    def _percent_difference(first, second):
        """Return |first - second| relative to their mean, rounded to 2 places.

        When the mean is zero the ratio is undefined: two equal values are
        reported as 0.0 (no difference) and unequal values as infinity, so
        the caller never sees a ZeroDivisionError.
        """
        gap = abs(first - second)
        midpoint = (first + second) / 2
        if midpoint == 0:
            return 0.0 if gap == 0 else float('inf')
        return round(gap / midpoint, 2)

    def Match_Draw_Up_Customer(self, name, draw_up_date, mean_du, std_du, active_card_max):
        """Record ``name`` as a match if it satisfies every matching rule.

        Args:
            name: draw-up customer name (a string).
            draw_up_date: date of the draw up; see ``draw_down_date`` in
                ``__init__`` for the required behaviour.
            mean_du: accepted for interface compatibility; not used.
            std_du: accepted for interface compatibility; not used.
            active_card_max: active-card maximum of the draw-up customer.

        Returns:
            None. On a match, ``MATCHING_CUSTOMERS``, ``PERCENT_DIFFERENCE``
            and ``DAYS_DIFFERENCE`` each gain one entry; a name is recorded
            at most once.
        """
        if self.CUSTOMER == name:
            # exact match, already captured
            return

        if name in self.MATCHING_CUSTOMERS:
            return

        # Cheap name test first: at least one significant word must be shared.
        own_words = set(self.WORD_LIST)
        if not any(word in own_words for word in self._significant_words(name)):
            return

        percent_diff = self._percent_difference(self.ACTIVE_CARD_MAX, active_card_max)

        # Date gap expressed in whole 30-day months.
        months_apart = round(abs((draw_up_date - self.DRAW_DOWN_DATE).days) / 30., 0)

        # Written as positive "<=" tests so that a NaN on either side fails
        # the comparison and therefore never produces a match.
        if months_apart <= self.MAX_MONTHS_APART and percent_diff <= self.MAX_PERCENT_DIFFERENCE:
            self.MATCHING_CUSTOMERS.append(name)
            self.PERCENT_DIFFERENCE.append(percent_diff)
            self.DAYS_DIFFERENCE.append(months_apart)

In [59]:
# Build one Draw_Down_Customer per draw-down row.
_customers = [
    Draw_Down_Customer(customer, draw_down_date, mean_dd, std_dd, active_card_max)
    for customer, draw_down_date, mean_dd, std_dd, active_card_max in df_down[
        ['CUSTOMER', 'DRAW_DOWN_DATE', 'MEAN_DD', 'STD_DD', 'ACTIVE_CARD_MAX']
    ].itertuples(index=False, name=None)
]

# Extract the draw-up rows once as plain tuples, in the argument order of
# Match_Draw_Up_Customer, instead of re-iterating the DataFrame for every
# draw-down customer.
draw_up_rows = list(
    df_up[
        ['CUSTOMER', 'DRAW_UP_DATE', 'MEAN_DU', 'STD_DU', 'ACTIVE_CARD_MAX']
    ].itertuples(index=False, name=None)
)

idx = 0  # number of (draw down, draw up) pairs compared
verbose = False

_matches = []           # draw-down customers with exactly one matching draw-up customer
_multiple_matches = []  # draw-down customers with more than one matching draw-up customer

for c in _customers:

    for draw_up_row in draw_up_rows:
        idx += 1
        c.Match_Draw_Up_Customer(*draw_up_row)

    match_count = len(c.MATCHING_CUSTOMERS)

    if match_count == 1:
        _matches.append(c)

        print(c.CUSTOMER, c.MATCHING_CUSTOMERS)

        if verbose:
            print(c.CUSTOMER, c.WORD_LIST)
            # The value shown is the draw-DOWN date of this customer.
            print("Draw Down Date:", c.DRAW_DOWN_DATE)
            print("Cards", c.ACTIVE_CARD_MAX)
            print()
            print(c.MATCHING_CUSTOMERS)
            print(c.PERCENT_DIFFERENCE)
            print(c.DAYS_DIFFERENCE)
            print()
            print("=====")
            print()

    elif match_count > 1:
        # TODO: choose between the candidates; for now they are only collected.
        _multiple_matches.append(c)
        print()
        print("deal with multiple matches")
        print()

print(idx)

ALKERMES (3ALK) ['ALKERMES', '3ALK']
Draw Up Date: 2022-01-01 00:00:00+00:00
Cards 349.0

['ALKERMES INC']
[0.25]
[2.0]

=====

AMERICAN HOMES 4 RENT ['4', 'RENT']
Draw Up Date: 2021-04-01 00:00:00+00:00
Cards 367.0

['AMERICAN HOMES 4 RENT LP']
[0.07]
[2.0]

=====

AMGEN USA INC ['AMGEN']
Draw Up Date: 2021-03-01 00:00:00+00:00
Cards 2933.0

['AMGEN USA']
[0.19]
[2.0]

=====

ARCADIS US ['ARCADIS']
Draw Up Date: 2022-02-01 00:00:00+00:00
Cards 348.0

['ARCADIS US INC']
[0.08]
[2.0]

=====

ARCHROMA U.S. INC. ['ARCHROMA']
Draw Up Date: 2021-07-01 00:00:00+00:00
Cards 29.0

['ARCHROMA US INC']
[0.27]
[1.0]

=====

ASTELLAS PHARMA ['ASTELLAS']
Draw Up Date: 2021-09-01 00:00:00+00:00
Cards 957.0

['ASTELLAS PHARMA US INC']
[0.04]
[2.0]

=====


deal with multiple matches

BALFOUR EQUIP ['BALFOUR', 'EQUIP']
Draw Up Date: 2021-08-01 00:00:00+00:00
Cards 113.0

['BALFOUR BEATTY3BBR']
[0.11]
[1.0]

=====

BALFOUR RENO ['BALFOUR', 'RENO']
Draw Up Date: 2021-06-01 00:00:00+00:00
Cards 24.0

['B

WATERS TECHNOLOGIES ['WATERS']
Draw Up Date: 2021-12-01 00:00:00+00:00
Cards 657.0

['WATERS TECHNOLOGIES CORPORATION']
[0.01]
[1.0]

=====

WATT STOPPER, INC. ['WATT', 'STOPPER']
Draw Up Date: 2021-08-01 00:00:00+00:00
Cards 65.0

['WATT STOPPER INC']
[0.39]
[1.0]

=====


deal with multiple matches

YUM  INC. ['YUM']
Draw Up Date: 2021-04-01 00:00:00+00:00
Cards 191.0

['YUM RESTAURANT SERVICES GROUP LLC']
[0.21]
[1.0]

=====

62468


In [24]:
# Number of draw-up rows; every draw-down customer is compared against each of them.
df_up.shape[0]

322

In [0]:

# Compute recipe outputs
# TODO: Write here your actual code that computes the outputs
# NB: DSS supports several kinds of APIs for reading and writing data. Please see doc.

#CONSOLIDATED_MATCHES_df = ... # Compute a Pandas dataframe to write into CONSOLIDATED_MATCHES


# Write recipe outputs
#CONSOLIDATED_MATCHES = dataiku.Dataset("CONSOLIDATED_MATCHES")
#CONSOLIDATED_MATCHES.write_with_schema(CONSOLIDATED_MATCHES_df)