In [29]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
CALCULATED_CARD_DRAW_UPS_FULL = dataiku.Dataset("CALCULATED_CARD_DRAW_UPS_FULL")
CALCULATED_CARD_DRAW_UPS_FULL_df = CALCULATED_CARD_DRAW_UPS_FULL.get_dataframe()

CALCULATED_CARD_DRAW_DOWNS_FULL = dataiku.Dataset("CALCULATED_CARD_DRAW_DOWNS_FULL")
CALCULATED_CARD_DRAW_DOWNS_FULL_df = CALCULATED_CARD_DRAW_DOWNS_FULL.get_dataframe()

COMMON_WORDS = dataiku.Dataset("NAFCUSTOMER_COMMON_WORDS_IN_NAMES")
COMMON_WORDS_df = COMMON_WORDS.get_dataframe()

In [32]:
df_down = CALCULATED_CARD_DRAW_DOWNS_FULL_df
df_up = CALCULATED_CARD_DRAW_UPS_FULL_df
df_common = COMMON_WORDS_df
_common_words = df_common.WORD.unique()

print(len(df_down))
print(len(df_up))
print(len(_common_words), "screening against common words")

182341
351542
2173 screening against common words


In [44]:
import string
import time
import difflib
from fuzzywuzzy import fuzz

def remove_common_words(input_string):
    
    c_str = input_string.translate(str.maketrans('', '', string.punctuation))
    f = c_str.split()
    
    c_clean = ""
    for w in f:
        if not w in _common_words:
            if len(c_clean)>0:
                c_clean += " "
            c_clean += w
            
    return(c_clean)

t0 = time.time()

df_up = CALCULATED_CARD_DRAW_UPS_FULL_df
df_up['CUSTOMER_CLEANED'] = df_up['CUSTOMER'].apply(remove_common_words)
df_up.head()

t1 = time.time()
avg_duration = ((t1-t0))
print("remove common words:", round(avg_duration,2), "seconds")

customers_ = ['AMGEN USA INC', 'OMEROS CORP PO#100752', 'BRITE LINE ASPHALT MAINTENANCE', 'MOSS FARMS (04)(2)',
              'JAMES H COWAN & ASSOC INC','WATTS EQUIPMENT CO INC','CUIVRE RIVER ELECT','BENTONS EQUIPMENT & CONSTRUCTI',
             'MILLENNIUM PHARMA','CONSTELLATION BRANDS (3CRW)']

_cut_off = [0.95, 0.90, 0.8, 0.7, 0.6]

for c in customers_:

    c_clean = remove_common_words(c)
    print('processing:', c, "[", c_clean, "]")
    print()
    
    t0 = time.time()
    for co in _cut_off:

        print("cut off", co)
        matches = difflib.get_close_matches(c_clean, df_up['CUSTOMER_CLEANED'].unique(), n=300, cutoff=co)

        if len(matches)>0:
            t1 = time.time()
            avg_duration = ((t1-t0))
            print(len(matches), "matches", avg_duration)
            print()
            print(matches)
        else:
            print("no matches", avg_duration)
        print()
        
    print()
    print("---")
    print()


remove common words: 44.24
processing: AMGEN USA INC [ AMGEN ]

cut off 0.95
1 matches 0.2221074104309082

['AMGEN']

cut off 0.9
1 matches 0.4831233024597168

['AMGEN']

cut off 0.8
18 matches 0.8085153102874756

['AMGEN', 'AMEN', 'AGEN', 'KAMAGEN', 'RAMEN', 'PAGEN', 'MEGEN', 'MAGEN', 'HAGEN', 'CAMEN', 'AXMEN', 'ARMEN', 'ARGEN', 'AMENT', 'AMEIN', 'AMDEN', 'AGENT', 'AGENC']

cut off 0.7
55 matches 1.2370011806488037

['AMGEN', 'AMEN', 'AGEN', 'KAMAGEN', 'RAMEN', 'PAGEN', 'MEGEN', 'MAGEN', 'HAGEN', 'CAMEN', 'AXMEN', 'ARMEN', 'ARGEN', 'AMENT', 'AMEIN', 'AMDEN', 'AGENT', 'AGENC', 'MALMGREN', 'MGE', 'MEN', 'AMN', 'AMG', 'AME', 'AGN', 'AEN', 'SAMENA', 'RAMAGE', 'PALMEN', 'NATGEN', 'LAYMEN', 'HAUGEN', 'HAMMEN', 'HAMDEN', 'GAMEAN', 'DAMIEN', 'DAMENU', 'DAMAGE', 'CARMEN', 'CAPMEN', 'CAMDEN', 'ARGENT', 'AMSDEN', 'AMGINE', 'AMEXON', 'AMERON', 'AMERCN', 'AMENTI', 'AMAZEN', 'AGREEN', 'AGENUS', 'AGENTS', 'AGENTE', 'AGENDA', 'ACUMEN']

cut off 0.6
300 matches 1.790696144104004

['AMGEN', 'AMEN', 'AG

no matches 1.798708200454712

cut off 0.9
no matches 1.798708200454712

cut off 0.8
no matches 1.798708200454712

cut off 0.7
1 matches 1.5682687759399414

['MOSS H']

cut off 0.6
19 matches 2.21702241897583

['MOSS H', 'SESECO 042', 'S D 042', 'MOSS', 'MASONS 502', 'H S 042', 'MIDWOOD 042', 'SMS 0BW4', 'O C S 02', 'MOSSJ215', 'LIMOS 24', 'HSS 0G92', 'COST 452', 'SSP 2', 'MOSSY', 'MOSSI', 'MOSES', 'L 042', 'MONTROSE 432']


---

processing: JAMES H COWAN & ASSOC INC [ H COWAN ]

cut off 0.95
1 matches 0.24416041374206543

['H COWAN']

cut off 0.9
1 matches 0.571366548538208

['H COWAN']

cut off 0.8
2 matches 0.9972999095916748

['H COWAN', 'COWAN']

cut off 0.7
13 matches 1.5353586673736572

['H COWAN', 'COWAN', 'CHOWAN', 'H COOPMAN', 'SHOWMAN', 'N ROWAN', 'MCGOWAN', 'MCCOWAN', 'HOWLAND', 'H NOLAN', 'COWCANS', 'H L BOWMAN', 'H C OSWALD']

cut off 0.6
139 matches 2.2176053524017334

['H COWAN', 'COWAN', 'CHOWAN', 'H COOPMAN', 'SHOWMAN', 'N ROWAN', 'MCGOWAN', 'MCCOWAN', 'HOWLAND', 'H NO

206 matches 2.6171703338623047

['DENISON CONSTRUCTIO', 'SANTOS CONTRUCTION', 'BERTYSCONSTRUCTION', 'SONSCONSTRUCTION', 'ORLANDOS CONSTRUCTI', 'MANSTON CONSTUCTION', 'F MORTON CONSTRUCTI', 'BENITEZ COSTRUCTION', 'ENCONSTRUCTION', 'BARRIENTOSCONSTRUCTIONLLC', 'YENLO CONSTRUCTON', 'ANELLO CONSTRUCTI', 'VALBEN CONSTRUCTRION', 'FULTON CONSTRUCTION3', 'MAYO CONSTRUCTI', 'ERSCONSTRUCTION', 'WARBURTON CONSTRUC', 'TKS RECONSTRUCTION', 'RIOS CONSTRUCXTION', 'RAINES CONSTRUCTIO', 'NETTLES CONSTRUCTN', 'MARBETH CONSTRUCTN', 'EXODUSCONSTRUCTION', 'CAMBRO CONSTRUCTIN', 'CALMES CONSTRUCTIO', 'JEROME CANNON CONSTRUCTI', 'CONSTRUCTI', 'TESC ONSTRUCTION', 'LYON CONSTRUCTON', 'HORSLEY CONSTRUCTIO', 'GEPHART CONSTRUCTIO', 'CLIFTON CONSTUCTION', 'CAETANO CONTRUCTION', 'YANONS FLOORSCONSTRUCTION', 'SACONSTRUCT', 'RECONSTRUCT', 'CONSTRUCTIO', 'RECONSTRUCTION', 'DECONSTRUCTION', 'BHCONSTRUCTION', 'TWOX CONSTRUCTERS', 'STOLL CONSTUCTION', 'STIMSON CONSTRCTN', 'SLOAN CONSTRUCTIO', 'SJL CCONSTRUCTION', 'LANDSCO

no matches 2.0427043437957764

cut off 0.9
no matches 2.0427043437957764

cut off 0.8
3 matches 1.046391487121582

['CONSTELLATION BRANDS3CRW', 'CONSTELLATION', 'CONSTELLATION PAI']

cut off 0.7
4 matches 1.647874355316162

['CONSTELLATION BRANDS3CRW', 'CONSTELLATION', 'CONSTELLATION PAI', 'CORINSTALLATION']

cut off 0.6
47 matches 2.8528923988342285

['CONSTELLATION BRANDS3CRW', 'CONSTELLATION', 'CONSTELLATION PAI', 'CORINSTALLATION', 'CONSTELLATION CULINA 5FV6', 'CONSTELLATION BRANDS3 CBL', 'CONSOLATION', 'INSTULLATION', 'CONSULTATION', 'CONGREGATION CHARI', 'CONSULTATION AMPERE', 'CONGREGATION HEBREW', 'CONSULTATIONS', 'CONSTANTINO S', 'CONSOLIDATION', 'STERILIZATION VIR', 'SMINSTALLATION', 'OM INTALLATION', 'DINSTALLATIONS', 'CONSTUCTION SF', 'CONSTRUCTIONNW', 'CONFECTION GCC', 'STEMULATION', 'COUNSELLING', 'CONSTUCTION', 'CONSTRUTION', 'CONSTANTINE', 'CONSERVATIO', 'CONCREATION', 'COASTLINE W', 'CELLZNATION', 'LONESTAR RELOCATION SRVC', 'COSTELLO', 'CONTAINR', 'OPERATION CROWTHER'

In [0]:
# Compute recipe outputs
# TODO: Write here your actual code that computes the outputs
# NB: DSS supports several kinds of APIs for reading and writing data. Please see doc.

#DRAW_UP_DOWN_ANALYSIS_df = ... # Compute a Pandas dataframe to write into DRAW_UP_DOWN_ANALYSIS

# Write recipe outputs
#DRAW_UP_DOWN_ANALYSIS = dataiku.Dataset("DRAW_UP_DOWN_ANALYSIS")
#DRAW_UP_DOWN_ANALYSIS.write_with_schema(DRAW_UP_DOWN_ANALYSIS_df)