In [29]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
CALCULATED_CARD_DRAW_UPS_FULL = dataiku.Dataset("CALCULATED_CARD_DRAW_UPS_FULL")
CALCULATED_CARD_DRAW_UPS_FULL_df = CALCULATED_CARD_DRAW_UPS_FULL.get_dataframe()

CALCULATED_CARD_DRAW_DOWNS_FULL = dataiku.Dataset("CALCULATED_CARD_DRAW_DOWNS_FULL")
CALCULATED_CARD_DRAW_DOWNS_FULL_df = CALCULATED_CARD_DRAW_DOWNS_FULL.get_dataframe()

COMMON_WORDS = dataiku.Dataset("NAFCUSTOMER_COMMON_WORDS_IN_NAMES")
COMMON_WORDS_df = COMMON_WORDS.get_dataframe()

In [32]:
df_down = CALCULATED_CARD_DRAW_DOWNS_FULL_df
df_up = CALCULATED_CARD_DRAW_UPS_FULL_df
df_common = COMMON_WORDS_df
_common_words = df_common.WORD.unique()

print(len(df_down))
print(len(df_up))
print(len(_common_words), "screening against common words")

182341
351542
2173 screening against common words


In [43]:
import string
import time
import difflib
from fuzzywuzzy import fuzz

def remove_common_words(input_string):
    
    c_str = input_string.translate(str.maketrans('', '', string.punctuation))
    f = c_str.split()
    
    c_clean = ""
    for w in f:
        if not w in _common_words:
            if len(c_clean)>0:
                c_clean += " "
            c_clean += w
            
    return(c_clean)

t0 = time.time()

df_up = CALCULATED_CARD_DRAW_UPS_FULL_df
df_up['CUSTOMER_CLEANED'] = df_up['CUSTOMER'].apply(remove_common_words)
df_up.head()

t1 = time.time()
avg_duration = ((t1-t0))
print("remove common words:", avg_duration)

customers_ = ['AMGEN USA INC', 'OMEROS CORP PO#100752', 'BRITE LINE ASPHALT MAINTENANCE', 'MOSS FARMS (04)(2)',
              'JAMES H COWAN & ASSOC INC','WATTS EQUIPMENT CO INC','CUIVRE RIVER ELECT','BENTONS EQUIPMENT & CONSTRUCTI',
             'MILLENNIUM PHARMA','CONSTELLATION BRANDS (3CRW)']

_cut_off = [0.95, 0.90, 0.8, 0.7, 0.6]

for c in customers_:

    c_clean = remove_common_words(c)
    print('processing:', c, "[", c_clean, "]")
    
    t0 = time.time()
    for co in _cut_off:

        print("cut off", co)
        matches = difflib.get_close_matches(c_clean, df_up['CUSTOMER_CLEANED'].unique(), n=300, cutoff=co)

        if len(matches)>0:
            t1 = time.time()
            avg_duration = ((t1-t0))
            print(len(matches), "matches", avg_duration)
            print()
            print(matches)
        else:
            print("no matches", avg_duration)
        print()
        
    print()
    print("---")
    print()


remove common words: 44.29470086097717
processing: AMGEN USA INC [ AMGEN ]
cut off 0.95
no matches 44.29470086097717

cut off 0.9
no matches 44.29470086097717

cut off 0.8
6 matches 2.112257242202759

['AGENDA USA INC', 'MENS USA INC', 'AGENUS INC', 'WAM USA INC', 'AMGEN USA', 'HERGEN USA INC']

cut off 0.7
72 matches 3.8523128032684326

['AGENDA USA INC', 'MENS USA INC', 'AGENUS INC', 'WAM USA INC', 'AMGEN USA', 'HERGEN USA INC', 'CARMEN CASA INC', 'BEIGENE USA INC', 'TENUSA INC', 'AGENTS INC', 'SANAC USA INC', 'GAMMA USA INC', 'AUGMENTUS INC', 'AREAS USA INC', 'MORTGAGE USA INC', 'GIISAMEX USA INC', 'AMEC USINAGE INC', 'XEL USA INC', 'TES USA INC', 'TDM USA INC', 'OEC USA INC', 'NMB USA INC', 'MBA USA INC', 'MA MUSA INC', 'HNH USA INC', 'AERO US INC', 'SAITEX USA INC', 'MORGENSTAR INC', 'MINOVA USA INC', 'ENSITE USA INC', 'CRUZAN USA INC', 'AXMEN AUTO INC', 'ATTENTI US INC', 'SHAWALMEX USA INC', 'GRAMAZINI USA INC', 'CAR AMIGO USA INC', 'NAMSA INC', 'FAMSA INC', 'CAMEN INC', 'ANURA I

KeyboardInterrupt: 

In [0]:
# Compute recipe outputs
# TODO: Write here your actual code that computes the outputs
# NB: DSS supports several kinds of APIs for reading and writing data. Please see doc.

#DRAW_UP_DOWN_ANALYSIS_df = ... # Compute a Pandas dataframe to write into DRAW_UP_DOWN_ANALYSIS

# Write recipe outputs
#DRAW_UP_DOWN_ANALYSIS = dataiku.Dataset("DRAW_UP_DOWN_ANALYSIS")
#DRAW_UP_DOWN_ANALYSIS.write_with_schema(DRAW_UP_DOWN_ANALYSIS_df)