In [1]:
# Python code to match additional customer lists to a master list
# need fuzzywuzzy installed
import pandas
import numpy
import sys

In [3]:
# Load the target and master customer lists and keep only the parent-mapping columns.
# "target" is the file you want to match against your master.
# The target file may contain repeated customers, but the master is assumed to
# list every customer only once.
filename_target = '~/Documents/Projects/16_01_18_Graincorp/ParentInfo/SandL_final_matching.csv'
df_target = pandas.read_csv(filename_target, encoding='latin-1')
filename_master = '~/Documents/Projects/16_01_18_Graincorp/ParentInfo/GRAINCORP_MARKETINGOILS PARENT_CHECK.csv'
df_master = pandas.read_csv(filename_master, encoding='latin-1')

# Columns needed downstream; both files are expected to provide all four.
PARENT_COLUMNS = ['Customer ID', 'Customer Name', 'Root Customer ID', 'Root Customer Name']

# Filter out the columns that are not needed.
# A list (not a tuple) is used as the column indexer because pandas may read a
# tuple as a single MultiIndex key. The explicit .copy() makes the filtered
# frames independent of the raw frames, so columns can be added or overwritten
# on them later without touching (or warning about) df_master / df_target.
df_master_filt = df_master.loc[:, PARENT_COLUMNS].copy()
df_target_filt = df_target.loc[:, PARENT_COLUMNS].copy()

# Check the resulting column names.
print(df_target_filt.columns.values)
print(df_master_filt.columns.values)

['Customer ID' 'Customer Name' 'Root Customer ID' 'Root Customer Name']
['Customer ID' 'Customer Name' 'Root Customer ID' 'Root Customer Name']


In [4]:
# This cell does the matching. The process works as follows.
# 1. Lower-case the root customer names and remove generic words (such as
#    'company') that would otherwise make unrelated customers look similar.
# 2. Iterate over the target customer list and, for each customer, iterate over
#    the master customer list.
# 3. Record the best match based on the score produced by fuzzywuzzy and write
#    one row per target customer into a dataframe.
# The 'Matched ID' / 'Matched Name' fields only take the best match when its
# score is higher than matchingLimit; otherwise they keep the target's own values.
from fuzzywuzzy import fuzz

# Column layout of the matching result written to CSV.
MATCHED_COLUMNS = ['Orig Cust Name', 'Orig Cust Name2', 'Customer Name',
                   'Customer Name2', 'Customer ID', 'Customer ID 2',
                   'Matched ID', 'Matched Name', 'Score', 'Daughter Cust', 'Daughter Cust ID']

# Generic words have to be picked manually. They are removed as plain
# substrings, in this order, after the names have been lower-cased.
GENERIC_WORDS = ['ltd', 'pty', 'australia', 'limited', 'company', 'partnership',
                 'enterprises', 'solutions', 'trust', 'holdings', 'dairies',
                 'australasia', 'rural', 'supplies', 'tavern', 'hotel', 'dairy',
                 'products', 'transport', 'national', 'new zealand']


def _strip_generic_words(names):
    """Return the string Series `names` lower-cased with every GENERIC_WORDS entry removed."""
    cleaned = names.str.lower()
    for word in GENERIC_WORDS:
        # The words contain no regex metacharacters, so the result is the same
        # whether pandas treats the pattern as a regex or as a literal.
        cleaned = cleaned.str.replace(word, '')
    return cleaned


def _prepare_names(frame):
    """Return a copy of `frame` with the original root name saved and the root name cleaned.

    'Orig Cust Name' keeps the untouched 'Root Customer Name' for reference, and
    'Root Customer Name' is replaced by its cleaned form. The cleaned name is
    always derived from 'Orig Cust Name', so re-running the cell does not
    overwrite the saved original with an already-cleaned value.
    """
    frame = frame.copy()
    if 'Orig Cust Name' not in frame.columns:
        frame['Orig Cust Name'] = frame['Root Customer Name']
    frame['Root Customer Name'] = _strip_generic_words(frame['Orig Cust Name'])
    return frame


df_target_filt = _prepare_names(df_target_filt)
df_master_filt = _prepare_names(df_master_filt)

# use this to check that the original customer names are not affected by the replace above
print(df_target_filt.head())
print(df_master_filt.head())

if df_master_filt.empty:
    raise ValueError('master customer list is empty; there is nothing to match against')

# minimum score for automatic matching set here
matchingLimit = 77

# Result rows are collected in plain lists and turned into a dataframe once at
# the end, instead of growing the dataframe one row at a time.
matched_rows = []
matched_index = []

for i, row in df_target_filt.iterrows():
    customerName = row['Root Customer Name']

    # Best-match state is reset for every target customer.
    bestScore = None
    bestMatchedName = None
    bestMatchedId = None
    bestOrigCust = None

    # for each target customer iterate over the master list to find matches
    for j, row2 in df_master_filt.iterrows():
        customerName2 = row2['Root Customer Name']
        # Score each pair once and reuse the value below.
        score = fuzz.token_sort_ratio(customerName2, customerName)

        if bestScore is None:
            # The first master row is always taken as the initial best match.
            # NOTE(review): this first row is accepted without the ID check
            # applied to later rows - confirm that is intended.
            isBetter = True
        else:
            # Update the match when the score is improved, but skip master rows
            # whose root ID scores 100 against the target's root ID.
            # NOTE(review): presumably this stops a customer from matching
            # itself - confirm.
            isBetter = score > bestScore and \
                fuzz.token_sort_ratio(row['Root Customer ID'], row2['Root Customer ID']) != 100

        if isBetter:
            bestScore = score
            bestMatchedName = customerName2
            bestMatchedId = row2['Root Customer ID']
            bestOrigCust = row2['Orig Cust Name']

    # This writes the match to the matching fields if the score is sufficient
    if bestScore > matchingLimit:
        matchedID = bestMatchedId
        matchedName = bestOrigCust
    else:
        matchedID = row['Root Customer ID']
        matchedName = row['Orig Cust Name']
    if i%100==0: print(i) # for progress update
    #if i>100: break # can be used in testing for just a few customer names

    # collect the row for the matching dataframe
    matched_index.append(i)
    matched_rows.append((row['Orig Cust Name'], bestOrigCust, customerName, bestMatchedName,
                         row['Root Customer ID'], bestMatchedId, matchedID, matchedName,
                         bestScore, row['Customer Name'], row['Customer ID']))

# build the matched dataframe, indexed by the target row labels
df_matched = pandas.DataFrame(matched_rows, columns=MATCHED_COLUMNS, index=matched_index)

df_matched.to_csv("TEST_customer_matching.csv")

# This outputs a csv file that you can then check manually. You can 'correct' an
# incorrect match by setting its Matched ID column back to the original Customer ID.
# Do the same with the matched and customer names.

   Customer ID                   Customer Name  Root Customer ID  \
0        47755         3K Transport & Trading              47755   
1         1951                   A & B GRAINS               1951   
2     13348663                     A & E Moon           13348663   
3     13345825  A & G Hunt Families Trust T/A           13345825   
4     13295450     AA Company Pty Ltd -Lynora           13295450   

          Root Customer Name                  Orig Cust Name  
0             3k  & trading          3K Transport & Trading   
1              a & b grains                    A & B GRAINS   
2                a & e moon                      A & E Moon   
3  a & g hunt families  t/a   A & G Hunt Families Trust T/A   
4             aa    -lynora      AA Company Pty Ltd -Lynora   
    Customer ID                                  Customer Name  \
0  GSAUS_100012   AUSTRALIAN COUNTRY CHOICE PRODUCTION PTY LTD   
1   GSNZ_201420                                     Korston ,M   
2          7013



In [14]:
# After the matching CSV has been reviewed and corrected by hand, load the
# adjusted file back into a dataframe.
filename_matched = '~/Documents/Projects/16_01_18_Graincorp/ParentInfo/TEST_customer_matching.csv'
df_manual_match = pandas.read_csv(filename_matched, encoding='latin-1')

# Keep the daughter customer together with its match, and rename those columns
# to the names used in the parent files.
matched_columns = ['Daughter Cust ID', 'Daughter Cust', 'Matched ID', 'Matched Name']
parent_column_names = {
    'Daughter Cust ID': 'Customer ID',
    'Daughter Cust': 'Customer Name',
    'Matched ID': 'Root Customer ID',
    'Matched Name': 'Root Customer Name',
}
parent_column_order = ['Customer ID', 'Customer Name', 'Root Customer ID', 'Root Customer Name']
df_man_match_filter1 = (
    df_manual_match
    .loc[:, matched_columns]
    .rename(columns=parent_column_names)
    [parent_column_order]
)
print(df_man_match_filter1.columns.values)

# Option to output the file for further checking
# df_man_match_filter1.to_csv("TEST_customer_manual_matching.csv")

['Customer ID' 'Customer Name' 'Root Customer ID' 'Root Customer Name']


In [18]:
# Once all of the customers in the target file are matched, add the rows of the
# original master file to produce the combined parent list.
# The master is read again from disk here rather than reusing a frame from an
# earlier cell.
df_orig_master = pandas.read_csv(filename_master, encoding='latin-1')

# A list (not a tuple) is used as the column indexer because pandas may read a
# tuple as a single MultiIndex key.
df_orig_master_filtered = df_orig_master.loc[:, ['Customer ID', 'Customer Name', 'Root Customer ID', 'Root Customer Name']]
print('Matched df length: ' + str(len(df_man_match_filter1)), \
      'Original df length: ' + str(len(df_orig_master_filtered)))

# Stack the matched target rows on top of the master rows with a fresh 0..n-1 index.
# pandas.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df_combined = pandas.concat([df_man_match_filter1, df_orig_master_filtered], ignore_index=True)
df_combined.to_csv("TEST_ALLBUS_parent_into.csv")
print('Compiled df length: '+str(len(df_combined)))

Matched df length: 2297 Original df length: 2552
Compiled df length: 4849
