In [1]:
import pandas as pd
import numpy as np

In [2]:
# Columns to load from the shippers CSV; anything not listed is skipped at read time.
usecols = [
    'identifier',
    'shipper_party_name',
    'shipper_party_address_1',
    'shipper_party_address_2',
    'shipper_party_address_3',
    'shipper_party_address_4',
    'city',
    'state_province',
    'zip_code',
    'country_code',
]

In [3]:
# Per-column dtypes handed to pandas when reading the CSV.
dtype = {
    # Identifier, name and address lines are kept as plain strings.
    'identifier': str,
    'shipper_party_name': str,
    'shipper_party_address_1': str,
    'shipper_party_address_2': str,
    'shipper_party_address_3': str,
    'shipper_party_address_4': str,
    # Location fields are stored as pandas categoricals.
    'city': 'category',
    'state_province': 'category',
    'zip_code': 'category',
    'country_code': 'category',
}

In [4]:
# Load only the selected columns, with the dtypes declared above, from the 2018 shippers file.
data = pd.read_csv(
    'raw_data/2018/AMSShippers-2018.csv',
    usecols=usecols,
    dtype=dtype,
)

In [None]:
# Count how often each shipper name occurs and show the first rows of the
# result (value_counts() orders by descending frequency by default).
data['shipper_party_name'].value_counts().head()

ORIENT EXPRESS CONTAINER CO., LTD.     48203
HONOUR LANE SHIPPING LIMITED           33600
BEIJING KANG JIE KONG INTERNATIONAL    28668
DE WELL CONTAINER SHIPPING INC.        27913
HECNY SHIPPING LIMITED                 24245
Name: shipper_party_name, dtype: int64

In [None]:
# Rows whose shipper name contains the literal substring 'dhl', ignoring case.
# regex=False treats the pattern as plain text; na=False makes missing names
# count as "no match" instead of propagating NaN into the boolean mask.
data.loc[data['shipper_party_name'].str.contains('dhl',case=False,regex=False,na=False)]

Unnamed: 0,identifier,shipper_party_name,shipper_party_address_1,shipper_party_address_2,shipper_party_address_3,shipper_party_address_4,city,state_province,zip_code,country_code
186,20180101204,DHL GLOBAL FORWARDING MALAYSIA SDN,"LEVEL 9, MCT TOWER ONE CITY, JALAN",SUBANG JAYA SELANGOR,MALAYSIA,,,,,
421,20180101520,DHL GLOBAL FORWARDING (CANADA) INC,230 - 13091 VANIER PL,V6V 2J1,,,"RICHMOND, BC",BC,V6V2J1,CA
615,20180101792,DHL ISC (HK) LIMITED O/B,HONGKONG NEWCT DEVELOPMENT LIMITED,"ROOM 1103,HANG SENG MONGKOK","BUILDING,677 NATHAN ROAD,",,,,,
822,201801011097,DHL GLOBAL FORWARDING CHINA COMPANY,7F PHILIPS RESEARCH AND DEVELOPMENT,SHENZHEN GD 518000,CHINA,,,,,
1205,201801011590,DHL LOGISTICS PRIVATE LIMITED,"REGUS BUSINESS CENTRE, 403-404","4TH FLOOR, OPP:HDFC BANK, RAMNAGAR","VISAKHAPATNAM-530002.AP,INDIA",,91-8916630119 TEL,EX,91-89166,30
1207,201801011592,DHL LOGISTICS PRIVATE LIMITED,"REGUS BUSINESS CENTRE, 403-404","4TH FLOOR, OPP:HDFC BANK, RAMNAGAR","VISAKHAPATNAM-530002.AP,INDIA",,91-8916630119 TEL,EX,91-89166,30
1428,201801011866,DHL GLOBAL FORWARDING (CHINA) COMPA,"7 FLOOR BUILDING A,PACIFIC PLAZA, N",NINGBO ZJ 315001,CHINA,,,,,
1436,201801011883,DHL GLOBAL FORWARDING CHINA CO LTD,SHENZHEN BRANCH,7F PHILIPS RESEARCH AND DEVELOPMENT,BUILDING NO12 SHIHUA RD FUTIAN FTZ,,,,,
1630,201801012123,DHL GLOBAL FORWARDING CHINA CO LTD,SHENZHEN BRANCH,7F PHILIPS RESEARCH AND DEVELOPMENT,BUILDING NO12 SHIHUA RD FUTIAN FTZ,,,,,
2431,201801013145,DHL LOGISITICS (CAMBODIA) LTD.,O/B HANA (CAMBODIA) I INC,"PHUM TRAPAING KRASANG, SANGKAT TRAP","KHAN POSENCHEY, PHNOM PENH, CAMBODI",,855-23216104 TEL,EX,855-2388,59


### Match company names by TF-IDF
Reference: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [None]:
import re
def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Before splitting, every ',', '-', '.' and '/' is removed, as is every
    occurrence of a whitespace character followed by ``BD``.

    Parameters
    ----------
    string : str
        Text to tokenize.
    n : int, optional
        Length of each n-gram (default 3).

    Returns
    -------
    list of str
        The n-grams in left-to-right order. The list is empty when the
        cleaned text is shorter than ``n``.
    """
    # Inside the character class, ``,-.`` is a range that covers exactly
    # ',', '-' and '.'; '/' is listed separately.
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    # n copies of the text, each starting one character later; zipping them
    # lines up the characters of every window and stops at the shortest copy.
    shifted = [cleaned[offset:] for offset in range(n)]
    return list(map(''.join, zip(*shifted)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unique, non-null shipper names. NaN is dropped *before* the Series is built
# so that its index is a gap-free 0..n-1 range: row i of `tf_idf_matrix` then
# corresponds to `company_names[i]` whether it is looked up by label or by
# position. (Calling dropna() on the Series afterwards would leave a hole in
# the index at the NaN slot and shift every later label by one relative to
# the matrix rows.)
company_names = pd.Series(data['shipper_party_name'].dropna().unique())
# `ngrams` is used as the analyzer, so each name is tokenized by that function
# instead of scikit-learn's default word tokenizer.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [None]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices via ``ct.sparse_dot_topn`` and return a CSR result.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; both are converted to CSR first. The result
        has as many rows as ``A`` and as many columns as ``B``.
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``ct.sparse_dot_topn`` (default 0).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers
        passed to ``ct.sparse_dot_topn``.

    Notes
    -----
    ``ct.sparse_dot_topn`` is an external compiled routine. Presumably it
    keeps at most ``ntop`` products per row that exceed ``lower_bound`` and
    writes them into the output buffers in place — confirm against the
    installed ``sparse_dot_topn`` version.
    """
    # Make sure both operands are CSR; for inputs that already are CSR this
    # conversion adds no overhead.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32

    # Room for ntop entries for every row of the left operand.
    capacity = n_rows * ntop

    # Output buffers in CSR layout, handed to the extension below.
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_values = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_values,
    )

    return csr_matrix((out_values, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
import time
# Time the similarity search with perf_counter(): it is monotonic and
# high-resolution, so the measured interval cannot be distorted by system
# clock adjustments the way a time.time() difference can.
t1 = time.perf_counter()
# ntop=10 result slots per name, lower_bound=0.8 on the similarity score.
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.perf_counter()-t1
print("SELFTIMED:", t)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; a non-zero entry at (i, j) pairs the i-th name
        with the j-th name.
    name_vector : sequence of names (list, ndarray or pandas Series)
        Names in the same order as the rows/columns of ``sparse_matrix``.
        Entries are looked up by *position*, so a Series whose index has gaps
        (for example after ``dropna()``) is still matched to the right rows.
    top : int, optional
        Maximum number of pairs to return (default 100). If fewer non-zero
        pairs exist, all of them are returned. A falsy value (``0`` or
        ``None``) returns every non-zero pair.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (spelling
        kept as-is because it is part of the output schema), one row per
        pair, in the matrix's stored order.

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("negative dimensions are not allowed")

    # Take rows, columns and values from the same COO view so they stay
    # aligned, and drop explicitly stored zeros the way nonzero() does.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    values = coo.data[keep]

    # Never ask for more pairs than the matrix actually holds.
    nr_matches = min(top, values.size) if top else values.size

    # Positional lookup: a plain object array ignores any pandas index labels.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': names[rows[:nr_matches]],
                         'right_side': names[cols[:nr_matches]],
                         'similairity': values[:nr_matches].astype(np.float64)})

In [None]:
# Keep at most 100000 pairs, capped at the number of non-zero matches so that
# no more rows are requested than the matrix holds.
matches_df = get_matches_df(matches, company_names, top=min(100000, matches.count_nonzero()))
# Persist the pairs for later use. NOTE: pickle files should only be loaded
# back from trusted sources.
matches_df.to_pickle('matches_df')