In [None]:
import pandas as pd
import numpy as np

Open questions: are the company names unique? Do they carry a prefix (e.g. DHL...)?

In [None]:
# Columns to read from the bill-of-lading summary CSV.
# Intentionally ignored: 'trade_update_date', 'run_date', 'vessel_name',
# 'secondary_notify_party_1', 'container_number'.
usecols = [
    # identifier, ports and dates
    'identifier',
    'port_of_unlading',
    'estimated_arrival_date',
    'foreign_port_of_lading',
    'record_status_indicator',
    'place_of_receipt',
    'port_of_destination',
    'foreign_port_of_destination',
    'actual_arrival_date',
    # consignee
    'consignee_name',
    'consignee_address',
    'consignee_contact_name',
    'consignee_comm_number_qualifier',
    'consignee_comm_number',
    # shipper
    'shipper_party_name',
    'shipper_address',
    'shipper_contact_name',
    'shipper_comm_number_qualifier',
    'shipper_comm_number',
    # cargo description
    'description_sequence_number',
    'piece_count',
    'description_text',
    # harmonized tariff fields
    'harmonized_number',
    'harmonized_value',
    'harmonized_weight',
    'harmonized_weight_unit',
]

In [None]:
# Per-column dtypes handed to read_csv: 'category' for the port/status/unit
# columns, plain str for free-text fields, and narrow numeric types for
# counts and amounts.
dtype = {
    'identifier': str,
    # categorical columns
    'port_of_unlading': 'category',
    'foreign_port_of_lading': 'category',
    'record_status_indicator': 'category',
    'place_of_receipt': 'category',
    'port_of_destination': 'category',
    'foreign_port_of_destination': 'category',
    # consignee text fields
    'consignee_name': str,
    'consignee_address': str,
    'consignee_contact_name': str,
    'consignee_comm_number_qualifier': str,
    'consignee_comm_number': str,
    # shipper text fields
    'shipper_party_name': str,
    'shipper_address': str,
    'shipper_contact_name': str,
    'shipper_comm_number_qualifier': str,
    'shipper_comm_number': str,
    # cargo description
    'description_sequence_number': 'int16',
    'piece_count': 'float32',
    'description_text': str,
    # harmonized tariff fields
    'harmonized_number': str,
    'harmonized_value': 'float32',
    'harmonized_weight': 'float32',
    'harmonized_weight_unit': 'category',
}

In [None]:
# Columns passed to read_csv's parse_dates so they are parsed as dates.
parse_dates = ['estimated_arrival_date','actual_arrival_date']

In [None]:
# Load the 2018 bill-of-lading summary, restricted to the selected columns
# and with explicit dtypes / date parsing.
data = pd.read_csv(
    'RawData/2018/BillofLadingSummary-2018.csv',
    usecols=usecols,
    dtype=dtype,
    parse_dates=parse_dates,
    # nrows=1000000,  # disabled: enable to read only the first rows
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Show which columns were loaded.
data.columns

In [None]:
def count_country(df,name):
    """Print how many rows of ``df`` mention ``name`` in the location columns.

    For each port / place / address column the number of rows whose text
    contains ``name`` (case-insensitive, literal substring) is printed,
    followed by the number of rows that match in at least one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the port, place and address columns listed below.
    name : str
        Substring to look for, e.g. a country name.

    Returns
    -------
    None
        Results are only printed.
    """
    cols = ['port_of_unlading','foreign_port_of_lading','place_of_receipt'
            ,'port_of_destination','foreign_port_of_destination'
           ,'consignee_address','shipper_address']
    # Use the frame that was passed in, not the notebook-global `data`.
    print('total rows is {}'.format(df.shape[0]))
    any_match = pd.Series(False, index=df.index)
    for col in cols:
        # na=False: a missing value counts as "no match" and keeps the mask
        # a clean boolean instead of an object column containing NaN.
        col_match = df[col].str.contains(name,case=False,regex=False,na=False).astype(bool)
        print('{} contains {} of rows of {}'.format(col,col_match.sum(),name))
        any_match = any_match | col_match
    # Count each row once, however many columns it matched in.
    print('{} of rows are {}'.format(any_match.sum(),name))

In [None]:
#count_country(data,'belgium')

### Match company by FuzzyWuzzy

In [None]:
# Distinct values of the shipper name column.
company_names = data['shipper_party_name'].unique()

In [None]:
# Number of distinct shipper names.
len(company_names)

In [None]:
# Inspect the Python type of one sample entry.
type(pd.Series(company_names).iloc[2])

In [None]:
# Look at the first few names in sorted order.
pd.Series(company_names).sort_values().head()

### Match company by TF-IDF
Approach based on: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

In [None]:
import re
def ngrams(string, n=3):
    """Return every run of ``n`` consecutive characters of ``string``.

    The result is a list of overlapping substrings in left-to-right order;
    it is empty when ``string`` is shorter than ``n``.
    """
    # Optional clean-up step, currently disabled:
    #string = re.sub(r'[,-./]|\sBD',r'', string)
    # Zipping the string against copies of itself shifted by 0..n-1
    # characters yields one tuple of characters per window.
    shifted = (string[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Distinct, non-missing shipper names. dropna() leaves a gap in the index
# where the missing value was, so the index is rebuilt: label i then equals
# position i, which is also row i of the TF-IDF matrix fitted below.
company_names = (
    pd.Series(data['shipper_party_name'].unique())
    .dropna()
    .reset_index(drop=True)
)
# Vectorize each name using the character n-grams produced by `ngrams`.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [None]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Run ``sparse_dot_topn`` on sparse ``A`` and ``B`` and wrap the result as CSR.

    Output buffers with room for ``ntop`` entries per row of ``A`` are
    allocated here and handed to ``ct.sparse_dot_topn`` together with the raw
    CSR arrays of both inputs; the routine is expected to fill them in place.
    NOTE(review): presumably it keeps, per row of ``A @ B``, the ``ntop``
    largest values above ``lower_bound`` -- confirm against the
    sparse_dot_topn documentation.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; both are converted with ``tocsr()``.
    ntop : int
        Number of output slots reserved per row of ``A``.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers.
    """
    # tocsr() is a no-op cost-wise when the inputs are already CSR.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    # Index arrays are passed to the routine as 32-bit integers.
    index_type = np.int32
    capacity = n_rows * ntop

    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_values = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_values)

    return csr_matrix((out_values, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
import time
# Time the top-n product of the TF-IDF matrix with its own transpose.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=10,
    lower_bound=0.8,
)
t = time.time() - t1
print("SELFTIMED:", t)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry ``(i, j)`` is the similarity between the i-th and j-th name.
    name_vector : sequence of str (list, array or pandas Series)
        Names in the same order as the matrix rows/columns. Lookup is by
        position, so any pandas index on the Series is ignored.
    top : int or None, optional
        Maximum number of matches to return, taken in the matrix's storage
        order. A falsy value (``None`` or ``0``) returns every match.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (spelling
        kept so existing consumers of the frame keep working).
    """
    # Positional lookup: a Series that went through dropna() has a gappy
    # index, and label-based name_vector[i] would pick the wrong name or
    # raise KeyError.
    names = np.asarray(name_vector, dtype=object)

    # Take rows, columns and values from the same COO view so they stay
    # aligned even if the matrix stores explicit zeros (which are skipped,
    # as nonzero() would do).
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    sparserows = coo.row[stored]
    sparsecols = coo.col[stored]
    scores = coo.data[stored]

    # Never ask for more matches than exist.
    nr_matches = sparsecols.size
    if top:
        nr_matches = min(top, nr_matches)

    return pd.DataFrame({'left_side': names[sparserows[:nr_matches]],
                          'right_side': names[sparsecols[:nr_matches]],
                           'similairity': scores[:nr_matches].astype(float)})

In [None]:
# Build the match table for up to 100000 matches and persist it as a pickle.
matches_df = get_matches_df(matches, company_names, top=100000)
matches_df.to_pickle('matches_df')