In [25]:
import numpy as np
import pandas as pd
# Show full cell contents when displaying frames. None means "no truncation";
# the older -1 spelling is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [26]:
# Load the company list from the local CSV and preview its first rows.
df =  pd.read_csv('data/sec__edgar_company_info.csv')
df.head()

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [27]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(663000, 3)

In [28]:
import re

def ngrams(string, n=3):
    """Return every run of ``n`` consecutive characters of a cleaned-up ``string``.

    Before slicing, commas, hyphens, periods and slashes are removed, as is
    any whitespace character immediately followed by ``BD``. If fewer than
    ``n`` characters remain after cleaning, the result is an empty list.

    Example: ``ngrams('Paintball')`` gives
    ``['Pai', 'ain', 'int', 'ntb', 'tba', 'bal', 'all']``.
    """
    # Inside the character class, `,-.` is a range covering exactly ',', '-' and '.'.
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    # Zipping the text against copies of itself shifted by 0..n-1 characters
    # lines up the characters of each sliding window; zip stops at the
    # shortest copy, so only complete windows are produced.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

In [29]:
# Sanity check: display the 3-grams that ngrams() produces for a sample word.
print('All 3-grams in "Paintball":')
ngrams('Paintball')

All 3-grams in "Paintball":


['Pai', 'ain', 'int', 'ntb', 'tba', 'bal', 'all']

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every company name into a sparse row of TF-IDF weights.
names = df['Company Name']
# Passing a callable as `analyzer` makes the vectorizer call ngrams() on each
# raw name to obtain its features, so the vocabulary consists of n-grams.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(names)

In [39]:
# Print the weights stored for the first company name ...
print(tf_idf_matrix[0])

# ... and, for comparison, the n-grams that same name is split into.
ngrams('!J INC')

  (0, 14549)	0.15757684989695478
  (0, 812)	0.14545326532967898
  (0, 14951)	0.517420185391102
  (0, 1395)	0.828425757525274


['!J ', 'J I', ' IN', 'INC']

The last term (‘INC’) has a relatively low value, which makes sense as this term will appear often in the corpus, thus receiving a lower IDF weight.

In [44]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awsm_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply sparse ``A`` by sparse ``B`` via ``sparse_dot_topn``, keeping few entries per row.

    Both operands are converted to CSR and handed to the compiled
    ``sparse_dot_topn`` routine together with pre-allocated output buffers
    sized for ``ntop`` entries per row of ``A``. The routine is intended to
    keep, for each row of the product, at most ``ntop`` values that exceed
    ``lower_bound`` (see the sparse_dot_topn package for the exact contract).

    Parameters
    ----------
    A : scipy sparse matrix of shape (M, K)
    B : scipy sparse matrix of shape (K, N)
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : float, default 0
        Threshold forwarded unchanged to ``sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (M, N)
        Built from the buffers filled in by ``sparse_dot_topn``.

    Raises
    ------
    ValueError
        If the number of columns of ``A`` differs from the number of rows of ``B``.
    """
    A = A.tocsr()
    B = B.tocsr()
    M, inner_a = A.shape
    inner_b, N = B.shape

    # The compiled routine only receives raw index/data arrays, so an
    # incompatible pair of operands has to be rejected here.
    if inner_a != inner_b:
        raise ValueError(
            "dimension mismatch: A has %d columns but B has %d rows" % (inner_a, inner_b)
        )

    idx_dtype = np.int32

    # Output capacity: ntop slots for every row of A.
    nnz_max = M*ntop

    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    # Fills indptr / indices / data in place.
    ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                      np.asarray(A.indices, dtype=idx_dtype),
                      A.data,
                      np.asarray(B.indptr, dtype=idx_dtype),
                      np.asarray(B.indices, dtype=idx_dtype),
                      B.data,
                      ntop,
                      lower_bound,
                      indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [45]:
import time
# Time the self-join of the TF-IDF matrix against its transpose. perf_counter()
# is monotonic, so a system clock adjustment during this long run cannot
# distort the measured duration.
t1 = time.perf_counter()
matches = awsm_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8) # similarity > 0.8
t = time.perf_counter()-t1
print("SELFTIMED:", t)

SELFTIMED: 5915.475016593933


In [46]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Unpack a sparse match matrix into a table of name pairs and their scores.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry (i, j) holds the score between the i-th and the j-th name.
    name_vector : sequence of names (list, array or pandas Series)
        Names are looked up by position, so row/column ``i`` of the matrix
        maps to the i-th element regardless of any pandas index labels.
    top : int or None, default 100
        Maximum number of matches to return, taken in storage order. A falsy
        value (``0`` or ``None``) returns every match. Values larger than the
        number of stored matches are clamped instead of failing.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similarity``.

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    coo = sparse_matrix.tocoo()

    # Drop explicitly stored zeros from rows, columns and scores together so
    # that each score stays attached to the pair it belongs to.
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    if top:
        if top < 0:
            raise ValueError("top must be non-negative")
        # Never read past the matches that actually exist.
        nr_matches = min(top, scores.size)
    else:
        nr_matches = scores.size

    # Positional lookup, vectorised instead of one Python-level access per match.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({
        'left_side': names[rows[:nr_matches]],
        'right_side': names[cols[:nr_matches]],
        'similarity': np.asarray(scores[:nr_matches], dtype=float),
    })

In [47]:
# Build a table from the first 100000 stored matches.
matches_df = get_matches_df(matches, names, top=100000)
# Keep only pairs scoring below 0.99999 — presumably to drop each name's
# match with itself (TODO confirm).
matches_df = matches_df[matches_df['similarity'] < 0.99999]
# Display 20 randomly chosen rows; no random_state is passed, so the
# selection differs from run to run.
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similarity
37431,ADVISORS DISCIPLINED TRUST 694,ADVISORS DISCIPLINED TRUST 692,0.860952
27050,ADVISORS DISCIPLINED TRUST 1378,ADVISORS DISCIPLINED TRUST 1375,0.879955
45922,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VA 46,AGL LIFE ASSURANCE CO SEPARATE ACCOUNT VA 3,0.898953
58245,ALLSTATE FINANCING III,ALLSTATE FINANCING VIII,0.839127
78092,ANE TECHNOLOGIES INC,MICE TECHNOLOGIES INC,0.845709
75745,ANDERSON JULIE,ANDERSON JULIE L,0.932982
51905,ALDEBARAN FINANCIAL INC /TN/ /ADV,PLAN FINANCIAL INC /BD,0.981783
28702,ADVISORS DISCIPLINED TRUST 1528,ADVISORS DISCIPLINED TRUST 1524,0.881805
37356,ADVISORS DISCIPLINED TRUST 688,ADVISORS DISCIPLINED TRUST 68,0.928445
33962,ADVISORS DISCIPLINED TRUST 378,ADVISORS DISCIPLINED TRUST 377,0.871106
