In [25]:
import numpy as np
import pandas as pd
# Show full cell contents (company names) without truncation. `None` is the
# supported "no limit" value; the older `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [26]:
# Load the company-info CSV; the path is relative to the notebook's working directory.
df =  pd.read_csv('data/sec__edgar_company_info.csv')
# Bare last expression: the notebook renders the first rows of the frame.
df.head()

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [27]:
# (rows, columns) of the loaded frame.
df.shape

(663000, 3)

In [28]:
import re

def ngrams(string, n=3):
    """Return every run of `n` consecutive characters of the cleaned `string`.

    Cleaning removes each comma, hyphen, period and slash, and every
    occurrence of a whitespace character immediately followed by ``BD``.
    Case and all other characters (including spaces) are left untouched.

    Args:
        string: Text to split into character n-grams.
        n: Length of each n-gram (default 3).

    Returns:
        A list of overlapping n-character substrings in left-to-right order.
        The list is empty when the cleaned text is shorter than `n`, or when
        `n` is not positive.
    """
    # Inside the character class, `,-.` is a range from ',' to '.', which
    # covers exactly ',', '-' and '.'; '/' is listed separately.
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    if n <= 0:
        return []
    # One window per valid start offset; range() is empty if the text is too short.
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

In [29]:
# Demonstrate ngrams() with its default n=3 on a sample word.
print('All 3-grams in "Paintball":')
ngrams('Paintball')

All 3-grams in "Paintball":


['Pai', 'ain', 'int', 'ntb', 'tba', 'bal', 'all']

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Column of raw company names; each name is one document to vectorize.
names = df['Company Name']
# A callable `analyzer` makes the vectorizer obtain each document's features
# by calling ngrams() on the raw name rather than using built-in tokenization.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# Learn the n-gram vocabulary and IDF weights, then build the TF-IDF matrix
# with one row per company name.
tf_idf_matrix = vectorizer.fit_transform(names)

In [39]:
# Print the TF-IDF entries stored for the first company name (row 0).
print(tf_idf_matrix[0])

# The n-grams of that same name, shown for comparison with the weights above.
ngrams('!J INC')

  (0, 14549)	0.15757684989695478
  (0, 812)	0.14545326532967898
  (0, 14951)	0.517420185391102
  (0, 1395)	0.828425757525274


['!J ', 'J I', ' IN', 'INC']

The last trigram (‘INC’) has a relatively low TF-IDF value, which makes sense: this term appears often in the corpus and therefore receives a lower IDF weight.

In [44]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awsm_cossim_top(A, B, ntop, lower_bound=0):
    """Compute a top-n sparse product of `A` and `B` via `sparse_dot_topn`.

    Both inputs are converted to CSR, output buffers sized for at most
    `ntop` entries per row of `A` are allocated, and the compiled
    `ct.sparse_dot_topn` routine fills them in.
    NOTE(review): the routine presumably keeps, per row, at most `ntop`
    product entries whose value exceeds `lower_bound` -- confirm against the
    sparse_dot_topn documentation.

    Args:
        A: Sparse matrix of shape (M, K); anything exposing `.tocsr()`.
        B: Sparse matrix of shape (K, N); anything exposing `.tocsr()`.
        ntop: Number of result slots reserved per row of `A`.
        lower_bound: Threshold forwarded unchanged to the compiled routine.

    Returns:
        scipy.sparse.csr_matrix of shape (M, N) built from the filled buffers.

    Raises:
        ValueError: If the column count of `A` differs from the row count
            of `B`.
    """
    A = A.tocsr()
    B = B.tocsr()
    M, inner_a = A.shape
    inner_b, N = B.shape

    # Only M and N are handed to the compiled routine, so it has no way to
    # notice incompatible operands; reject them here instead.
    if inner_a != inner_b:
        raise ValueError(
            "dimension mismatch: A has shape {} but B has shape {}".format(
                A.shape, B.shape))

    # Index arrays are passed to the compiled routine as 32-bit integers.
    idx_dtype = np.int32

    # Upper bound on stored results: `ntop` slots for each of the M rows.
    nnz_max = M*ntop

    # Pre-allocated CSR output buffers, filled in place by the call below.
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                      np.asarray(A.indices, dtype=idx_dtype),
                      A.data,
                      np.asarray(B.indptr, dtype=idx_dtype),
                      np.asarray(B.indices, dtype=idx_dtype),
                      B.data,
                      ntop,
                      lower_bound,
                      indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [45]:
import time
# Time the full self-match. perf_counter() is used instead of time.time()
# because the wall clock can be adjusted while a long run is in progress,
# whereas perf_counter() is intended for measuring elapsed intervals.
t1 = time.perf_counter()
# Match every name against every name: 10 result slots per row, lower bound 0.8.
matches = awsm_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8) # similarity > 0.8
t = time.perf_counter()-t1
print("SELFTIMED:", t)

SELFTIMED: 5915.475016593933
