In [19]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from annoy import AnnoyIndex
from tqdm import tqdm
import unidecode

## Load model

In [8]:
# Name of the pretrained sentence-transformers checkpoint used for every
# embedding computed in this notebook.
model_name = 'sentence-transformers/msmarco-distilbert-base-tas-b'
model = SentenceTransformer(model_name)

  return torch._C._cuda_getDeviceCount() > 0


## Load data

In [21]:
# Zipped CSV holding the raw classification dataset; pandas decompresses it
# transparently based on the file extension.
dataset_path = 'data/classification-dataset-v1.csv.zip'
df = pd.read_csv(dataset_path)
df.shape

(73974, 10)

In [22]:
# Rows missing either the company name or the homepage text cannot be
# embedded or labelled, so they are discarded. Index labels are kept as-is.
required_columns = ['company_name', 'homepage_text']
df = df.dropna(subset=required_columns)
df.shape

(73305, 10)

In [23]:
def _normalize_text(raw):
    """Coerce ``raw`` to str, lower-case it and transliterate it with unidecode."""
    return unidecode.unidecode(str(raw).lower())


df['homepage_text'] = df['homepage_text'].apply(_normalize_text)

In [24]:
# Eyeball three random rows, restricted to the two columns used downstream.
preview_columns = ['company_name', 'homepage_text']
df[preview_columns].sample(3)

Unnamed: 0,company_name,homepage_text
62189,crisp malting group,telephone +44 (0)1328 829 391 email info@crisp...
37479,"livewell properties, llc",about development news ...
64616,gripper logistics llc | dba gripper group,envio g...


## Create embeddings

In [25]:
# Encode one throwaway sentence so the embedding dimensionality can be read
# from the result; `encoding` is reused by the next cell to size the index.
sentence = 'This is a sample sentence'
encoding = model.encode(sentence)
print(encoding.shape)

(768,)


In [26]:
# Dimensionality of the vectors produced by `model.encode` (taken from the
# probe encoding computed earlier).
encoding_size = encoding.shape[0]

t = AnnoyIndex(encoding_size, 'angular')  # Length of item vector that will be indexed
# Maps Annoy item id -> company name. The item id is the DataFrame index
# label of the row, so it can also be used to look the row up in `df`.
name_map = {}

# One `model.encode` call per row. `total=` gives tqdm the row count so it
# can display a percentage and ETA instead of a bare counter.
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        t.add_item(i, model.encode(row['homepage_text']))
        name_map[i] = row['company_name']
    except Exception as e:
        # Best effort: report the failing row and move on. The exception must
        # be bound with `as e`; a bare `except e:` evaluates the undefined
        # name `e` and raises NameError instead of handling the error.
        print(f'Error: {e}')

0it [00:00, ?it/s]

73305it [2:02:40,  9.96it/s]


In [27]:
# Build the index with 25 trees, then write it to disk so it can be
# reloaded without re-encoding the dataset.
n_trees = 25
t.build(n_trees)
t.save('company.ann')

True

In [None]:
# Re-create an index object with the same vector length and metric that were
# used when building, then load the saved file into it.
# NOTE(review): `encoding_size` and `name_map` are not persisted by the cells
# shown here, so they must still be in kernel memory for the queries below.
t = AnnoyIndex(encoding_size, 'angular')
t.load('company.ann')

In [30]:
# Free-text query and the number of nearest neighbours to retrieve.
query = 'Healthcare services'
fetch_n = 10

# Embed the query with the same model used for the indexed homepage texts.
encoding = model.encode(query)

results = t.get_nns_by_vector(encoding, fetch_n, search_k=-1, include_distances=False)

for result in results:
    # Each Annoy item id is the DataFrame index label of the row it was built
    # from, so fetch the text by label. Matching on company_name and taking
    # the first hit scans the whole frame per result and shows the wrong
    # row's text whenever two rows share a company name.
    print(name_map[result], df.loc[result, 'homepage_text'])

us best medical       contact mail:  info@usbestmedical.com    home about us solutions contact                 us best medical   more info    welcome to         us best medical   more info    welcome to         medical tourism we're connecting patients with the best medical care and treatments in us healthcare system. learn more medical talent acquisition we're supporting international healthcare systems recruit the best talent in the us. learn more us best medical products we're helping international healthcare facilities acquire the best us medical products & solutions. learn more healthcare consulting services we're delivering the optimization of healthcare system patient flow, clinical outcomes, and patient satisfaction.. learn more innovative technology solutions we're delivering the best innovative technology solutions to improving profitability and satisfaction. learn more global agency network we're helping manufacturers expand their global footprint. contact us stay in touch w

In [31]:
# Free-text query and the number of nearest neighbours to retrieve.
query = 'Honey and cakes delivery'
fetch_n = 10

# Embed the query with the same model used for the indexed homepage texts.
encoding = model.encode(query)

results = t.get_nns_by_vector(encoding, fetch_n, search_k=-1, include_distances=False)

for result in results:
    # Each Annoy item id is the DataFrame index label of the row it was built
    # from, so fetch the text by label. Matching on company_name and taking
    # the first hit scans the whole frame per result and shows the wrong
    # row's text whenever two rows share a company name.
    print(name_map[result], df.loc[result, 'homepage_text'])

mullen's dairy bar                          serving up handcrafted goodness since 1932   we are now open!     eat visit cakes story partners         
more food             home about us our markets our brands gallery news contact us more...     gallery about us our markets cake brands at cake brands we work from a brc factory, manufacturing handmade cakes and desserts available at wholesale, on the high street, in coffee shops and across food service outlets.  we pride ourselves on our new product development and innovation that offers customer's a diverse range of products. our brands  contact read more >        
iris bakery             home about us menu order online faq contact more        iris tea & bakery 1/1 our favorite cakes iris tea & bakery offer an extraordinary selection of cakes for any special occasion. here is a couple of our editor's picks. order online like & follow visit us        
maple leaf bakery, inc.      mapleleafbakery.com           maple leaf foods  retail bak