In [2]:
# Data source: LinkedIn Job Postings dataset (Kaggle) - https://www.kaggle.com/datasets/arshkon/linkedin-job-postings

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from annoy import AnnoyIndex
from tqdm import tqdm
import unidecode

In [3]:
# Load the pretrained sentence-embedding model used for every encode() call
# in this notebook.
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

# Smoke test: encode one throwaway sentence and print the shape of the
# resulting embedding to confirm the model loaded.
sentence = 'This is a sample sentence'
encoding = model.encode(sentence)
print(encoding.shape)

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Raw company table from the LinkedIn job-postings dataset.
df = pd.read_csv('data/linkedin/company_details/companies.csv')

In [5]:
# Index rows by company id.
# Guarded and non-inplace so the cell can be re-run: once 'company_id' is the
# index it is no longer a column, and an unconditional set_index would raise
# KeyError on the second execution.
if df.index.name != 'company_id':
    df = df.set_index('company_id')

In [6]:
# Both the text to embed and the display name must be present; rows missing
# either one are discarded.
required_columns = ['description', 'name']
df = df.dropna(subset=required_columns)
df.shape

(5999, 9)

In [7]:
# Preview a few rows. A fixed random_state keeps the displayed sample stable
# across Restart & Run All instead of showing different rows on every run.
df.sample(3, random_state=0)

Unnamed: 0_level_0,name,description,company_size,state,country,city,zip_code,address,url
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
82120874,Aarna Software and Solutions LLC,Prospering in the industry for more than 10 ye...,2.0,WY,US,SHERIDAN,82801,30 N GOULD ST,https://www.linkedin.com/company/asasusagroup
1768090,Perry Homes,"A luxury homebuilder founded in 1967, Perry Ho...",5.0,TX,US,Houston,77017,Gulf Fwy.,https://www.linkedin.com/company/perry-homes
2472062,Modern Family Law,Modern Family Law is a national family law fir...,2.0,CO,US,Denver,80246,4500 Cherry Creek South Drive,https://www.linkedin.com/company/modern-family...


In [8]:
def _normalize_description(text):
    """Lower-case the text, then transliterate it to plain ASCII."""
    return unidecode.unidecode(text.lower())


df['description'] = df['description'].map(_normalize_description)

In [10]:
# Embed every company description in one batched call.
description_texts = df['description'].tolist()
encoded_descriptions = model.encode(description_texts, show_progress_bar=True)

Batches:   0%|          | 0/188 [00:00<?, ?it/s]

In [11]:
# Store each embedding as a plain Python list, one per row, in row order.
df['encoded_description'] = [vector.tolist() for vector in encoded_descriptions]

In [12]:
# Persist the frame together with its embeddings so the encoding step does
# not have to be repeated.
encoded_csv_path = 'data/linkedin/company_details/companies_encoded.csv'
df.to_csv(encoded_csv_path)

In [9]:
def _parse_embedding(serialized):
    """Turn a stringified list such as '[0.1, -0.2]' back into a list of floats."""
    inner = serialized[1:-1]  # drop the surrounding square brackets
    return [float(token) for token in inner.split(',')]


# Reload the cached frame; the embedding column comes back from CSV as text
# and has to be parsed into numeric lists again.
df = pd.read_csv('data/linkedin/company_details/companies_encoded.csv')
df['encoded_description'] = df['encoded_description'].map(_parse_embedding)

In [10]:
# Preview a few reloaded rows. A fixed random_state keeps the displayed sample
# stable across Restart & Run All instead of showing different rows each run.
df.sample(3, random_state=0)

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,encoded_description
4550,18438915,Pacific Inventory Consultants LLC,"our company provides consulting, buy/sell and ...",2.0,WA,US,Manchester,98353,PO Box 546,https://www.linkedin.com/company/pacific-inven...,"[-0.05795583873987198, 0.5794211030006409, -0...."
4063,10462531,Thor Companies,thor companies(r) is a specialist recruitment ...,2.0,England,GB,London,SW8 1RL,68-70 South Lambeth Road,https://www.linkedin.com/company/thor-companie...,"[0.19136837124824524, 0.17068326473236084, 0.1..."
5492,77767313,Challenge Center Neuro Rehab and Wellness,challenge center is an exceptional not-for-pro...,1.0,California,US,La Mesa,91942,5540 Lake Park Way,https://www.linkedin.com/company/challenge-cen...,"[-0.3120168447494507, -0.3369400203227997, 0.2..."


In [11]:
# Embedding dimensionality, read from the stored vectors themselves so this
# cell does not depend on whatever `encoding` value happens to be left in the
# kernel from another cell.
encoding_size = len(df['encoded_description'].iloc[0])

t = AnnoyIndex(encoding_size, 'angular')  # Length of item vector that will be indexed
name_map = {}  # Annoy item id (row position) -> DataFrame index label

# Iterate the embedding column directly: avoids materialising a full row per
# item via df.loc[...] and stays correct even if index labels are duplicated.
# Passing `total` lets tqdm render a real progress bar.
for i, (company_idx, vector) in enumerate(
    tqdm(df['encoded_description'].items(), total=len(df))
):
    t.add_item(i, vector)
    name_map[i] = company_idx

t.build(25)  # 25 trees
t.save('linkedin_company.ann')

5999it [00:00, 15475.67it/s]


True

In [None]:
# Reopen the index from 'linkedin_company.ann'. The AnnoyIndex is constructed
# with the same dimension (`encoding_size`) and metric ('angular') that the
# build cell used before saving.
t = AnnoyIndex(encoding_size, 'angular')
t.load('linkedin_company.ann')

In [16]:
query = 'non profit bee keeper'
fetch_n = 10

# Embed the free-text query with the same model used for the descriptions.
encoding = model.encode(query)

# Ids of the `fetch_n` nearest items in the Annoy index.
results = t.get_nns_by_vector(encoding, fetch_n, search_k=-1, include_distances=False)

for result in results:
    # Annoy ids are the row positions assigned when the index was built, so
    # rows are fetched by position. Label-based .loc only matches while the
    # frame has a default 0..n-1 index and is wrong if df is indexed by
    # company_id.
    company = df.iloc[result]
    print(company['name'], company['description'])

Non-Profit Organization our mission is to help local and national non-profit organizations by assisting them restructure programs, events, sponsorships, grants, board of director infrastructure, bylaws creation, mission statements and more. 
FabFitFun founded in 2010 by co-ceos daniel and michael broukhim and editor-in-chief katie echevarria rosen kitchens, fabfitfun is a lifestyle membership and shopping experience whose mission is to deliver happiness and wellbeing to everyone, everywhere. its flagship product, the fabfitfun box, delivers a curated collection of full-size products across beauty, fashion, fitness, wellness, home, and tech - each season. in addition to the box, fabfitfun members receive access to year-round perks including the fabfitfun online community, members-only shopping experiences, exclusive digital content, and more. join fabfitfun by visiting fabfitfun.com.
A Free Bird™ a free bird(tm) is a non-profit arts organization based in ny. a free bird(tm)mission is to

In [23]:
query = 'A small digital healthcare company.'
fetch_n = 3

# Embed the free-text query with the same model used for the descriptions.
encoding = model.encode(query)

# Ids of the `fetch_n` nearest items in the Annoy index.
results = t.get_nns_by_vector(encoding, fetch_n, search_k=-1, include_distances=False)

for result in results:
    # Annoy ids are the row positions assigned when the index was built, so
    # rows are fetched by position. Label-based .loc only matches while the
    # frame has a default 0..n-1 index and is wrong if df is indexed by
    # company_id.
    company = df.iloc[result]
    print(company['name'], company['country'], company['company_size'], )
    print(company['description'])
    print('---')

Medline Industries, LP US 7.0
medline is a healthcare company-a manufacturer, distributor and so much more, doing business in more than 125 countries and territories around the world. we provide the quality medical products and solutions our customers need to deliver their best care to every person in every care setting. together, we free up the clinical and supply chain resources required to improve the overall operating performance of healthcare.

in a complex healthcare world, medline strives to help our customers achieve both clinical and financial success. we do that through a personalized approach to listen and better understand our customers' needs in an environment that tirelessly demands lower costs and better outcomes. 

our responsiveness and commitment to making healthcare run better is evident in our actions every day. this is who we are. this is why customers choose us as their trusted, integrated business partner.
---
K Health US 3.0
we're the clinical primary care compa

In [27]:
query = 'Healthcare services.'
fetch_n = 3

# Embed the free-text query with the same model used for the descriptions.
encoding = model.encode(query)

# Ids of the `fetch_n` nearest items in the Annoy index.
results = t.get_nns_by_vector(encoding, fetch_n, search_k=-1, include_distances=False)

for result in results:
    # Annoy ids are the row positions assigned when the index was built, so
    # rows are fetched by position. Label-based .loc only matches while the
    # frame has a default 0..n-1 index and is wrong if df is indexed by
    # company_id.
    company = df.iloc[result]
    print(company['name'], company['country'], company['company_size'], )
    print(company['description'])
    print('---')

Guthrie US 6.0
health care
---
WebTPA US 4.0
your employees. your healthcare. your way.
---
HCA Healthcare US 7.0
at hca healthcare, we are driven by a single mission:  above all else, we are committed to the care and improvement of human life.

at hca healthcare we recognize the significant responsibility we have as a leading healthcare provider within each of the communities we serve, as well as the opportunity we have to improve the lives of the patients for whom we are entrusted to care. through the compassion, knowledge and skill of our caregivers and our unique ability to leverage our scale and innovation, hca healthcare is a learning health system that uses our approximately 37 million annual patient encounters to advance science, improve patient care and save lives.

hca healthcare is one of the nation's leading providers of healthcare services, comprising 180 hospitals and approximately more than 2,300 sites of care, including surgery centers, freestanding ers, urgent care cen