In [1]:
import numpy as np
import pandas as pd

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173859e-02 -4.28516194e-02 -1.56286173e-02  1.40537629e-02
  3.95537689e-02  1.21796325e-01  2.94333510e-02 -3.17523517e-02
  3.54959853e-02 -7.93140233e-02  1.75878350e-02 -4.04369570e-02
  4.97259721e-02  2.54912712e-02 -7.18699917e-02  8.14968422e-02
  1.47071993e-03  4.79627252e-02 -4.50335853e-02 -9.92175192e-02
 -2.81769410e-02  6.45046085e-02  4.44670543e-02 -4.76217270e-02
 -3.52952331e-02  4.38671745e-02 -5.28566055e-02  4.33045381e-04
  1.01921476e-01  1.64072234e-02  3.26996520e-02 -3.45986746e-02
  1.21339448e-02  7.94871300e-02  4.58339136e-03  1.57778617e-02
 -9.68207233e-03  2.87626702e-02 -5.05806245e-02 -1.55793829e-02
 -2.87906975e-02 -9.62279364e-03  3.15556452e-02  2.27349643e-02
  8.71449485e-02 -3.85027565e-02 -8.84719193e-02 -8.75496678e-03
 -2.12343559e-02  2.08924487e-02 -9.02078152e-02 -5.25732450e-02
 -1.05638281e-02  2.88311373e-02 -1.61454845e-02  6.17838977e-03
 -1.23234

In [3]:
from sentence_transformers import SentenceTransformer, util

# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))


The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [4]:
# Single list of sentences
sentences = ['The cat sits outside',
             'A man is playing guitar',
             'I love pasta',
             'The new movie is awesome',
             'The cat plays in the garden',
             'A woman watches TV',
             'The new movie is so great',
             'Do you like pizza?']

#Compute embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

#Compute cosine-similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

#Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900


In [5]:
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']


# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7035)
A man is eating a piece of bread. (Score: 0.5272)
A man is riding a horse. (Score: 0.1889)
A man is riding a white horse on an enclosed ground. (Score: 0.1047)
A cheetah is running behind its prey. (Score: 0.0980)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6433)
A woman is playing violin. (Score: 0.2564)
A man is riding a horse. (Score: 0.1389)
A man is riding a white horse on an enclosed ground. (Score: 0.1191)
A cheetah is running behind its prey. (Score: 0.1080)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.8253)
A man is eating food. (Score: 0.1399)
A monkey is playing drums. (Score: 0.1292)
A man is riding a white horse on an enclosed ground. (Score: 0.1097)
A man is riding a 

In [6]:
df = pd.read_csv('annotations_20220816.tsv',delimiter = '\t', on_bad_lines='skip')

df.head()

  df = pd.read_csv('annotations_20220816.tsv',delimiter = '\t', on_bad_lines='skip')


Unnamed: 0,id,text_to_annotate,start,end,ann_text,definition
0,98791999,Virt - Vite 2.5 - 25 - 1 MG Oral Tablet Vitami...,0.0,3.0,Virt - Vite,"A mix of vitamins. It provides vitamin B-6, vi..."
1,98791999,Virt - Vite 2.5 - 25 - 1 MG Oral Tablet Vitami...,7.0,8.0,1,{DO NOT DEFINE}
2,98791999,Virt - Vite 2.5 - 25 - 1 MG Oral Tablet Vitami...,8.0,9.0,MG,"A tiny amount of something, usually a drug."
3,98791999,Virt - Vite 2.5 - 25 - 1 MG Oral Tablet Vitami...,9.0,10.0,Oral,Taken by mouth.
4,98791999,Virt - Vite 2.5 - 25 - 1 MG Oral Tablet Vitami...,10.0,11.0,Tablet,A pill.


In [7]:
df_term = df[['ann_text','definition']]

df_term['ann_text'] = df_term['ann_text'].astype(str)
df_term['definition'] = df_term['definition'].astype(str)
df_term.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_term['ann_text'] = df_term['ann_text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_term['definition'] = df_term['definition'].astype(str)


Unnamed: 0,ann_text,definition
0,Virt - Vite,"A mix of vitamins. It provides vitamin B-6, vi..."
1,1,{DO NOT DEFINE}
2,MG,"A tiny amount of something, usually a drug."
3,Oral,Taken by mouth.
4,Tablet,A pill.


In [8]:
df_term['ann_text'] = df_term['ann_text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_term['ann_text'] = df_term['ann_text'].str.lower()


In [9]:
df_term.head()

Unnamed: 0,ann_text,definition
0,virt - vite,"A mix of vitamins. It provides vitamin B-6, vi..."
1,1,{DO NOT DEFINE}
2,mg,"A tiny amount of something, usually a drug."
3,oral,Taken by mouth.
4,tablet,A pill.


In [10]:
df_term_unique = df_term.drop_duplicates()

df_term_unique = df_term_unique[df_term_unique.definition != '{DO NOT DEFINE}']

print(len(df_term_unique))

42137


In [11]:
df_term_unique.head(30)

Unnamed: 0,ann_text,definition
0,virt - vite,"A mix of vitamins. It provides vitamin B-6, vi..."
2,mg,"A tiny amount of something, usually a drug."
3,oral,Taken by mouth.
4,tablet,A pill.
5,vitamin b complex,A group of vitamins and nutrients. The body n...
6,folic acid,A B vitamin.
7,vitamin c,A nutrient needed by the body to form and main...
9,tab,A pill.
10,po,By mouth.
11,qd,Every day.


In [12]:
corpus = pd.read_csv('df_annotations_all_definitions_unique.csv')

In [13]:
print(len(corpus))
print(len(pd.DataFrame(corpus.loc[corpus['definition'] != "not found in UMLS"])))

testing = pd.DataFrame(corpus.loc[corpus['definition'] != "not found in UMLS"])
testing.head(50)

87023
82471


Unnamed: 0.1,Unnamed: 0,name,concept_id,canonical_name,aliases,types,definition
2,0,oral,C0226896,Oral cavity,"['Oral', 'Buccal Cavity', 'Buccal cavity', 'Or...",['T030'],The oval-shaped oral cavity located at the ape...
3,1,oral,C0442027,Oral,"['Oral', 'Orally', 'PO - Oral', 'oral', 'Oral ...",['T082'],"Of, or relating to, or affecting, or for use i..."
4,2,oral,C1272919,Oral Dosage Form,"['Oral', 'Oral Dosage Form', 'oral dosage form...",['T122'],A substance intended for administration throug...
5,3,oral,C1527415,Oral Route of Drug administration,"['by mouth', 'PO - Per os', 'Oral Route of Adm...",['T169'],The introduction of a substance to the mouth o...
6,4,oral,C4521986,Oral (intended site),"['Oral', 'Oral (intended site)']",['T033'],An intended site for a dose form that is for a...
7,0,tablet,C0039225,Tablet Dosage Form,"['TAB', 'tab', 'TabletDrugForm', 'medicines ta...",['T122'],"Solid dosage forms, of varying weight, size, a..."
8,1,tablet,C1705223,Tablet Dosing Unit,"['Tablet Dosing Unit', '{Tablet}', 'Tablet', '...",['T081'],A dosing unit equal to the amount of active in...
9,2,tablet,C4319774,Tablet (unit of presentation),"['Tablet', 'Tablet (unit of presentation)']",['T081'],
10,3,tablet,C4722631,Tablet Dosage Form Category,"['Tablet', 'Tablet Dosage Form Category']",['T122'],A type of solid pharmaceutical dose form consi...
11,4,tablet,C0993159,Oral Tablet,"['oral tablets', 'Oral tablet', 'Conventional ...",['T122'],A tablet intended for oral administration.


In [14]:
corpus.head(30)

Unnamed: 0.1,Unnamed: 0,name,concept_id,canonical_name,aliases,types,definition
0,0,1,,not_found,not_found,,not found in UMLS
1,0,mg,,not_found,not_found,,not found in UMLS
2,0,oral,C0226896,Oral cavity,"['Oral', 'Buccal Cavity', 'Buccal cavity', 'Or...",['T030'],The oval-shaped oral cavity located at the ape...
3,1,oral,C0442027,Oral,"['Oral', 'Orally', 'PO - Oral', 'oral', 'Oral ...",['T082'],"Of, or relating to, or affecting, or for use i..."
4,2,oral,C1272919,Oral Dosage Form,"['Oral', 'Oral Dosage Form', 'oral dosage form...",['T122'],A substance intended for administration throug...
5,3,oral,C1527415,Oral Route of Drug administration,"['by mouth', 'PO - Per os', 'Oral Route of Adm...",['T169'],The introduction of a substance to the mouth o...
6,4,oral,C4521986,Oral (intended site),"['Oral', 'Oral (intended site)']",['T033'],An intended site for a dose form that is for a...
7,0,tablet,C0039225,Tablet Dosage Form,"['TAB', 'tab', 'TabletDrugForm', 'medicines ta...",['T122'],"Solid dosage forms, of varying weight, size, a..."
8,1,tablet,C1705223,Tablet Dosing Unit,"['Tablet Dosing Unit', '{Tablet}', 'Tablet', '...",['T081'],A dosing unit equal to the amount of active in...
9,2,tablet,C4319774,Tablet (unit of presentation),"['Tablet', 'Tablet (unit of presentation)']",['T081'],


In [15]:
general = pd.DataFrame(corpus.loc[corpus['name'] == "vitamin"])
general.head(30)

Unnamed: 0.1,Unnamed: 0,name,concept_id,canonical_name,aliases,types,definition
7752,0,vitamin,C0042890,Vitamins,"['Vitamin preparation, NOS', 'VITAMINS: MISCEL...","['T109', 'T121', 'T127']",Organic substances that are required in small ...
7753,1,vitamin,C2349136,Vitamin C Vitamins,['Vitamin C'],"['T109', 'T127']",A nutrient that the body needs in small amount...
7754,2,vitamin,C0035527,riboflavin,"['vitamin G', 'Riboflavin (substance)', 'Lacto...","['T109', 'T121', 'T127']","Nutritional factor found in milk, eggs, malted..."
7755,3,vitamin,C0042839,vitamin A,"['VITAMIN A', 'VITAMIN A PREPARATIONS', 'Retin...","['T109', 'T121', 'T127']",Retinol and derivatives of retinol that play a...
7756,4,vitamin,C0087161,all-trans-retinol,"['Vitamin A Alcohol', '3,7-dimethyl-9-(2,6, 6-...","['T109', 'T121', 'T127']",A nutrient that the body needs in small amount...


In [16]:
corpus = corpus[corpus['definition'].notna()]

corpus.head(20)

Unnamed: 0.1,Unnamed: 0,name,concept_id,canonical_name,aliases,types,definition
0,0,1,,not_found,not_found,,not found in UMLS
1,0,mg,,not_found,not_found,,not found in UMLS
2,0,oral,C0226896,Oral cavity,"['Oral', 'Buccal Cavity', 'Buccal cavity', 'Or...",['T030'],The oval-shaped oral cavity located at the ape...
3,1,oral,C0442027,Oral,"['Oral', 'Orally', 'PO - Oral', 'oral', 'Oral ...",['T082'],"Of, or relating to, or affecting, or for use i..."
4,2,oral,C1272919,Oral Dosage Form,"['Oral', 'Oral Dosage Form', 'oral dosage form...",['T122'],A substance intended for administration throug...
5,3,oral,C1527415,Oral Route of Drug administration,"['by mouth', 'PO - Per os', 'Oral Route of Adm...",['T169'],The introduction of a substance to the mouth o...
6,4,oral,C4521986,Oral (intended site),"['Oral', 'Oral (intended site)']",['T033'],An intended site for a dose form that is for a...
7,0,tablet,C0039225,Tablet Dosage Form,"['TAB', 'tab', 'TabletDrugForm', 'medicines ta...",['T122'],"Solid dosage forms, of varying weight, size, a..."
8,1,tablet,C1705223,Tablet Dosing Unit,"['Tablet Dosing Unit', '{Tablet}', 'Tablet', '...",['T081'],A dosing unit equal to the amount of active in...
10,3,tablet,C4722631,Tablet Dosage Form Category,"['Tablet', 'Tablet Dosage Form Category']",['T122'],A type of solid pharmaceutical dose form consi...


In [17]:
df_term_unique.head(20)

Unnamed: 0,ann_text,definition
0,virt - vite,"A mix of vitamins. It provides vitamin B-6, vi..."
2,mg,"A tiny amount of something, usually a drug."
3,oral,Taken by mouth.
4,tablet,A pill.
5,vitamin b complex,A group of vitamins and nutrients. The body n...
6,folic acid,A B vitamin.
7,vitamin c,A nutrient needed by the body to form and main...
9,tab,A pill.
10,po,By mouth.
11,qd,Every day.


In [18]:
df_term_unique.reset_index(drop=True, inplace=True)

df_term_unique.head(20)

Unnamed: 0,ann_text,definition
0,virt - vite,"A mix of vitamins. It provides vitamin B-6, vi..."
1,mg,"A tiny amount of something, usually a drug."
2,oral,Taken by mouth.
3,tablet,A pill.
4,vitamin b complex,A group of vitamins and nutrients. The body n...
5,folic acid,A B vitamin.
6,vitamin c,A nutrient needed by the body to form and main...
7,tab,A pill.
8,po,By mouth.
9,qd,Every day.


In [19]:

df_term_unique['sbert_def'] = ""

for i in range(len(df_term_unique)):
#     if i == 30:
#         break
#     print(i)
    term = df_term_unique['ann_text'].iloc[i]
#     print(term)
    general = corpus.loc[corpus['name'] == term]
#     print(len(general))
    if(len(general) != 0):
        corpus_term = general['definition'].tolist()
#         print(corpus_term[0])
        corpus_embeddings = embedder.encode(corpus_term, convert_to_tensor=True)
        query_term = df_term_unique['definition'].iloc[i]
        query_embedding = embedder.encode(query_term, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=1)
        for score, idx in zip(top_results[0], top_results[1]):
            df_term_unique.at[i,'sbert_def'] = corpus_term[idx]

In [20]:
df_term_unique.head(30)

Unnamed: 0,ann_text,definition,sbert_def
0,virt - vite,"A mix of vitamins. It provides vitamin B-6, vi...",
1,mg,"A tiny amount of something, usually a drug.",not found in UMLS
2,oral,Taken by mouth.,A substance intended for administration throug...
3,tablet,A pill.,A tablet intended for oral administration.
4,vitamin b complex,A group of vitamins and nutrients. The body n...,"A group of water-soluble vitamins, some of whi..."
5,folic acid,A B vitamin.,A member of the vitamin B family that stimulat...
6,vitamin c,A nutrient needed by the body to form and main...,A nutrient that the body needs in small amount...
7,tab,A pill.,A tablet intended for oral administration.
8,po,By mouth.,not found in UMLS
9,qd,Every day.,Occurring or done each day.


In [22]:
print(len(df_term_unique))

42137


In [21]:
df_term_unique.to_csv("sentence_bert_top_definitions.csv")