Let's see how embedding models work!

In [None]:
pip install sentence_transformers

In [12]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name
model = SentenceTransformer("all-MiniLM-L6-v2")

# The text we want to turn into vectors
sentences = ["This framework generates embeddings for each input sentence."]

# model.encode() produces one embedding per input sentence
embeddings = model.encode(sentences)

# Show every sentence next to its embedding, separated by a blank line
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print()

# Dimensionality of the first embedding (a 384 dimensional vector)
embedding_dim = len(embeddings[0])
print(embedding_dim)



Sentence: This framework generates embeddings for each input sentence.
Embedding: [-1.19531704e-02 -5.56294024e-02 -8.24255869e-03  8.89046583e-03
  2.76842304e-02  1.13988042e-01  1.46987597e-02 -3.18958312e-02
  4.14518118e-02 -8.18855315e-02  1.41326627e-02 -2.03336254e-02
  4.07750793e-02  2.26285271e-02 -4.78438623e-02  7.63346255e-02
 -3.29895248e-03  4.05392684e-02 -4.15111929e-02 -9.59573090e-02
  2.75881728e-03  6.11733682e-02  5.33367544e-02 -4.46903892e-02
 -5.07696867e-02  4.29955795e-02 -5.94074614e-02  7.90485647e-03
  1.03387214e-01  1.84300859e-02  2.68558227e-02 -2.85280440e-02
  3.10076475e-02  7.76293725e-02 -1.35658216e-03  1.04126269e-02
 -1.15095936e-02  3.83570902e-02 -4.92046103e-02 -1.52510246e-02
 -3.55921052e-02 -3.63350869e-03  2.81276368e-02  3.14543694e-02
  6.72142506e-02 -3.93150933e-02 -1.09064087e-01 -1.97516419e-02
 -2.29214616e-02  4.11495268e-02 -8.46842453e-02 -5.84471002e-02
 -6.20924681e-03  2.70355288e-02 -8.66034068e-03  2.39158776e-02
 -1.9887

Download the dataset from: https://drive.google.com/file/d/1dNLfX-REKVBRPviA_wNUYISSjk5_nNbz/view?usp=sharing

# Dataset work

In [13]:
import pickle as pkl
import pandas as pd

# Read the pickled dataset from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file obtained
# from a source you trust.
with open("description_df.pkl", mode="rb") as pickle_file:
    dataset = pkl.load(pickle_file)

# Wrap the loaded object in a DataFrame for the cells that follow
df = pd.DataFrame(data=dataset)

In [14]:
# Preview the first rows of the frame to see which columns are available
df.head()

Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation,description,numSentences
0,1,Acetylcarnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-O-Acety...,203.24,C9H17NO4,66.4,214.0,0.4,14,0,...,Acetylcarnitine,Classification|Drug and Medication Information...,9,,20050623,3WAY PHARM INC|A2B Chem|AA BLOCKS|AbaChemScene...,Chemical Vendors|Curation Efforts|Governmental...,D002491 - Central Nervous System Agents > D018...,An acetic acid ester of CARNITINE that facilit...,8
1,3,"5,6-Dihydroxycyclohexa-1,3-diene-1-carboxylic ...","5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...",156.14,C7H8O4,77.8,229.0,-0.3,11,3,...,,Classification|Literature|Patents,5,,20040916,AAA Chemistry|ABI Chem|Achemica|BenchChem|ChEB...,Chemical Vendors|Curation Efforts|Governmental...,,"2,3-dihydroxy-2,3-dihydrobenzoic acid is a cyc...",5
2,4,1-Aminopropan-2-ol,1-Aminopropan-2-ol|1-AMINO-2-PROPANOL|78-96-6|...,75.11,C3H9NO,46.2,22.9,-1.0,5,2,...,,Biological Test Results|Chemical and Physical ...,14,155|157|161|165|167|175|1188|23443|158688|6516...,20050326,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,,Monoisopropanolamine appears as a colorless li...,13
3,5,3-Amino-2-oxopropyl phosphate,3-Amino-2-oxopropyl phosphate|3-amino-2-oxopro...,169.07,C3H8NO5P,110.0,162.0,-5.0,10,3,...,,Classification|Literature|Patents,5,,20050601,AAA Chemistry|ABI Chem|BenchChem|BIND|BioCyc|C...,Chemical Vendors|Curation Efforts|Governmental...,,3-Amino-2-oxopropyl phosphate is a metabolite ...,4
4,6,"1-Chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",202.55,C6H3ClN2O4,91.6,224.0,2.3,13,0,...,Dinitrochlorobenzene,Biological Test Results|Chemical and Physical ...,14,155|157|161|165|167|175|179|192|220|300|302|11...,20050326,3B Scientific (Wuhan) Corp|3WAY PHARM INC|A&J ...,Chemical Vendors|Curation Efforts|Governmental...,C308 - Immunotherapeutic Agent > C2139 - Immun...,Dinitrochlorobenzene is an aromatic hydrocarbo...,13


In [15]:
# The free-text description is the field we want to embed; show the first one
df["description"].iloc[0]

'An acetic acid ester of CARNITINE that facilitates movement of ACETYL COA into the matrices of mammalian MITOCHONDRIA during the oxidation of FATTY ACIDS., O-acetylcarnitine is an O-acylcarnitine having acetyl as the acyl substituent. It has a role as a human metabolite. It is functionally related to an acetic acid. It is a conjugate base of an O-acetylcarnitinium., L-Acetylcarnitine is a metabolite found in or produced by Saccharomyces cerevisiae., Acetylcarnitine is a natural product found in Pseudo-nitzschia multistriata, Euglena gracilis, and other organisms with data available.'

In [16]:
# Total number of rows before any filtering
num_rows = len(df)
num_rows

328395

In [17]:
# Check whether any compound name ('cmpdname') is longer than 512 characters
# and, if so, report those rows and drop them from df
name_lengths = df['cmpdname'].str.len()
mask = name_lengths.gt(512)

has_long_strings = mask.any()
print("Any strings longer than 512 characters:", has_long_strings)
if has_long_strings:
    # Row labels of the offending entries
    long_strings_indices = mask[mask].index.tolist()
    print("Row indices with strings longer than 512 characters:")
    print(long_strings_indices)
    df = df.drop(index=long_strings_indices)


Any strings longer than 512 characters: True
Row indices with strings longer than 512 characters:
[1659, 14658, 19618, 47600, 71010, 72641, 73389, 79317, 81445, 81689, 82247, 82812, 83090, 84315, 85624, 89799, 93944, 104357, 104358, 104906, 111890, 111917, 112003, 112131, 112163, 112222, 112229, 112267, 114278, 114987, 115027, 117388, 117922, 119152, 121455, 125057, 135844, 135845, 141959, 142071, 142586, 142685, 142686, 142687, 142688, 142689, 142690, 142691, 142692, 142693, 142772, 142773, 142799, 143067, 143101, 143367, 143368, 143369, 143370, 143770, 143771, 143773, 143774, 144028, 144029, 145884, 146396, 146838, 146922, 147141, 149356, 150015, 150069, 150508, 156274, 156330, 160149, 181420, 185601, 185623, 185624, 185647, 185928, 185947, 187414, 187415, 187522, 191185, 191186, 192347, 194046, 194152, 194322, 194549, 194590, 197856, 198373, 198708, 198830, 199322, 203963, 203964, 203965, 213692, 214052, 215148, 215497, 215901, 216174, 217484, 217583, 218574, 220782, 220954, 221154,

In [18]:
# Sanity check: once the over-long names are dropped, this should print False
name_lengths2 = df['cmpdname'].str.len()
mask2 = name_lengths2.gt(512)
has_long_strings2 = mask2.any()
print("Any strings longer than 512 characters:", has_long_strings2)

Any strings longer than 512 characters: False


In [19]:
def is_float(x):
    """Return True when ``x`` is an instance of ``float`` (NaN included), else False."""
    float_instance = isinstance(x, float)
    return float_instance

# Build a boolean mask marking the rows whose 'cmpdname' value is a float
# (as decided by is_float) rather than a string
mask3 = df['cmpdname'].apply(is_float)

# Row labels where the mask is True
indices_with_floats = df.index[mask3].tolist()

# The label names the column that was actually inspected
print("Indices of rows with floats in the 'cmpdname' column:")
print(indices_with_floats)

Indices of rows with floats in the 'text_column':
[111966, 112016, 112017, 112061, 112206, 121454, 133107, 146751, 152380, 183669, 189469, 189517, 203962, 204267, 213740, 213741, 216427, 224006, 224932, 228797, 232623, 233278, 233289, 233291, 233877, 236886, 237246, 237953, 237955, 237960, 237961, 237965, 237997, 237998, 237999, 243474, 249684, 264540, 264542, 264543, 264548, 264549, 264682, 265344, 265716, 265746, 265793, 265797, 265913, 265934, 265939, 265971, 266117, 266141, 266172, 266193, 266232, 266268, 266320, 266375, 266394, 266395, 266453, 266462, 266480, 266501, 266506, 266537, 266546, 266556, 266561, 266637, 266652, 270599, 274414, 274634, 276401, 324984, 324991, 324999, 325058, 325069, 325096, 325099, 325107, 325164, 325189, 325261, 325263, 325322, 327579, 327651, 328181, 328332, 328394]


In [20]:
# Display the frame after dropping the over-long names (the notebook shows a truncated view)
df

Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation,description,numSentences
0,1,Acetylcarnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-O-Acety...,203.240,C9H17NO4,66.4,214.0,0.4,14,0,...,Acetylcarnitine,Classification|Drug and Medication Information...,9,,20050623,3WAY PHARM INC|A2B Chem|AA BLOCKS|AbaChemScene...,Chemical Vendors|Curation Efforts|Governmental...,D002491 - Central Nervous System Agents > D018...,An acetic acid ester of CARNITINE that facilit...,8
1,3,"5,6-Dihydroxycyclohexa-1,3-diene-1-carboxylic ...","5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...",156.140,C7H8O4,77.8,229.0,-0.3,11,3,...,,Classification|Literature|Patents,5,,20040916,AAA Chemistry|ABI Chem|Achemica|BenchChem|ChEB...,Chemical Vendors|Curation Efforts|Governmental...,,"2,3-dihydroxy-2,3-dihydrobenzoic acid is a cyc...",5
2,4,1-Aminopropan-2-ol,1-Aminopropan-2-ol|1-AMINO-2-PROPANOL|78-96-6|...,75.110,C3H9NO,46.2,22.9,-1.0,5,2,...,,Biological Test Results|Chemical and Physical ...,14,155|157|161|165|167|175|1188|23443|158688|6516...,20050326,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,,Monoisopropanolamine appears as a colorless li...,13
3,5,3-Amino-2-oxopropyl phosphate,3-Amino-2-oxopropyl phosphate|3-amino-2-oxopro...,169.070,C3H8NO5P,110.0,162.0,-5.0,10,3,...,,Classification|Literature|Patents,5,,20050601,AAA Chemistry|ABI Chem|BenchChem|BIND|BioCyc|C...,Chemical Vendors|Curation Efforts|Governmental...,,3-Amino-2-oxopropyl phosphate is a metabolite ...,4
4,6,"1-Chloro-2,4-dinitrobenzene","1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",202.550,C6H3ClN2O4,91.6,224.0,2.3,13,0,...,Dinitrochlorobenzene,Biological Test Results|Chemical and Physical ...,14,155|157|161|165|167|175|179|192|220|300|302|11...,20050326,3B Scientific (Wuhan) Corp|3WAY PHARM INC|A&J ...,Chemical Vendors|Curation Efforts|Governmental...,C308 - Immunotherapeutic Agent > C2139 - Immun...,Dinitrochlorobenzene is an aromatic hydrocarbo...,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328390,168266256,CID 168266256,9002-62-4,3417.600,C160H194N22O54S4,1130.0,5660.0,,240,4,...,Prolactin,,0,,20230605,"Cooke Chemical Co., Ltd",Chemical Vendors,,A lactogenic hormone secreted by the adenohypo...,5
328391,168266268,CID 168266268,8002-50-4,2867.500,C147H151Cl8N27O18,469.0,4770.0,,200,4,...,,,0,,20230605,"Cooke Chemical Co., Ltd",Chemical Vendors,,"Oils, edible: fish is a pale yellow oily liqui...",4
328392,168266335,CID 168266335,CID 162451877|64475-85-0,13508.655,Tb85,0.0,0.0,,85,0,...,,,0,,20230605,"Cooke Chemical Co., Ltd",Chemical Vendors,,Turpentine substitute appears as a clear color...,4
328393,168266352,CID 168266352,CID 162301785|25038-44-2|68551-12-2,632.300,C26H30B2N10O4S2,225.0,427.0,,44,2,...,,,0,,20230605,"Cooke Chemical Co., Ltd",Chemical Vendors,,Alcohol c-12 c-16 poly (1-6) ethoxylate appear...,10


In [21]:
# Keep only the rows whose compound name is actually a string
name_is_str = [isinstance(name, str) for name in df['cmpdname']]
df2 = df[name_is_str]
# Compare the filtered shape with the unfiltered one
print(df2.shape, df.shape)

(327706, 42) (327801, 42)


In [None]:
# drop unnecessary columns

In [None]:
# Randomly sample 50k entries from your DataFrame (more than 100k will require a paid account).
# Be careful here! Make sure the data you upload is the sample created here, or you will run out of credits.

Let's get everything embedded!
1. Make an account on https://atlas.nomic.ai/
2. In your terminal, run `pip3 install nomic`
3. Run `nomic login "[insert API key from nomic]"`

Take a look at your embedding graph once it's done populating!

Follow https://docs.nomic.ai/atlas/capabilities/vectors to embed your dataframe! Hint: the indexed field is the description, since that is what we want to embed

Import nomic, and use nomic.embed to write a function that embeds a query.

In [None]:
def search(query):
    """Print the IDs of the 10 nearest neighbors of ``query`` (exercise skeleton).

    Incomplete as written: ``query_vector`` is never assigned inside this
    function, so the embedding step described in the first comment has to be
    implemented before the function can run.

    The body also reads two names it does not define: ``dataset`` and ``map``.
    NOTE(review): presumably these are the Atlas dataset and its map created
    while following the linked docs — confirm. ``map`` shadows the Python
    builtin, and ``dataset`` is also the name given to the unpickled object
    in the data-loading cell.

    Args:
        query: the search input to embed.
            NOTE(review): presumably a text string — confirm.
    """
    #embed the query with nomic.embed (look at the docs if you are confused)
    # TODO: assign the embedded query to `query_vector`, which is used below

    # Run the vector search while holding the dataset lock
    with dataset.wait_for_dataset_lock():
        # k=10 requests the ten closest neighbors; `distances` is not used afterwards
        neighbors, distances = map.embeddings.vector_search(queries=query_vector, k=10)

    print("Neighbor IDs:", neighbors)
    # use the docs here https://docs.nomic.ai/atlas/capabilities/vectors to query the nomic dataset and get the compound names of the closest 10 neighbors
    


In [None]:
# try running your function with a few different queries + implement a UI! You can use the input function in python