In [1]:
import chemsource as cs
import pandas as pd
import re
from tqdm import tqdm
from ast import literal_eval
from tqdm import tqdm

from helpers import *

# Register tqdm's pandas integration so progress-bar variants of apply
# (e.g. `progress_apply`) become available on pandas objects.
tqdm.pandas()

Configure chemsource Model

In [2]:
# Parse the key file a single time instead of once per key.
# header=None keeps the first line as data rather than treating it as a header;
# row 0 is used as the OpenAI key and row 1 as the NCBI key.
api_key_rows = pd.read_csv("api_keys.txt", header=None).values
user_openai_key = api_key_rows[0][0]
user_ncbi_key = api_key_rows[1][0]

model = cs.ChemSource()
model.configure(openai_key=user_openai_key, ncbi_key=user_ncbi_key)

# Last expression of the cell: show the active configuration as the cell output.
model.configuration()

{'openai_key': '********************************************************************************************************************************************************************',
 'ncbi_key': '************************************',
 'model': 'gpt-4-0125-preview',
 'prompt': 'Classify this compound, COMPOUND_NAME, as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that ENDOGENOUS refers to compounds that are human synthesized. ENDOGENOUS excludes essential nutrients that cannot be synthesized by human body. Note that FOOD refers to compounds present in natural food items. Note that INDUSTRIAL should be used only for compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS, USE ONLY THE INFORMAT

Read and Preprocess Data

In [6]:
# Maximum number of synonyms retained per compound after cleaning.
MAX_SYNONYMS = 6

raw_rosmap_data = pd.read_csv('data_in/ROSMAP_annotation_full_metadata_cleaned.tsv', sep='\t')

# Work on a copy so `raw_rosmap_data` keeps the synonyms exactly as read from
# the file (string form); only the working frame gets the parsed Python lists.
rosmap_data = raw_rosmap_data.copy()
rosmap_data["synonyms"] = raw_rosmap_data["synonyms"].apply(literal_eval)

print(f"Length of unfiltered dataset : {len(rosmap_data)}")
rosmap_data.drop_duplicates(subset=["compound_name"], inplace=True)
print(f"Length of dataset (no duplicates): {len(rosmap_data)}")

# Clean each synonym list with the project helpers (imported from `helpers`),
# then drop compounds whose list ended up empty.
rosmap_data["synonyms"] = rosmap_data["synonyms"].apply(filter_synonym_list)
rosmap_data["synonyms"] = rosmap_data["synonyms"].apply(preprocessing_function_synonyms)

# Explicit .copy(): the boolean-mask selection is assigned to below, and taking
# an owned copy avoids pandas' chained-assignment (SettingWithCopy) ambiguity.
rosmap_data = rosmap_data[rosmap_data["synonyms"].map(len) > 0].copy()

# Cap the number of synonyms per compound; non-list values pass through untouched.
rosmap_data["synonyms"] = rosmap_data["synonyms"].apply(
    lambda x: x[:MAX_SYNONYMS] if isinstance(x, list) else x
)

print(f"Length of dataset (after filtering): {len(rosmap_data)}")

Length of unfiltered dataset : 229
Length of dataset (no duplicates): 174
Length of dataset (after filtering): 153


In [12]:
# Preview the cleaned synonym lists of the first two compounds.
rosmap_data["synonyms"].head(2)

0    [Histidine, H-his-oh, Glyoxaline-5-alanine, An...
1    [N-acetyl-l-methionine, N-acetylmethionine, Ac...
Name: synonyms, dtype: object

100%|██████████| 2/2 [00:02<00:00,  1.34s/it]


0    (Histidine, ((WIKIPEDIA, Histidine (symbol His...
1    (N-acetyl-l-methionine, ((PUBMED,  Methionine ...
Name: synonyms, dtype: object

In [7]:
# Display the preprocessed frame (compound names with their cleaned synonym lists).
rosmap_data

Unnamed: 0,compound_name,synonyms
0,L-histidine,"[Histidine, H-his-oh, Glyoxaline-5-alanine, An..."
1,N-Acetyl-L-methionine,"[N-acetyl-l-methionine, N-acetylmethionine, Ac..."
2,SCHEMBL21797220,[(2r)-3-(octadecanoyloxy)-2-(tetradecanoyloxy)...
4,lactulose,"[Lactulose, Bifiteral, Cephulac, Chronulac, Co..."
6,Phenylacetylglutamine,"[Phenylacetylglutamine, Phenylacetyl l-glutami..."
...,...,...
221,Ephedrine,"[Ephedrine, (-)-ephedrine, Ephedrin, L(-)-ephe..."
222,DODECANEDIOIC ACID,"[Dodecanedioic acid, Decamethylenedicarboxylic..."
223,Lysyl-Isoleucine,"[Lysyl-isoleucine, Lys-ile, H-lys-ile-oh, Isol..."
224,Icariside F2,"[Icariside f2, (2r,3s,4s,5r,6r)-2-[[(2r,3r,4r)..."


In [11]:
# Look up the row with index label 221 in the unprocessed frame, where
# `synonyms` is still the raw string read from the file.
raw_rosmap_data.loc[221]

compound_name                                            Ephedrine
synonyms         ['Ephedrine', 'l-Ephedrine', '(-)-Ephedrine', ...
Name: 221, dtype: object