In [3]:
import chemsource as cs
import pandas as pd
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from helpers.classify import *
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=15)
tqdm.pandas()

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Create model and initialize to OpenAI GPT-4o
openai_key = pd.read_csv("../secrets/openai_api.txt", header=None, index_col=None).values[0][0]

BASE_PROMPT = "You are a helpful scientist that will classify the provided compounds using only the information provided as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that MEDICAL refers to compounds actively used as approved medications in humans or in late-stage clinical trials in humans. Note that ENDOGENOUS refers to compounds that are produced by the human body specifically. ENDOGENOUS excludes essential nutrients that cannot be synthesized by the human body. Note that FOOD refers to compounds present in natural food items or food additives. Note that INDUSTRIAL should be used only for synthetic compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS, USE ONLY THE INFORMATION PROVIDED AFTER THE COMPOUND NAME BY THE USER. A classification of INFO will also be rewarded when correctly applied and is strongly encouraged if information is of poor quality, if there is not enough information, or if you are not completely confident in your answer.  Provide the output as a plain text separated by commas, and provide only the categories listed (either list a combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE, MEDICAL, FOOD or list INFO), with no justification. Provided Information:\n"
model_openai = cs.ChemSource()
model_openai.configure(
    model="gpt-4o-2024-11-20",
    prompt=BASE_PROMPT, 
    model_api_key=openai_key, 
    temperature=0, 
    top_p=1, logprobs=True)

# Create and initialize model to DeepSeek R1
deepinfra_key = pd.read_csv("../secrets/deepinfra_api.txt", header=None, index_col=None).values[0][0]

model_deepseek_v3 = cs.ChemSource()
model_deepseek_v3.configure(
    model="deepseek-ai/DeepSeek-V3",
    model_api_key=deepinfra_key,
    prompt=BASE_PROMPT,
    temperature=0,
    top_p=1,
    logprobs=True,
    base_url="https://api.deepinfra.com/v1/openai")


# Create GPT-4o model, allow for contextual information
CONTEXTUAL_PROMPT = "You are a helpful scientist that will classify the provided compounds using only the information provided as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that MEDICAL refers to compounds actively used as approved medications in humans or in late-stage clinical trials in humans. Note that ENDOGENOUS refers to compounds that are produced by the human body specifically. ENDOGENOUS excludes essential nutrients that cannot be synthesized by the human body. Note that FOOD refers to compounds present in natural food items or food additives. Note that INDUSTRIAL should be used only for synthetic compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS. You may use prior information or knowledge to help, but information inputted by the user takes precedence. A classification of INFO will also be rewarded when correctly applied and is strongly encouraged if information is of poor quality, if there is not enough information, or if you are not completely confident in your answer.  Provide the output as a plain text separated by commas, and provide only the categories listed (either list a combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE, MEDICAL, FOOD or list INFO), with no justification. Provided Information:\n"
model_openai_contextual = cs.ChemSource()
model_openai_contextual.configure(
    model="gpt-4o-2024-11-20",
    prompt=CONTEXTUAL_PROMPT, 
    model_api_key=openai_key, 
    temperature=0, 
    top_p=1, logprobs=True)

Validation Dataset

In [8]:
# Read in data
validation_data = pd.read_csv("../data/cleaned_data/cleaned_validation_data_with_text.csv")
validation_data.drop_duplicates(subset=["name_used"], inplace=True)
validation_data['text'] = validation_data['text'].apply(literal_eval)
validation_data["site"] = validation_data["text"].apply(lambda x: x[0])
validation_data["text"] = validation_data["text"].apply(lambda x: x[1])

In [9]:
validation_data["chemsource_output_gpt-4o"] = validation_data.parallel_apply(lambda x: 
                                                        classify_with_log_probs(x["name_used"],x["text"], model_openai), 
                                                        axis=1)
validation_data.to_csv("../data/output/validation_data_classified_all.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=248), Label(value='0 / 248'))), HB…

In [9]:
validation_data = pd.read_csv("../data/output/validation_data_classified_all.csv")

In [None]:
validation_data["chemsource_output_deepseek-v3"] = validation_data.parallel_apply(lambda x:
                                                        classify_with_log_probs(x["name_used"],x["text"], model_deepseek_v3),
                                                        axis=1)
validation_data.to_csv("../data/output/validation_data_classified_all.csv", index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=248), Label(value='0 / 248'))), HB…

Additional Dataset

In [4]:
# Read in Data
all_data = pd.read_csv('../data/cleaned_data/all_cleaned_data_with_text.tsv', sep='\t')
all_data['text'] = all_data['text'].apply(literal_eval)
all_data["site"] = all_data["text"].apply(lambda x: x[0])
all_data["text"] = all_data["text"].apply(lambda x: x[1])

In [4]:
all_data["chemsource_output"] = all_data.progress_apply(lambda x: 
                                                        classify_with_log_probs(x["name_used"],x["text"], model_openai), 
                                                        axis=1)

100%|██████████| 1235/1235 [14:06<00:00,  1.46it/s]


In [6]:
all_data.to_csv('../data/output/all_data_classified_gpt-4o.tsv', sep='\t', index=False)

Mouse Dataset Reanalysis

In [8]:
mice_data = all_data[all_data["dataset"] == "mouse"]
mice_data["chemsource_output_gpt-4o_contextual"] = mice_data.parallel_apply(lambda x: classify_with_log_probs(x["name_used"],x["text"], model_openai_contextual), axis=1)
mice_data.to_csv('../data/output/mice_data_classified_gpt-4o_contextual.tsv', sep='\t', index=False)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19), Label(value='0 / 19'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mice_data["chemsource_output_gpt-4o_contextual"] = mice_data.parallel_apply(lambda x: classify_with_log_probs(x["name_used"],x["text"], model_openai_contextual), axis=1)


Extra Controls

In [14]:
extra_controls = pd.read_csv("../data/cleaned_data/extra_controls_cleaned_data_with_text.tsv", sep="\t")
extra_controls['text'] = extra_controls['text'].apply(literal_eval)
extra_controls["site"] = extra_controls["text"].apply(lambda x: x[0])
extra_controls["text"] = extra_controls["text"].apply(lambda x: x[1])
extra_controls["chemsource_output"] = extra_controls.progress_apply(lambda x: 
                                                        classify_with_log_probs(x["name_used"],x["text"], model_openai), 
                                                        axis=1)
extra_controls.to_csv('../data/output/extra_controls_classified_gpt-4o.tsv', sep='\t', index=False)


100%|██████████| 1337/1337 [12:44<00:00,  1.75it/s]


In [5]:
extra_controls = pd.read_csv("../data/cleaned_data/extra_controls_cleaned_data_with_text.tsv", sep="\t")
extra_controls['text'] = extra_controls['text'].apply(literal_eval)
extra_controls["site"] = extra_controls["text"].apply(lambda x: x[0])
extra_controls["text"] = extra_controls["text"].apply(lambda x: x[1])


In [7]:
extra_controls.iloc[65]["text"]

"Xanthine ( or , from Ancient Greek ξανθός xanthós 'yellow' for its yellowish-white appearance; archaically xanthic acid; systematic name 3,7-dihydropurine-2,6-dione) is a purine base found in most human body tissues and fluids, as well as in other organisms. Several stimulants are derived from xanthine, including caffeine, theophylline, and theobromine. Xanthine is a product on the pathway of purine degradation. It is created from guanine by guanine deaminase. It is created from hypoxanthine by xanthine oxidoreductase. It is also created from xanthosine by purine nucleoside phosphorylase. Xanthine is subsequently converted to uric acid by the action of the xanthine oxidase enzyme. == Use and production == Xanthine is used as a drug precursor for human and animal medications, and is produced as a pesticide ingredient. == Clinical significance == Derivatives of xanthine (known collectively as xanthines) are a group of alkaloids commonly used for their effects as mild stimulants and as b

In [None]:
iloc[68]

featureID                                              32832.0
DF                                                    0.181641
synonyms     ('Limonene-1,2-epoxide', '(4r)-limonene-1,2-ep...
dataset                                                   food
text          In this work, we developed a solid lipid nano...
name_used                                 Limonene-1,2-epoxide
site                                                    PUBMED
Name: 68, dtype: object

In [None]:
iloc[68]

featureID                                              32832.0
DF                                                    0.181641
synonyms     ('Limonene-1,2-epoxide', '(4r)-limonene-1,2-ep...
dataset                                                   food
text          In this work, we developed a solid lipid nano...
name_used                                 Limonene-1,2-epoxide
site                                                    PUBMED
Name: 68, dtype: object

In [None]:
iloc[68]

featureID                                              32832.0
DF                                                    0.181641
synonyms     ('Limonene-1,2-epoxide', '(4r)-limonene-1,2-ep...
dataset                                                   food
text          In this work, we developed a solid lipid nano...
name_used                                 Limonene-1,2-epoxide
site                                                    PUBMED
Name: 68, dtype: object

In [8]:
classify_with_log_probs("Xanthine",extra_controls.iloc[65]["text"], model_openai)

('ENDOGENOUS, MEDICAL, INDUSTRIAL',
 [('END', -0.001113189267925918),
  ('OG', 0.0),
  ('ENO', -6.704273118884885e-07),
  ('US', 0.0),
  (',', -0.005234475247561932),
  (' MED', -0.0956529900431633),
  ('ICAL', 0.0),
  (',', -0.005240156780928373),
  (' INDUSTR', -0.03813421353697777),
  ('IAL', 0.0)])