In [11]:
import chemsource as cs
import pandas as pd
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from helpers.classify import *
from pandarallel import pandarallel

# Start the pandarallel worker pool (with progress bars) used by the
# `parallel_apply` calls in this notebook, then register tqdm's pandas
# integration.
N_WORKERS = 15
pandarallel.initialize(nb_workers=N_WORKERS, progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Create the no-search model and configure it to use OpenAI GPT-4.1
# Read the OpenAI API key as plain text instead of parsing the secret file as
# CSV: a CSV parser applies delimiter, quoting and NA/numeric inference rules
# that have nothing to do with an opaque token. The first non-blank line,
# stripped of surrounding whitespace, is used as the key.
with open("../secrets/openai_api.txt", encoding="utf-8") as key_file:
    openai_key = next((line.strip() for line in key_file if line.strip()), "")
if not openai_key:
    # Fail here rather than with an authentication error on the first API call.
    raise ValueError("No API key found in ../secrets/openai_api.txt")

# System prompt for classification from the compound name alone (no search).
NO_RAG_PROMPT = "You are a helpful scientist that will classify the provided compounds as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that MEDICAL refers to compounds actively used as approved medications in humans or in late-stage clinical trials in humans. Note that ENDOGENOUS refers to compounds that are produced by the human body specifically. ENDOGENOUS excludes essential nutrients that cannot be synthesized by the human body. Note that FOOD refers to compounds present in natural food items or food additives. Note that INDUSTRIAL should be used only for synthetic compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS. A classification of INFO will also be rewarded when correctly applied and is strongly encouraged if your information is of poor quality, if you do not have enough information, or if you are not completely confident in your answer. Provide the output as a plain text separated by commas, and provide only the categories listed (either list a combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE, MEDICAL, FOOD or list INFO), with no justification."

# Build the classifier with search disabled; token log-probabilities are
# requested so they are returned alongside each label.
model_openai_no_rag = cs.ChemSource()
model_openai_no_rag.configure(
    model="gpt-4.1-2025-04-14",
    prompt=NO_RAG_PROMPT,
    search=False,
    model_api_key=openai_key,
    temperature=0,
    top_p=1,
    logprobs=True,
)


In [3]:
# Smoke test on a single well-known compound before the full run; the
# recorded output is a (label, logprobs) tuple.
# NOTE(review): the arguments are positional here, whereas the search run
# calls classify(name=..., information=...) by keyword -- confirm that the
# positional order really is (information, name).
model_openai_no_rag.classify("", "aspirin")

('MEDICAL',
 ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='MED', bytes=[77, 69, 68], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='ICAL', bytes=[73, 67, 65, 76], logprob=0.0, top_logprobs=[])], refusal=None))

In [9]:
# NOTE(review): this reads the CSV that a later cell writes with
# validation_data.to_csv(...), so it only succeeds after a previous run.
# On a fresh top-to-bottom run the next cell rebinds `validation_data`
# from the cleaned input file anyway.
validation_data = pd.read_csv('../data/output/validation_data_classified_all.csv')

In [None]:
# Load the cleaned validation set and keep the first row for each distinct
# `name_used` value.
validation_data = pd.read_csv("../data/cleaned_data/cleaned_validation_data_with_text.csv")
validation_data.drop_duplicates(subset=["name_used"], inplace=True)

# The "text" column holds Python-literal strings; parse each one once, then
# unpack element 0 into "site" and element 1 back into "text".
# (presumably each literal is a (site, text) pair -- TODO confirm)
parsed_text = validation_data["text"].apply(literal_eval)
validation_data["site"] = parsed_text.apply(lambda pair: pair[0])
validation_data["text"] = parsed_text.apply(lambda pair: pair[1])

In [17]:
# Classify every row in parallel with the no-search model, passing an empty
# string and the compound name; the result is stored in a new column.
validation_data["chemsource_output_gpt-4-1"] = validation_data.parallel_apply(
    lambda row: classify_with_log_probs("", row["name_used"], model_openai_no_rag),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=331), Label(value='0 / 331'))), HB…

In [21]:
# Persist the name-only classification results (row index not written); the
# search-based section reloads this file before splitting it into chunks.
validation_data.to_csv("../data/output/validation_data_classified_all.csv", index=False)

Classification Using Search Function

In [3]:
# Create the search-enabled model and configure it to use OpenAI GPT-4.1

# System prompt for the search-enabled run: same category definitions as the
# name-only prompt, plus a request for source URLs after a semicolon.
SEARCH_PROMPT = "You are a helpful scientist that will classify the provided compounds as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that MEDICAL refers to compounds actively used as approved medications in humans or in late-stage clinical trials in humans. Note that ENDOGENOUS refers to compounds that are produced by the human body specifically. ENDOGENOUS excludes essential nutrients that cannot be synthesized by the human body. Note that FOOD refers to compounds present in natural food items or food additives. Note that INDUSTRIAL should be used only for synthetic compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS. A classification of INFO will also be rewarded when correctly applied and is strongly encouraged if your information is of poor quality, if you do not have enough information, or if you are not completely confident in your answer. Provide the output as a plain text separated by commas, and provide only the categories listed (either list a combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE, MEDICAL, FOOD or list INFO), with no justification. Provide sources as another plain text comma separated list separated from the first list by a semicolon (;). DO NOT PROVIDE ANYTHING EXCEPT COMMA SEPARATED RAW URLs AFTER THE SEMICOLON. Provided compound: "

# Build the classifier with search enabled and forced; log-probabilities are
# not requested for this run. Relies on `openai_key` from the earlier
# model-setup cell.
# NOTE(review): this uses the unpinned alias "gpt-4.1", while the no-search
# model pins "gpt-4.1-2025-04-14" -- confirm whether the two should match.
model_openai_search = cs.ChemSource()
model_openai_search.configure(
    model="gpt-4.1",
    prompt=SEARCH_PROMPT,
    search=True,
    force_search=True,
    search_context="medium",
    model_api_key=openai_key,
    temperature=0,
    top_p=1,
    logprobs=False,
)


In [12]:
# Reload the name-only classification results and partition them into
# N_CHUNKS pieces so the search-based run can write its output to disk one
# chunk at a time.
N_CHUNKS = 50
validation_data = pd.read_csv('../data/output/validation_data_classified_all.csv')

# Split row *positions* rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. The partition is the same (the first len % N_CHUNKS chunks
# get one extra row) and the original index labels are kept. `.copy()` makes
# each chunk an independent frame, so adding a column later does not trigger
# SettingWithCopyWarning.
validation_data_chunks = [
    validation_data.iloc[positions].copy()
    for positions in np.array_split(np.arange(len(validation_data)), N_CHUNKS)
]

# One output CSV per chunk, in the same order as `validation_data_chunks`.
validation_data_chunk_outnames = [
    f"../data/output/search_split/validation_data_classified_chunk_{i}.csv"
    for i in range(N_CHUNKS)
]

  return bound(*args, **kwds)


In [None]:
# Run the search-based classification chunk by chunk, writing each chunk to
# its own CSV as soon as it is finished.
#
# Processing starts at chunk RESUME_FROM_CHUNK (presumably because earlier
# chunks were already written by a previous run -- set to 0 for a full run).
# The chunk and filename lists are sliced without being rebound: overwriting
# them made this cell non-idempotent (each re-run dropped another 40 entries)
# and left the recombine step reading only the tail of the chunk files.
RESUME_FROM_CHUNK = 40

for dataframe, outname in zip(
    validation_data_chunks[RESUME_FROM_CHUNK:],
    validation_data_chunk_outnames[RESUME_FROM_CHUNK:],
):
    # Work on an independent copy so the new column is never assigned onto a
    # slice of the parent frame.
    dataframe = dataframe.copy()
    dataframe["chemsource_output_search_gpt"] = dataframe.parallel_apply(
        lambda row: model_openai_search.classify(name=row["name_used"], information=""),
        axis=1,
    )
    # Keep the index in the file: the recombine step reads it back with
    # index_col=0.
    dataframe.to_csv(outname, index=True)
    

  return bound(*args, **kwds)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7), Label(value='0 / 7'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7), Label(value='0 / 7'))), HBox(c…

In [13]:
# Recombine chunks: read each per-chunk CSV back (its first column is used as
# the row index) and stack the frames in order, keeping those index labels.
validation_data_chunks = []
for chunk_path in validation_data_chunk_outnames:
    validation_data_chunks.append(pd.read_csv(chunk_path, index_col=0))
validation_data = pd.concat(validation_data_chunks)

In [15]:
# Write the recombined frame to a single CSV; the row index is not written.
validation_data.to_csv("../data/output/validation_data_classified_all_3_methods.csv", index=False)

In [58]:
# Interactive sanity check: shows the type of the first chunk.
# NOTE(review): leftover debugging cell with an out-of-order execution count;
# consider removing it from the final notebook.
type(validation_data_chunks[0])

pandas.core.frame.DataFrame