In [1]:
import chemsource as cs
import pandas as pd
import re
from tqdm import tqdm
from ast import literal_eval
from tqdm import tqdm

from helpers import *

tqdm.pandas()

In [6]:
user_openai_key = pd.read_csv("dorrestein_api_keys.txt", header=None).values[0][0]
user_ncbi_key = pd.read_csv("dorrestein_api_keys.txt", header=None).values[1][0]

modified_classification_prompt = "Classify this compound, COMPOUND_NAME, as any combination of the following: MEDICAL, ENDOGENOUS, FOOD, PERSONAL CARE, INDUSTRIAL. Note that MEDICAL refers to compounds actively used as approved medications in humans or in late-stage clinical trials in humans. Note that ENDOGENOUS refers to compounds that are produced by the human body specifically. ENDOGENOUS excludes essential nutrients that cannot be synthesized by human body. Note that FOOD refers to compounds present in natural food items or food additives. Note that INDUSTRIAL should be used only for synthetic compounds not used as a contributing ingredient in the medical, personal care, or food industries. Note that PERSONAL CARE refers to non-medicated compounds typically used for activities such as skincare, beauty, and fitness. Specify INFO instead if more information is needed. DO NOT MAKE ANY ASSUMPTIONS, USE ONLY THE INFORMATION PROVIDED. A classification of INFO will also be rewarded when correctly applied and is strongly encouraged if information is of poor quality, if there is not enough information, or if you are not completely confident in your answer.  Provide the output as a plain text separated by commas, and provide only the categories listed (either list a combination of INDUSTRIAL, ENDOGENOUS, PERSONAL CARE, MEDICAL, FOOD or list INFO), with no justification. Provided Information:\n"
model = cs.ChemSource()
model.configure(prompt=modified_classification_prompt, openai_key=user_openai_key, ncbi_key=user_ncbi_key, temperature=0, top_p=1, logprobs=True)

tqdm.pandas()

In [3]:
adrc_cs_data_path = "./data_out/adrc_complete_final.tsv"
dust_cs_data_path = "./data_out/dust_complete_final.tsv"
iss_cs_data_path = "./data_out/iss_complete_final.tsv"
rosmap_cs_data_path = "./data_out/rosmap_data_final_output.tsv"

adrc_data = pd.read_csv(adrc_cs_data_path, sep="\t", index_col=0)
adrc_data.drop(columns=["synonyms", "X.Scan.","chemsource_raw_output", "text_length", "is_first_pass"], inplace=True)
dust_data = pd.read_csv(dust_cs_data_path, sep="\t", index_col=0 )
dust_data.drop(columns=["synonyms", "X.Scan.","chemsource_raw_output", "text_length", "is_first_pass"], inplace=True)
iss_data = pd.read_csv(iss_cs_data_path, sep="\t", index_col=0)
iss_data.drop(columns=["synonyms", "X.Scan.","chemsource_raw_output", "text_length", "is_first_pass"], inplace=True)
rosmap_data = pd.read_csv(rosmap_cs_data_path, sep="\t", index_col=0)
# rosmap_data.rename(columns={"name_used": "compound_name"}, inplace=True)
rosmap_data.drop(columns=["compound_name", "synonyms", "chemsource_raw_output","text_length"], inplace=True)
rosmap_data.rename(columns={"name_used": "compound_name"}, inplace=True)

In [22]:
adrc_data["second_classification"] = None
adrc_data["second_classification_log_probs"] = None

adrc_data
for num, row in tqdm(adrc_data.iterrows()):
    classification_output = model.classify(row["compound_name"],row["text"])
    adrc_data["second_classification"][num] = classification_output[0]

    log_probs = dict()
    for i in classification_output[1].content:
        log_probs.update({i.token: i.logprob})
    

    adrc_data["second_classification_log_probs"][num] = log_probs
    


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adrc_data["second_classification"][num] = classification_output[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to updat

In [25]:
dust_data["second_classification"] = None
dust_data["second_classification_log_probs"] = None

for num, row in tqdm(dust_data.iterrows()):
    classification_output = model.classify(row["compound_name"],row["text"])
    dust_data["second_classification"][num] = classification_output[0]

    log_probs = dict()
    for i in classification_output[1].content:
        log_probs.update({i.token: i.logprob})

    dust_data["second_classification_log_probs"][num] = log_probs

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dust_data["second_classification"][num] = classification_output[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to updat

In [26]:
dust_data.to_csv("./data_out/dust_complete_final_second_pass.tsv", sep="\t")

In [27]:
iss_data["second_classification"] = None
iss_data["second_classification_log_probs"] = None

for num, row in tqdm(iss_data.iterrows()):
    classification_output = model.classify(row["compound_name"],row["text"])
    iss_data["second_classification"][num] = classification_output[0]

    log_probs = dict()
    for i in classification_output[1].content:
        log_probs.update({i.token: i.logprob})

    iss_data["second_classification_log_probs"][num] = log_probs

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  iss_data["second_classification"][num] = classification_output[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update

In [32]:
rosmap_data["second_classification"] = None
rosmap_data["second_classification_log_probs"] = None

for num, row in tqdm(rosmap_data.iterrows()):
    classification_output = model.classify(row["compound_name"],row["text"])
    rosmap_data["second_classification"][num] = classification_output[0]

    log_probs = dict()
    for i in classification_output[1].content:
        log_probs.update({i.token: i.logprob})

    rosmap_data["second_classification_log_probs"][num] = log_probs

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  rosmap_data["second_classification"][num] = classification_output[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to upd

In [33]:
rosmap_data.to_csv("./data_out/rosmap_data_final_output_second_pass.tsv", sep="\t")