In [1]:
import pandas as pd
from pandarallel import pandarallel
from helpers.preprocessing import *
import os
from ast import literal_eval
import chemsource as cs

# Start pandarallel so the Series.parallel_apply calls below are available:
# two worker processes, with a progress bar shown for each parallel call.
pandarallel.initialize(
    progress_bar=True,
    nb_workers=2,
)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# Create retrieval model
# The NCBI API key is read from the first cell of a headerless text file kept
# outside the notebook. dtype=str stops pandas from coercing a key that happens
# to look numeric into an int/float before it is passed on, and strip() removes
# any stray whitespace around it.
ncbi_key = pd.read_csv("../secrets/ncbi_api.txt", header=None, dtype=str).iat[0, 0].strip()
model = cs.ChemSource(ncbi_key=ncbi_key)

Main Dataset

In [5]:
# Load the drug-library table, discard the compound_name column and every row
# that has no value in the synonyms column.
validation_dataset = (
    pd.read_csv("../data/raw_validation_data/20240513_druglib_manual_class_with_synonyms.csv")
    .drop(columns=["compound_name"])
    .dropna(subset=["synonyms"])
)

# Run each raw synonyms entry through the helper pipeline
# (try_literal_eval -> filter_synonym_list -> preprocess_chemical), keep at most
# the first five results, and store them as a tuple so rows can be de-duplicated.
cleaned_synonyms = validation_dataset["synonyms"].parallel_apply(
    lambda raw: (preprocess_chemical(filter_synonym_list(try_literal_eval(raw))))[:5]
)
validation_dataset = validation_dataset.assign(synonyms=cleaned_synonyms.apply(tuple))
validation_dataset = validation_dataset.drop_duplicates(subset=["synonyms"])

# Keep only rows that still have at least one synonym after cleaning.
has_synonyms = validation_dataset["synonyms"].apply(len) != 0
validation_dataset = validation_dataset[has_synonyms].copy()

# Look up text for every synonym tuple with the retrieval model. The "text"
# column keeps whatever list_retrieve returns; "name_used" is its first element
# (or None when the result is empty/falsy).
retrieved = validation_dataset["synonyms"].parallel_apply(
    lambda names: list_retrieve(names, model)
)
validation_dataset["text"] = retrieved
validation_dataset["name_used"] = retrieved.apply(lambda hit: hit[0] if hit else None)

# Write the full table, then the subset of rows whose "text" is not missing.
validation_dataset.to_csv("../data/cleaned_data/cleaned_validation_data.csv", index=False)
validation_dataset_with_text = validation_dataset[validation_dataset["text"].notna()]
validation_dataset_with_text.to_csv(
    "../data/cleaned_data/cleaned_validation_data_with_text.csv", index=False
)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2979), Label(value='0 / 2979'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2877), Label(value='0 / 2877'))), …



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


Auxiliary Datasets

In [None]:
# Define datasets
# NOTE(review): os.listdir() returns entries in arbitrary order, so pairing the
# files with dataset_names by position only labels them correctly if the
# directory happens to list in this order — confirm, or map file names to
# labels explicitly.
datasets = [os.path.join("../data/raw_data", x) for x in os.listdir("../data/raw_data")]
dataset_names = ["iss", "mouse", "dust", "adrc_plasma", "rosmap", "adrc"]
if len(datasets) != len(dataset_names):
    raise ValueError(
        f"Found {len(datasets)} files in ../data/raw_data but "
        f"{len(dataset_names)} dataset names were given"
    )

# Combine datasets and label with names
aux_frames = []
for dataset, dataset_name in zip(datasets, dataset_names):
    data = pd.read_csv(dataset, on_bad_lines='skip', sep="\t")
    data["dataset"] = dataset_name
    aux_frames.append(data)
# Concatenate once instead of growing the frame inside the loop. ignore_index
# gives every row a unique label; without it each file restarts at 0 and any
# label-based operation would hit rows from several files at once.
all_data = pd.concat(aux_frames, ignore_index=True)

# Preprocess data
all_data = all_data.drop(columns=["compound_name"]).dropna(subset=["synonyms"])
# Run each raw synonyms entry through the helper pipeline, keep at most the
# first five results, and store them as a tuple so rows can be de-duplicated.
all_data = all_data.assign(
    synonyms=all_data["synonyms"]
    .parallel_apply(lambda x: (preprocess_chemical(filter_synonym_list(try_literal_eval(x))))[:5])
    .apply(tuple)
)
all_data = all_data.drop_duplicates(subset=["synonyms", "dataset"])
# Filter with a boolean mask rather than drop(index=...): dropping by label
# removes every row that shares the label, not just the rows with no synonyms.
all_data = all_data[all_data["synonyms"].apply(len) > 0].copy()

# Retrieve text
all_data["text"] = all_data["synonyms"].parallel_apply(lambda x: list_retrieve(x, model))
all_data["name_used"] = all_data["text"].apply(lambda x: x[0] if x else None)

# Clean retrieved text and save data
# "text" is reduced to the second element of the list_retrieve result.
all_data["text"] = all_data["text"].apply(lambda x: x[1] if x else None)
all_data.to_csv("../data/cleaned_data/all_cleaned_data.tsv", index=False, sep="\t")
all_data_with_text = all_data.dropna(subset=["text"])
# Also exclude rows whose text equals the tuple (None, None); again a mask so
# that only the matching rows themselves are removed.
is_empty_retrieval = all_data_with_text["text"] == (None, None)
all_data_with_text = all_data_with_text[~is_empty_retrieval]
all_data_with_text.to_csv("../data/cleaned_data/all_cleaned_data_with_text.tsv", index=False, sep="\t")

Additional Auxiliary Datasets

In [8]:
# Additional control datasets and the label stored for each in "dataset".
extra_controls = ["../data/raw_data_frequencies/food_annotation_full_metadata_cleaned.tsv",
                  "../data/raw_data_frequencies/PCP_annotation_full_metadata_cleaned.tsv"]
extra_controls_names = ["food", "personal"]

# Read each file, tag it with its label, and concatenate once. ignore_index
# gives every row a unique label; without it each file restarts at 0 and any
# label-based operation would hit rows from both files at once.
control_frames = []
for dataset, dataset_name in zip(extra_controls, extra_controls_names):
    data = pd.read_csv(dataset, on_bad_lines='skip', sep="\t")
    data["dataset"] = dataset_name
    control_frames.append(data)
extra_controls_data = pd.concat(control_frames, ignore_index=True)

# Preprocess data
extra_controls_data = extra_controls_data.drop(columns=["compound_name"]).dropna(subset=["synonyms"])
# Run each raw synonyms entry through the helper pipeline, keep at most the
# first five results, and store them as a tuple so rows can be de-duplicated.
extra_controls_data = extra_controls_data.assign(
    synonyms=extra_controls_data["synonyms"]
    .parallel_apply(lambda x: (preprocess_chemical(filter_synonym_list(try_literal_eval(x))))[:5])
    .apply(tuple)
)
extra_controls_data = extra_controls_data.drop_duplicates(subset=["synonyms", "dataset"])
# Filter with a boolean mask rather than drop(index=...): dropping by label
# removes every row that shares the label, not just the rows with no synonyms.
extra_controls_data = extra_controls_data[extra_controls_data["synonyms"].apply(len) > 0].copy()
# Retrieve text
extra_controls_data["text"] = extra_controls_data["synonyms"].parallel_apply(lambda x: list_retrieve(x, model))
extra_controls_data["name_used"] = extra_controls_data["text"].apply(lambda x: x[0] if x else None)
# Clean retrieved text and save data
# "text" is reduced to the second element of the list_retrieve result.
extra_controls_data["text"] = extra_controls_data["text"].apply(lambda x: x[1] if x else None)
extra_controls_data.to_csv("../data/cleaned_data/extra_controls_cleaned_data.tsv", index=False, sep="\t")
extra_controls_data_with_text = extra_controls_data.dropna(subset=["text"])
# Also exclude rows whose text equals the tuple (None, None); again a mask so
# that only the matching rows themselves are removed.
is_empty_retrieval = extra_controls_data_with_text["text"] == (None, None)
extra_controls_data_with_text = extra_controls_data_with_text[~is_empty_retrieval]
extra_controls_data_with_text.to_csv("../data/cleaned_data/extra_controls_cleaned_data_with_text.tsv", index=False, sep="\t")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4942), Label(value='0 / 4942'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1084), Label(value='0 / 1084'))), …



  lis = BeautifulSoup(html).find_all('li')


  lis = BeautifulSoup(html).find_all('li')


Detection Frequencies

In [2]:
# Build the relative path of every entry in the raw detection-frequency directory.
frequency_datasets = [
    os.path.join("../data/raw_data_frequencies", entry)
    for entry in os.listdir("../data/raw_data_frequencies")
]

In [4]:
# NOTE(review): os.listdir() returns entries in arbitrary order, so pairing the
# files with freq_dataset_names by position only labels them correctly if the
# directory happens to list in this order — confirm, or map file names to
# labels explicitly.
frequency_datasets = [os.path.join("../data/raw_data_frequencies", x) for x in os.listdir("../data/raw_data_frequencies")]
freq_dataset_names = ["adrc_plasma", "food", "iss","adrc","mouse", "rosmap", "personal","dust"]
if len(frequency_datasets) != len(freq_dataset_names):
    raise ValueError(
        f"Found {len(frequency_datasets)} files in ../data/raw_data_frequencies but "
        f"{len(freq_dataset_names)} dataset names were given"
    )

# Combine datasets and label with names
freq_frames = []
for dataset, dataset_name in zip(frequency_datasets, freq_dataset_names):
    # .csv files are read comma separated; every other file as tab separated.
    separator = "," if dataset.endswith(".csv") else "\t"
    data = pd.read_csv(dataset, on_bad_lines='skip', sep=separator)
    data["dataset"] = dataset_name
    freq_frames.append(data)
# Concatenate once instead of growing the frame on every loop iteration.
all_freq_data = pd.concat(freq_frames)
all_freq_data.to_csv("../data/cleaned_data/all_cleaned_data_frequencies.tsv", index=False, sep="\t")