In [1]:
import pandas as pd

In [2]:
# Read the harmonized feature table from its parquet file; the bare name on the
# last line renders the frame as this cell's output.
a = pd.read_parquet(
    path="../../data/public/harmonized/iss_harmonized.parquet",
)
a

Unnamed: 0,FEATURE_ID,COMPOUND_NAME,SYNONYMS,DETECTION_FREQUENCY
0,19,Massbank:CE000230 Tyrosine,"[Tyrosine, H-dl-tyr-oh, Tyrosine, dl-, Tyrosin...",0.066390
1,313,"NCGC00385412-01_C21H32O10_2,4-Pentadienoic aci...","[(2z,4e)-5-[(1r,3r,5r,8s)-8-hydroxy-1,5-dimeth...",0.020747
2,333,Phenylalanine - 40.00 eV,"[Phenylalanine, 2-amino-3-phenylpropanoic acid...",0.228216
3,466,"2-[(4-hydroxy-3,5-dimethoxyphenyl)methoxy]-6-(...","[2-[(4-hydroxy-3,5-dimethoxyphenyl)methoxy]-6-...",0.210235
4,929,D-TRYPTOPHAN - 50.0 eV,"[Tryptophan, H-d-trp-oh, Tryptophane, Dtryptop...",0.048409
...,...,...,...,...
89,25752,Contaminants septum vial Thermo C4000-53 and C...,[Contaminants septum vial Thermo C4000-53 and ...,0.031812
90,25858,ACARBOSE-unclear if this is accurate,[ACARBOSE-unclear if this is accurate],0.024896
91,26019,Spectral Match to Benzyldodecyldimethylammoniu...,"[Benzododecinium, Ajatin, Benzyldodecyldimethy...",0.038728
92,26087,Spectral Match to 1-Hexadecanoyl-2-(9Z-octadec...,"[Pc(16:0/18:1(9z)), Palmitoyloleoylphosphatidy...",0.002766


In [3]:
from chemsource import ChemSource
from pandarallel import pandarallel
import os

def retrieve_text_synonyms_list(synonyms_list, model, source_priority="WIKIPEDIA"):
    """
    Retrieve text for a compound by querying each of its synonyms in order.

    Every string synonym is passed to ``model.retrieve``. The first synonym whose
    returned source equals ``source_priority`` is returned immediately. If no
    synonym hits the priority source, the first synonym whose text is not
    ``"NO_RESULTS"`` is returned instead.

    Args:
        synonyms_list (iterable | str | None): Candidate names to look up.
            Non-string elements are skipped. A bare string is treated as a single
            synonym, and ``None`` / non-iterable values (e.g. a NaN cell) yield
            no result instead of raising.
        model: Object exposing ``retrieve(name)`` that returns a
            ``(source, text)`` pair.
        source_priority (str): Source label that short-circuits the search.

    Returns:
        tuple: ``(synonym, source, text)`` for the chosen synonym, or
        ``(None, None, None)`` when nothing usable was found.
    """
    no_result = (None, None, None)

    if synonyms_list is None:
        return no_result

    if isinstance(synonyms_list, str):
        # Iterating a bare string would query one character at a time.
        candidates = iter([synonyms_list])
    else:
        try:
            candidates = iter(synonyms_list)
        except TypeError:
            # Missing values loaded from a dataframe (e.g. float NaN) are not iterable.
            return no_result

    fallback = None
    for synonym in candidates:
        if not isinstance(synonym, str):
            continue

        source, text = model.retrieve(synonym)
        if source == source_priority:
            return synonym, source, text

        # Remember only the earliest usable hit; later ones can never be returned.
        if fallback is None and text != "NO_RESULTS":
            fallback = (synonym, source, text)

    return fallback if fallback is not None else no_result

def retrieve_text(harmonized_dataframe, ncbi_key=None, synonyms_column="SYNONYMS", source_column="SOURCE", updated_name_column="USED_NAME", text_column="TEXT"):
    """
    Look up text for every row of the harmonized dataframe from its synonyms.

    Each row's synonyms are passed to ``retrieve_text_synonyms_list``; the chosen
    synonym, its source and its text are written to three output columns of a
    copy of the input (the input frame itself is not modified).

    Args:
        harmonized_dataframe (pd.DataFrame): The harmonized dataframe.
        ncbi_key (str | None): Key forwarded to ``ChemSource``. When given, rows
            are processed in parallel with pandarallel; when omitted, rows are
            processed one at a time (presumably because unauthenticated requests
            are rate-limited -- confirm against the chemsource documentation).
        synonyms_column (str): The column containing each row's synonyms.
        source_column (str): Output column receiving the source of the text.
        updated_name_column (str): Output column receiving the synonym that was used.
        text_column (str): Output column receiving the retrieved text.

    Returns:
        pd.DataFrame: A copy of the input with the three output columns set.
        Rows for which nothing was found hold ``None`` in all three.
    """
    harmonized_dataframe = harmonized_dataframe.copy()
    synonyms = harmonized_dataframe[synonyms_column]

    if ncbi_key:
        model = ChemSource(ncbi_key=ncbi_key)

        # os.cpu_count() may return None when the count cannot be determined.
        cpu_total = os.cpu_count() or 1
        # NOTE(review): max() means at least 8 workers and no upper bound on large
        # machines; if 8 was meant as a cap, this should be min() -- confirm.
        pandarallel.initialize(progress_bar=True, nb_workers=max(8, cpu_total - 1))
        retrieval_results = list(
            synonyms.parallel_apply(
                lambda synonyms_list: retrieve_text_synonyms_list(synonyms_list, model)
            )
        )
    else:
        model = ChemSource()
        retrieval_results = [
            retrieve_text_synonyms_list(synonyms_list, model)
            for synonyms_list in synonyms
        ]

    # Assign positionally from plain lists: this is correct even when the index
    # contains duplicate labels, and it creates the output columns for an empty
    # frame as well, so both code paths return the same schema.
    harmonized_dataframe[updated_name_column] = [result[0] for result in retrieval_results]
    harmonized_dataframe[source_column] = [result[1] for result in retrieval_results]
    harmonized_dataframe[text_column] = [result[2] for result in retrieval_results]

    return harmonized_dataframe


In [None]:
# Read the NCBI API key from the environment instead of embedding it in the notebook.
# The key that was previously committed here should be treated as leaked and revoked.
# If the variable is unset this is None and retrieve_text uses its sequential path.
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
retrieve_text(a, ncbi_key=NCBI_API_KEY)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12), Label(value='0 / 12'))), HBox…



  lis = BeautifulSoup(html).find_all('li')


In [24]:
# Show the synonyms stored for the row labelled 19 (single label-based lookup).
a.loc[19, "SYNONYMS"]

array(['Bis(2,2,6,6-tetramethyl-4-piperidyl) sebacate',
       'Bis(2,2,6,6-tetramethylpiperidin-4-yl) decanedioate',
       'Tinuvin 770', 'Bis(2,2,6,6-tetramethyl-4-piperidyl)sebacate',
       'Sanol', 'Sanol 770', 'Eversorb 90', 'Sumisorb 577',
       'Tinuvin 770df', 'Tinuvin 770ls', 'Sanol ls 700', 'Sanol ls 770',
       'Mark la 77', 'Bis(2,2,6,6-tetramethyl-4-piperidinyl)sebacate',
       'Bis(2,2,6,6-tetramethyl-4-piperidinyl) sebacate', 'Ls 770',
       'Bis(2,2,6,6-tetramethylpiperidin-4-yl) sebacate',
       'Bis(2,2,6,6-tetramethyl-4-piperidinyl) decanedioate', 'T 770',
       'Sebacic acid bis(2,2,6,6-tetramethyl-4-piperidyl) ester',
       'Decanedioic acid, 1,10-bis(2,2,6,6-tetramethyl-4-piperidinyl) ester',
       'Antioxidant 770', 'Hillite 77', 'Viosorb 04', 'Adeka la 77',
       'Uvinul 4077h',
       'Decanedioic acid bis(2,2,6,6-tetramethyl-4-piperidyl) ester',
       'Tinuvin770', 'Lowilite 77', 'Gw 480', 'Tn 770', 'Uvaseb 770',
       'Ec 258-207-9', 'Cs-w013829'