In [1]:
from regex_patterns import *
from utils import *

# this is a dummy example
# Minimal stand-in for one parsed paper. The cells below read only the
# 'paragraphs' list: each entry's "text" is scanned for acronyms, and entries
# with "is_recipe" == 1 are the ones concatenated into the recipe text.
example = {
    # Placeholder identifier; not read by any cell in this notebook.
    'DOI':"12.34/1234a", 
    'paragraphs':[
        # Non-recipe paragraph; it introduces the "AuNSs" acronym next to its long form.
        {"text":"Introduction - Gold nanostars (AuNSs) have been used widely.... Thes nanostars are tunable....  Also, SiO2 nanorods (NRs) are.... ",
        "is_recipe":0},
        # The only paragraph flagged as a recipe (is_recipe == 1).
        {"text":"This is a made-up paragraph, just as an example for the search-based parser!! AuNSs were synthesized by a seed-mediated method. First, gold seeds were prepared. Briefly, HAuCl4·3H2O (1mL, 2.0 × 10-3 M) was mixed with 0.4 g of hexadecyltrimethylammonium bromide (CTAB). Then, a 5 mL solution containing 60 µM of ice-cold NaBH4 and 7mm CTAC was mixed with this. In another flask, 8.888 ml of 0.009 M AgNO3 was mixed with AA (0.1 mmol, 20mL). An aliquot of seed solution (200µL) was mixed with this solution and was aged for 12 h, then was centrifuged and dispersed in BDAC (1mL, 1mM). The TEM analysis showed that this resulted in high yield star-like nanoparticles.",
         "is_recipe":1},
        # Non-recipe paragraph (size characterisation).
        {"text":"The AuNSs synthesized had average length, diamger and aspect ratio of 10 ± 2.0 nm, between 20 and 25 nm, and up to 4.",
            "is_recipe":0},
    ]}

In [2]:
# Build the full paper text (one paragraph per line) for acronym detection.
recipetext = ""
papertext = "".join(paragraph['text'] + "\n" for paragraph in example['paragraphs'])

# acronym catcher
acronym_table = morph_acronym_catcher(papertext, target='Au', debug=False)
print(acronym_table)

# convert acronyms
# Each paragraph keeps its converted text under 'text_converted'; the converted
# texts are then re-joined into the whole-paper text and the recipe-only text.
converted_all, converted_recipe = [], []
for par in example['paragraphs']:
    converted = convert_goldmorphacronym(par['text'], acronym_table)
    par['text_converted'] = converted
    converted_all.append(converted + "\n")
    if par['is_recipe'] == 1:
        converted_recipe.append(converted + "\n")
papertext = "".join(converted_all)
recipetext = "".join(converted_recipe)


{'AuNS': 'Gold nanostar'}


In [3]:
# Morphology detection: search only the recipe text (acronyms already
# converted) for mentions tied to the 'Au' target.
morphs = detect_anytarget_morph(text_input=recipetext, target='Au')
print(morphs)


[{'text': 'Gold nanostars', 'groups': ['Au'], 'span': (78, 92), 'category': 'Str'}]


In [4]:
# Subrecipe separation: split the recipe text into a seed-solution part and a
# growth-solution part, using the sentence indices returned by subrecipe_separate.

# Defaults, so that every name the next cell reads is bound even when the
# separation fails (otherwise a fresh kernel hits NameError further down).
seedtext, growthtext = "", ""
# Presumably lists: the next cell concatenates them and expects a 2-element
# list [amount, unit] — TODO confirm against subrecipe_separate.
seedamount, seedamountunit = [], []

try:
    subrecipe_output = subrecipe_separate(recipetext, HAuCl4switch=True, initiate_whenprecamounts=True)
except Exception:  # Filter subrecipe_fatalerror
    # Bind the name so the validity check below reports the failure once,
    # instead of raising NameError or printing the error twice.
    subrecipe_output = None

# Filter subrecipe error: the result must be a 6-tuple whose first element is a list.
if (
    not isinstance(subrecipe_output, tuple)
    or len(subrecipe_output) != 6
    or not isinstance(subrecipe_output[0], list)
):
    print("\nsubrecipe ERROR, reciptext:\n", recipetext)

else:
    seedslnidx, growthslnidx, mixslndix, seedamount, seedamountunit, status = subrecipe_output
    recipetext_sentobject, pars_containprecamount = pars_sents_tokenizer(recipetext)
    recipetext_sentobject = [comp['senttext'] for comp in recipetext_sentobject]
    # Sentence tokenizers are different for the two methods;
    # we use the sidx of CDETokenizer to compare with MatER.
    for sidx, sent in enumerate(recipetext_sentobject):
        if sidx in seedslnidx:
            seedtext += " " + sent
        elif sidx in growthslnidx:
            if check_stringsinsent(breakstrings, sent):
                # Cut the sentence at the earliest break string: split()[0] is
                # the text before each break string (or the whole sentence when
                # it is absent), and the shortest such prefix is kept.
                growthtext += " " + min((sent.split(breakstr)[0] for breakstr in breakstrings), key=len) + "."
                # NOTE(review): not read anywhere in the visible notebook.
                discard_MatER_flag = True
                # Nothing after a break string is added to either text.
                break
            growthtext += " " + sent
    print(f"Seed solution recipe:\n{seedtext}\n\nGrowth solution recipe:\n{growthtext}")



Seed solution recipe:
 First, gold seeds were prepared. Briefly, HAuCl4·3H2O (1mL, 2.0 × 10-3 M) was mixed with 0.4 g of hexadecyltrimethylammonium bromide (CTAB). Then, a 5 mL solution containing 60 µM of ice-cold NaBH4 and 7mm CTAC was mixed with this.

Growth solution recipe:
 In another flask, 8.888 ml of 0.009 M AgNO3 was mixed with AA (0.1 mmol, 20mL). An aliquot of seed solution (200µL) was mixed with this solution and was aged for 12 h, then was.


In [5]:
# Precursor & amount detection: extract precursor amounts from the seed and
# growth texts, normalise them, and assemble the final recipe dictionary.
from pprint import pprint

precsamount_seed = parse_prec_amounts_regex(seedtext)
precsamount_growth = parse_prec_amounts_regex(growthtext)

# Seed aliquot added to the growth solution. It is only parsed when amount and
# unit combine into a 2-element list; otherwise the seed volume stays 0.
seedvolunt = seedamount + seedamountunit
seedvol = 0.0
if isinstance(seedvolunt, list) and len(seedvolunt) == 2:
    # The aliquot is parsed as if it were an 'H2O' solvent entry; element [1]
    # of the result is taken as the volume (presumably in litres, given the
    # include_vol_inL parameter below — TODO confirm).
    seedvol = parsefrom_amountlist(
        {'precursor_category': 'H2O', 'precsubcat': 'H2O', 'amount': [seedvolunt]},
        solvent=True,
    )[1]

precscatamount_growth, finalvol = precnormalize_amount(
    precsamount_growth, include_vol_inL=seedvol, version="Concentration", output_finalvol=True
)
precscatamount_seed = precnormalize_amount(
    precsamount_seed, include_vol_inL=0, version="Concentration", output_finalvol=False
)

# Default ratio, so extracted_recipe can always be built; previously the
# "precursor error" path left this name unbound and raised NameError below.
seedovergrowth = 0

# Filter precursor error: both normalised results must be non-empty lists.
if (
    not isinstance(precscatamount_growth, list)
    or not isinstance(precscatamount_seed, list)
    or len(precscatamount_growth) == 0
    or len(precscatamount_seed) == 0
):
    print("precursor error")
elif seedvol > 0 and finalvol > 0:
    # Seed volume divided by the final volume returned for the growth solution.
    seedovergrowth = seedvol / finalvol

extracted_recipe = {
    "SeedSln": precscatamount_seed,
    "GrowthSln": precscatamount_growth,
    "SeedSlnRaw": precsamount_seed,
    "GrowthSlnRaw": precsamount_growth,
    "SeedoverGrowth": seedovergrowth,
}
pprint(extracted_recipe)

{'GrowthSln': [{'amount': 0.003437843784378438, 'precursor_category': 'AA'},
               {'amount': 0.00275, 'precursor_category': 'AgNO3'}],
 'GrowthSlnRaw': [{'AgNO3': ['8.888', 'ml', '0.009', 'M']},
                  {'AA': ['0.1', 'mmol', '20', 'mL']}],
 'SeedSln': [{'amount': 0.005833333333333334, 'precursor_category': 'CTAC'},
             {'amount': 0.1829240407920611, 'precursor_category': 'CTAB'},
             {'amount': 0.0003333333333333333, 'precursor_category': 'AuCl4-'},
             {'amount': 5e-05, 'precursor_category': 'BH4-'}],
 'SeedSlnRaw': [{'HAuCl4·3H2O': ['1', 'mL', '2.0 × 10-3', 'M']},
                {'CTAB': ['0.4', 'g']},
                {'NaBH4': ['5', 'mL', '60', 'µM']},
                {'CTAC': ['3.5000000000000004e-05', 'mol']}],
 'SeedoverGrowth': 0.006875687568756875}
