In [1]:
import json

import pandas as pd
from IPython.display import clear_output, display
from PIL import Image

# NOTE(review): star import kept last so it shadows names exactly as before;
# the project helpers used below are presumably provided by it.
from utils import *

# Session configuration: dataset name and sampling iteration to correct.
dataset, samp_iter = 'USPTO_50K', 5
chemist_name = get_user_name()

# Load this chemist's sampled reactions and the previously fixed templates
# (the second value returned by load_fixed_templates is not used here).
sampled_data = load_sampled_data(dataset, chemist_name, samp_iter)
accepted_templates, _ = load_fixed_templates(dataset, chemist_name, samp_iter)

# Results of the manual correction pass, both keyed by reaction data index.
remapped_rxn_dict, remapped_temp_dict = dict(), dict()

print('Chemist Name: %s, Correcting %d reaction data' % (chemist_name, len(sampled_data)))
print('Loaded %d templates' % len(accepted_templates))

Chemist Name: Shuan, Correcting 200 reaction data
Loaded 436 templates


In [5]:
# Manually check the atom-atom mapping (AAM) of every sampled reaction.
# At the 'Correct' prompt: '1' accepts the reaction, '2' rejects it, and any
# other input (e.g. '0') opens the 'Remap' prompt, where a corrected reactant
# string is entered and the template is re-extracted.
# Reactions whose template is already in accepted_templates are accepted
# without prompting.

def _show_reaction(rxn):
    """Call save_reaction(rxn) and display 'mol.png' inline.

    save_reaction presumably writes 'mol.png' (confirm in utils). The image
    is opened in a context manager so the file handle is released before the
    next call rewrites the file.
    """
    save_reaction(rxn)
    with Image.open('mol.png') as img:
        display(img)


n_total = len(sampled_data)
# The stored 'template' column is unpacked but unused: the template is always
# re-extracted from the (possibly remapped) reaction string.
for i, (idx, rxn, _stored_temp) in enumerate(zip(sampled_data['data_idx'], sampled_data['mapped_rxn'], sampled_data['template'])):
    if idx in remapped_rxn_dict:
        # Already accepted in an earlier run of this cell; skip on resume.
        continue
    r, p = rxn.split('>>')
    temp = extract_from_reaction(rxn)
    answer = '1'

    while True:
        # Known-good template (possibly after a remap): accept automatically.
        if temp in accepted_templates:
            answer = '1'
            break
        print(rxn)
        print('Reactant: \n', r); print('Template: \n', temp)
        _show_reaction(rxn)
        # Progress is shown 1-based so the last reaction reads n_total/n_total.
        # Strip whitespace so an answer like '1 ' is not mistaken for a remap.
        answer = input('Correct (%d/%d)?' % (i + 1, n_total)).strip()
        if answer in ('1', '2'):
            break
        remap = input('Remap (%d/%d)...' % (i + 1, n_total)).strip()
        if not is_valid_mapping(remap):
            print('Not valid mapping!')
            continue
        # Replace the reactant side only; the product side is kept as loaded.
        r = remap
        rxn = '%s>>%s' % (r, p)
        temp = extract_from_reaction(rxn)

    # Show the final state of the reaction; with wait=True the output is only
    # cleared once the next output arrives.
    _show_reaction(rxn)
    if answer == '1':
        remapped_rxn_dict[idx] = rxn
        remapped_temp_dict[idx] = temp
        accepted_templates.add(temp)

    clear_output(wait=True)

print('Correction finished. Mapped %d reactions.' % len(remapped_rxn_dict))

Correction finished. Mapped 200 reactions.


In [8]:
# Double-check the templates accepted during this correction session:
# collect the distinct templates recorded in remapped_temp_dict and print them.
import pprint

new_accepted_templates = {template for template in remapped_temp_dict.values()}
pprint.pprint(new_accepted_templates)

{'Br-[C:1](-Br)=[C:2]>>[C:1]#[C:2]',
 'Br-[C:1].Br-[C:2]=C>>[C:1]-[C:2]',
 'Br-[C:1].C-C1(-C)-O-B(-[c:2])-O-C-1(-C)-C>>[C:1]-[c:2]',
 'Br-[C:1].C-O-N(-C)-[C:2]>>[C:1]-[C:2]',
 'Br-[C:1].C=[C:2]-[Mg+]>>[C:1]-[C:2]',
 'Br-[C:1].I-[c:2]>>[C:1]-[c:2]',
 'Br-[C:1].O-[C:2]=[O:3]>>[C:1]-[C:2]-[O:3]',
 'Br-[C:1].O=C-[O:2]>>[C:1]-[O:2]',
 'Br-[C:1].[C:2]=[O:3]>>[C:1]-[C:2]-[O:3]',
 'Br-[c:1].Br-[C:2]=C>>[C:2]-[c:1]',
 'Br-[c:1].C-C-C-C-[Sn](-C-C-C-C)(-C-C-C-C)-[C:2]#N>>[C:2]-[c:1]',
 'Br-[c:1].C-C-C-C-[Sn](-[c:2])(-C-C-C-C)-C-C-C-C>>[c:1]-[c:2]',
 'Br-[c:1].C-C-[Si](-C-C)(-C-C)-[C:2]#C>>[C:2]-[c:1]',
 'Br-[c:1].C-N1-C-C(=O)-O-B(-[c:2])-O-C(=O)-C-1>>[c:1]-[c:2]',
 'Br-[c:1].C-[O:2]-[P:3]>>[c:1]-[P:3]=[O:2]',
 'Br-[c:1].C-[Si](-C)(-C)-[C:2]#C>>[C:2]-[c:1]',
 'Br-[c:1].C=[C:2]-[Mg+]>>[C:2]-[c:1]',
 'Br-[c:1].O=C(-[O-])-C1-C=C-O-B(-[c:2])-O-1>>[c:1]-[c:2]',
 'Br-[c:1]>>[c:1]',
 'C#[C:1].C=[C:2]-O-S(=O)(=O)-C(-F)(-F)-F>>[C:1]-[C:2]',
 'C-C(-C)(-C)-O-C(=O)-[N:1]>>[N:1]',
 'C-C(-C)(-C)-[O:1]-C=O>>[O:1

In [9]:
# Sort by reaction index before exporting, so the saved rows are ordered by
# data_idx regardless of the order in which reactions were reviewed.
remapped_idxs = sorted(remapped_temp_dict)
# Both dicts are filled together in the review loop, so every index in
# remapped_temp_dict is also a key of remapped_rxn_dict.
remapped_rxns = [remapped_rxn_dict[idx] for idx in remapped_idxs]
remapped_temps = [remapped_temp_dict[idx] for idx in remapped_idxs]

df = pd.DataFrame({'data_idx': remapped_idxs, 'mapped_rxn': remapped_rxns, 'template': remapped_temps})
save_fixed_data(df, dataset, chemist_name, samp_iter)