In [11]:
import pandas as pd
import numpy as np

import psycopg2
# The database password lives in a local file so that it never appears in the
# notebook itself. The path is named here so it is easy to spot and change.
PASSWORD_FILE = '/home/pstjohn/bdeops_pass'

with open(PASSWORD_FILE, 'r') as f:
    passwd = f.read().strip()  # strip the trailing newline left by most editors

# Keyword arguments handed to psycopg2.connect(**dbparams) by the cells below.
dbparams = {
    'dbname': 'bde',
    'port': 5432,
    'host': 'yuma.hpc.nrel.gov',
    'user': 'bdeops',
    'password': passwd,
    # Plain string: there is nothing to interpolate, so no f-prefix is needed.
    'options': '-c search_path=bde',
}

In [16]:
from tqdm import tqdm
from bde.fragment import fragment_iterator

In [21]:
# Candidate molecules to add, as SMILES strings. The order is significant only
# in that it is the order in which the later cells iterate over them.
smiles_to_add = pd.Series([
    # Central carbon carrying 4, 3, 2 and 1 halogen(s); remaining positions are methyl.
    'FC(F)(F)F', 'FC(C)(F)F', 'FC(C)(C)F', 'CC(C)(C)F',
    # NOTE(review): the last entry of this row is 'CC(C)(C)C', whereas the F/Br/I
    # rows end in 'CC(C)(C)X' -- confirm this is intentional and not a typo for
    # 'CC(C)(C)Cl'.
    'ClC(Cl)(Cl)Cl', 'ClC(C)(Cl)Cl', 'ClC(C)(C)Cl', 'CC(C)(C)C',
    'BrC(Br)(Br)Br', 'BrC(C)(Br)Br', 'BrC(C)(C)Br', 'CC(C)(C)Br',
    'IC(I)(I)I', 'IC(C)(I)I', 'IC(C)(C)I', 'CC(C)(C)I',
    # Tri-, di- and mono-halomethanes.
    'FC(F)F', 'FCF', 'CF',
    'ClC(Cl)Cl', 'ClCCl', 'CCl',
    'BrC(Br)Br', 'BrCBr', 'CBr',
    'IC(I)I', 'ICI', 'CI',
])

In [25]:
# rdkit is referenced below but was not imported by any earlier cell, so this
# cell raised NameError on a fresh kernel. Import it where it is used.
import rdkit.Chem

# Round-trip every SMILES through RDKit (parse, then write back out) so that
# equivalent spellings of the same molecule collapse to one string
# (MolToSmiles writes canonical SMILES by default).
smiles_to_add = smiles_to_add.apply(
    lambda x: rdkit.Chem.MolToSmiles(rdkit.Chem.MolFromSmiles(x)))

# Displayed result: True would mean two input entries denote the same molecule.
smiles_to_add.duplicated().any()

False

In [31]:
def ysi_frag_iterator():
    """Yield every row that `fragment_iterator` produces for each entry of `smiles_to_add`.

    The outer loop is wrapped in `tqdm`, so progress is reported per input
    SMILES rather than per yielded row.
    """
    for parent_smiles in tqdm(smiles_to_add):
        yield from fragment_iterator(parent_smiles)

# Collect every generated row into one frame, then keep only the rows whose
# `is_valid_stereo` column is truthy. The original row labels are retained
# (the index is not reset).
rdf_to_add = (
    pd.DataFrame(ysi_frag_iterator())
    .loc[lambda frame: frame.is_valid_stereo]
)

100%|██████████| 28/28 [00:05<00:00,  4.74it/s]


In [26]:
# Columns of the reaction table that hold a SMILES string.
species_cols = ['molecule', 'fragment1', 'fragment2']

# One row per distinct SMILES, tagged with the column it first appeared in
# ('molecule', 'fragment1' or 'fragment2'). melt() stacks the columns in the
# order given, so a species that is both a parent molecule and a fragment is
# tagged as 'molecule'.
cdf_to_add = (
    rdf_to_add[species_cols]
    .drop_duplicates()
    .melt(value_name='SMILES', var_name='Type')
    .drop_duplicates(subset='SMILES')
    .reset_index(drop=True)
)

# Reorder the cdf such that reactions are collocated: flattening row by row
# gives molecule, fragment1, fragment2, molecule, ... in reaction order.
calc_order = rdf_to_add[species_cols].values.flatten()
calc_order = pd.Series(calc_order, name='SMILES').drop_duplicates()

cdf_to_add = cdf_to_add.set_index('SMILES').reindex(calc_order).reset_index()

# Collapse 'fragment1' / 'fragment2' to 'fragment'. The pattern must be a raw
# string ('\d' is an invalid escape otherwise) and regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal by default, which would leave
# the trailing digit in place.
cdf_to_add['Type'] = cdf_to_add['Type'].str.replace(r'\d$', '', regex=True)
cdf_to_add.head()

Unnamed: 0,SMILES,Type
0,FC(F)(F)F,molecule
1,F[C](F)F,fragment
2,[F],fragment
3,FC(C)(F)F,molecule
4,C[C](F)F,fragment


In [27]:
# Load the reactions and the finished compounds that are already in the database.
#
# A psycopg2 connection used as a context manager only ends the transaction on
# exit; it does NOT close the connection. Open a single connection for both
# reads and close it explicitly so no server-side session is left behind.
conn = psycopg2.connect(**dbparams)
try:
    rdf = pd.read_sql_query("""
    SELECT *
    FROM reaction
    ORDER BY rid
    ;
    """, conn)

    cdf = pd.read_sql_query("""
    SELECT *
    FROM unique_compound
    where status='finished'
    ;
    """, conn)
finally:
    conn.close()

In [36]:
# Keep only the species whose SMILES is not already listed in `cdf`.
# NOTE(review): `cdf` is loaded with status='finished' only, so a compound that
# exists with any other status is not filtered out here -- confirm that is intended.
cdf_to_add2 = cdf_to_add[~cdf_to_add.SMILES.isin(cdf.smiles)]

# psycopg2's `with conn` commits (or rolls back on error) but never closes the
# connection, so the close is done explicitly in `finally`.
conn = psycopg2.connect(**dbparams)
try:
    with conn, conn.cursor() as cur:
        for _idx, row in tqdm(cdf_to_add2.iterrows(), total=len(cdf_to_add2)):
            # Values are passed as query parameters; `run` is fixed to 21 in the SQL.
            cur.execute("""
            INSERT INTO Compound (smiles, type, run)
            VALUES (%s, %s, 21) RETURNING id
            """, (row.SMILES, row.Type))
finally:
    conn.close()

100%|██████████| 40/40 [00:00<00:00, 1185.67it/s]


In [40]:
# Make sure we're not adding duplicate reactions.
# A reaction is identified by (molecule, bond_index).
reaction_key = ['molecule', 'bond_index']

# Inner join of the candidates with the existing reactions (kept for inspection).
merged_rdf = rdf_to_add.merge(rdf, on=reaction_key)

# merge() returns a fresh 0..k-1 index, so comparing rdf_to_add.index with
# merged_rdf.index drops arbitrary rows instead of the ones that matched.
# Use a left anti-join instead: with de-duplicated right-hand keys, a left merge
# yields exactly one output row per candidate, in the candidates' order, and
# `_merge` tells us whether that candidate already exists.
existing_keys = rdf[reaction_key].drop_duplicates()
match_flags = rdf_to_add[reaction_key].merge(
    existing_keys, on=reaction_key, how='left', indicator=True)
assert len(match_flags) == len(rdf_to_add)

already_present = (match_flags['_merge'] == 'both').values
rdf_to_add2 = rdf_to_add[~already_present]

In [41]:
# Insert the new reactions in a single transaction.
# psycopg2's `with conn` commits (or rolls back on error) but never closes the
# connection, so the close is done explicitly in `finally`.
conn = psycopg2.connect(**dbparams)
try:
    with conn, conn.cursor() as cur:
        for _idx, row in tqdm(rdf_to_add2.iterrows(), total=len(rdf_to_add2)):
            cur.execute("""
            INSERT INTO reaction (molecule, bond_index, fragment1, fragment2, bond_type)
            VALUES (%s, %s, %s, %s, %s)
            """, (row.molecule, row.bond_index, row.fragment1, row.fragment2, row.bond_type))
finally:
    conn.close()

100%|██████████| 171/171 [00:00<00:00, 1736.29it/s]
