In [16]:
import pandas as pd
import os
from rxnmapper import RXNMapper
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

In [18]:
path = '/home/ruard/Documents/datasets/MA_reaxys_export/'
dfs = []
for name in ['MA_1.tsv', 'MA_2.tsv', 'MA_3.tsv', 'MA_4.tsv', 'MA_5.tsv']:
    dfs.append(pd.read_csv(os.path.join(path, name), sep='\t'))
    
df = pd.concat(dfs)
print(len(df))

df.columns

FileNotFoundError: [Errno 2] No such file or directory: '/home/ruard/Documents/datasets/MA_reaxys_export/MA_1.tsv'

In [60]:
# no multistep reactions
df = df[df['Multi-step Details'].isnull()]
print(len(df))

# no NaN reactions 
df = df[~df['Reaction'].isna()]
print(len(df))

# no duplicate reaction smiles
df = df.drop_duplicates(subset=['Reaction'])
print(len(df))

33446
33317
20002


In [61]:
# filter reactions with multiple unsaturated carbons
SMARTS = Chem.MolFromSmarts('[#6:1]=[#6:2][#6:3](=[O:4])') # C=CC(=O)C

filtered_reaction_ids = []
for reaction, reaction_id in zip(df['Reaction'].values, df['Reaction ID'].values):
    reactants, products = reaction.split('>>')
    if len(reactants.split('.')) == 2 and len(products.split('.')) == 1: 
        reactant1, reactant2 = reactants.split('.')
        reactant1 = Chem.MolFromSmiles(reactant1)
        reactant2 = Chem.MolFromSmiles(reactant2)
        
        if reactant1 is not None and reactant2 is not None:
            if len(reactant1.GetSubstructMatches(SMARTS)) >= 2 or len(reactant2.GetSubstructMatches(SMARTS)) >= 2:
                filtered_reaction_ids.append(reaction_id)

df = df[df['Reaction ID'].isin(filtered_reaction_ids)]
print(len(df))

1210


In [62]:
# see if there could be a regioselective thing
reaction_smarts = AllChem.ReactionFromSmarts(
    "[#6:1]=[#6:2][#6:3](=[O:4]).[N,S,n,s:5]>>[N,S,n,s:5][#6:1]-[#6:2][#6:3](=[O:4])"
)

def simulate_reaction(substrates):
    products = []
    products += reaction_smarts.RunReactants(substrates)
    substrates = [substrates[1], substrates[0]]
    products += reaction_smarts.RunReactants(substrates)
    
    products = [Chem.MolToSmiles(product[0]) for product in products]
    products = list(set(products))
    products = [Chem.MolFromSmiles(product) for product in products]
    return list(filter(lambda x: x is not None, products))

filtered_reaction_ids = []
idx = 0
for reaction, reaction_id in zip(df['Reaction'].values, df['Reaction ID'].values):
    reactants, products = reaction.split('>>')
    reactant1, reactant2 = reactants.split('.')
    reactant1 = Chem.MolFromSmiles(reactant1)
    reactant2 = Chem.MolFromSmiles(reactant2)

    out = simulate_reaction([reactant1, reactant2])
    if len(out) >= 2:
        filtered_reaction_ids.append(reaction_id)
        
df = df[df['Reaction ID'].isin(filtered_reaction_ids)]
print(len(df))

413


In [63]:
# filter out reaction where product has 30+ heavy atoms
filtered_reaction_ids = []
idx = 0
for reaction, reaction_id in zip(df['Reaction'].values, df['Reaction ID'].values):
    reactants, products = reaction.split('>>')
    product = Chem.MolFromSmiles(products)
    if product.GetNumHeavyAtoms() < 50:
        filtered_reaction_ids.append(reaction_id)
    
df = df[df['Reaction ID'].isin(filtered_reaction_ids)]
print(len(df))

382


In [15]:
from autode.solvent.solvents import solvents

SOLVENTS = solvents

def get_solvent(reaction):
    return str(solvent_dict[reaction])

def check_if_solvent_available_in_xtb(solvent_string):
    if ';' in solvent_string:
        solvent_strings = solvent_string.split(';')
    else:
        solvent_strings = [solvent_string]
    
    for string in solvent_strings:
        for solvent in SOLVENTS:
            if solvent_string in solvent.aliases:
                if hasattr(solvent, 'xtb'):
                    return True
    return False

def get_xtb_solvent(solvent_string):
    for solvent in SOLVENTS:
        if solvent_string in solvent.aliases:
            return solvent.xtb

In [64]:
def normalize_mol(mol):
    return Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))


reaction_idx = []
substrates = []
products = []
solvents = []
reaction_smiles_list = []
labels = []

for idx, (reaction_smiles, solvent) in enumerate(zip(df['Reaction'].values, df['Solvent (Reaction Details)].values)):
    reactants, product = reaction_smiles.split('>>')
    reactant1_smiles, reactant2_smiles = reactants.split('.')
    
    reactant1 = Chem.MolFromSmiles(reactant1_smiles)
    reactant2 = Chem.MolFromSmiles(reactant2_smiles)
    product = Chem.MolFromSmiles(product)
    reactants = [reactant1, reactant2]
    output = simulate_reaction(reactants)
    
    product_smiles = Chem.MolToSmiles(normalize_mol(product), isomericSmiles=False)
    output = [Chem.MolToSmiles(normalize_mol(mol), isomericSmiles=False) for mol in output]
    output = list(set(output))
    
    if len(output) >= 2 and product_smiles in output and check_if_solvent_available_in_xtb(solvent):
        for mol in output:
            reaction_smiles = f"{reactant1_smiles}.{reactant2_smiles}>>{mol}"
            substrates.append(f"{reactant1_smiles}.{reactant2_smiles}")
            products.append(mol)
            solvents.append(get_xtb_solvent(solvent))
            reaction_smiles_list.append(reaction_smiles)
            reaction_idx.append(idx)
            if mol == product_smiles:
                labels.append(1)
            else:
                labels.append(0)   
    
print(len(reaction_idx), len(set(reaction_idx)))    
    
df = pd.DataFrame({
    'reaction_idx': reaction_idx,
    'uid': np.arange(len(reaction_idx)),
    'substrates': substrates,
    'products': products,
    'solvent': solvents,
    'reaction_smiles': reaction_smiles_list,
    'label': labels,
    'simulation_idx': np.zeros(len(reaction_idx))
})    
    
df.to_csv('/home/ruard/code/virtual_reactions/data/datasets/ma/ma_dataset_solvent.csv')

826 362


In [1]:
import pandas as pd
import os
from rxnmapper import RXNMapper
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

from src.reactions.ma.ma_reaction import MAReaction
from src.methods.methods import XtbMethod

In [2]:
df = pd.read_csv('/home/ruard/code/virtual_reactions/data/datasets/ma/ma_dataset.csv')

reaction = MAReaction(
    substrate_smiles=df['substrates'].values[0],
    product_smiles=df['products'].values[0],
    solvent=None,
    method=XtbMethod(),
    has_openmm_compatability=False,
    compute_product_only=False
)

out = reaction._get_transition_state()

In [None]:
energies = reaction.compute_conformer_energies()
energies

In [1]:
import os
from src.dataset import Dataset
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt

In [8]:
from src.reactions.ma.ma_dataset import XtbSimulatedMADataset

dataset = XtbSimulatedMADataset(
    csv_file_path="ma/xtb_simulated_ma_dataset.csv"
)

df = dataset.load(
    aggregation_mode='low',
    margin=0 / 627.5
)

df
# targets, preds = [], []
# for idx in df['reaction_idx'].unique():
#     target = df[(df['reaction_idx'] == idx) & (df['simulation_idx'] == 0)]['label']
#     pred = df[(df['reaction_idx'] == idx) & (df['simulation_idx'] == 1)]['label']

#     if len(pred) > 0 and len(target) > 0:
#         targets.append(target.values[0])
#         preds.append(pred.values[0])

# score = roc_auc_score(targets, preds)

# score

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reaction_idx,uid,substrates,products,reaction_smiles,label,simulation_idx,barrier
0,0,0.0,0,0,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
1,1,1.0,0,1,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,1.0,0.0,
2,2,2.0,0,2,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
3,3,3.0,0,3,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
4,4,4.0,1,4,COC1=CC=CC=C1S.CC1=C2[C@H]3OC(=O)C(=C)[C@@H]3C...,COc1ccccc1SCC1C(=O)OC2C3=C(C)C(=O)C=CC3(C)CCC12,COC1=CC=CC=C1S.CC1=C2[C@H]3OC(=O)C(=C)[C@@H]3C...,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...
1647,821,,380,1647,[H][C@@]12[C@@H](O)[C@@H]([C@H](C[C@@]1(COC(=O...,C=CC12COC(=O)C(CSCCN)C1C(O)C(C(=C)C(=O)OC)C(OC...,,0.0,1.0,-57.579164
1648,822,,380,1648,[H][C@@]12[C@@H](O)[C@@H]([C@H](C[C@@]1(COC(=O...,C=CC12COC(=O)C(CNCCS)C1C(O)C(C(=C)C(=O)OC)C(OC...,,1.0,1.0,-57.733770
1649,823,,380,1649,[H][C@@]12[C@@H](O)[C@@H]([C@H](C[C@@]1(COC(=O...,C=CC12COC(=O)C(=C)C1C(O)C(C(CSCCN)C(=O)OC)C(OC...,,0.0,1.0,-57.567405
1650,824,,381,1650,COC(=O)C=CC(=O)NC1=CC2=CC=CC=C2OC1=O.OCCS,COC(=O)CC(SCCO)C(=O)Nc1cc2ccccc2oc1=O,,1.0,1.0,-35.500393


In [12]:
df.iloc[:4]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reaction_idx,uid,substrates,products,reaction_smiles,label,simulation_idx,barrier
0,0,0.0,0,0,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
1,1,1.0,0,1,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,1.0,0.0,
2,2,2.0,0,2,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
3,3,3.0,0,3,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,


In [14]:
df[df['reaction_idx'] == 0]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reaction_idx,uid,substrates,products,reaction_smiles,label,simulation_idx,barrier
0,0,0.0,0,0,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
1,1,1.0,0,1,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,1.0,0.0,
2,2,2.0,0,2,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CNC(CS)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
3,3,3.0,0,3,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CSCC(N)C(=O)O,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C...,0.0,0.0,
826,0,,0,826,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCNC(CS)C(=O)O,,0.0,1.0,35.260702
827,1,,0,827,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=C(C)C(=O)OCC(O)COC(=O)CCSCC(N)C(=O)O,,0.0,1.0,35.24993
828,2,,0,828,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CNC(CS)C(=O)O,,0.0,1.0,35.126455
829,3,,0,829,[H][C@](N)(CS)C(O)=O.CC(=C)C(=O)OCC(O)COC(=O)C=C,C=CC(=O)OCC(O)COC(=O)C(C)CSCC(N)C(=O)O,,1.0,1.0,35.114545
