In [88]:
import pandas as pd
import os
from rxnmapper import RXNMapper
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

In [89]:
# Directory that holds the tab-separated export parts.
path = '/home/ruard/Documents/datasets/DA_reaxys_export/'

# Read each part into its own frame, then stack all parts row-wise.
dfs = []
for name in ('DA_1.tsv', 'DA_2.tsv', 'DA_3.tsv', 'DA_4.tsv'):
    tsv_path = os.path.join(path, name)
    dfs.append(pd.read_csv(tsv_path, sep='\t'))

df = pd.concat(dfs)
print(len(df))

29261


In [90]:
# Reaction template applied to a pair of substrates.
#   reactant 1: a carbon-carbon double bond            (mapped atoms 1-2)
#   reactant 2: a four-carbon C=C-C=C unit             (mapped atoms 3-6)
#   product:    six-membered ring 1-2-3-4=5-6, i.e. new bonds 2-3 and 6-1
#               and a double bond left between atoms 4 and 5
da_rxn_smarts = AllChem.ReactionFromSmarts(
    '[#6:1]=[#6:2].[#6:3]=[#6:4][#6:5]=[#6:6]'
    '>>'
    '[#6:1]1[#6:2][#6:3][#6:4]=[#6:5][#6:6]1'
)

# Substructure queries for the two carbon-only motifs.
# Alternative patterns that also admit N and O atoms, kept for reference:
# diene_smarts = Chem.MolFromSmarts('[C,c,N,n,O]=[C,c,N,n][C,c,N,n]=[C,c,N,n,O]')
# dienophile_smarts = Chem.MolFromSmarts('[C,c,N,n]=[C,c,N,n]')
diene_smarts, dienophile_smarts = map(
    Chem.MolFromSmarts,
    ('[C,c]=[C,c][C,c]=[C,c]', '[C,c]=[C,c]'),
)

def simulate_da_reaction(substrates):
    """Enumerate the unique products of applying `da_rxn_smarts` to two substrates.

    The template is run with the substrates in both orders, so either one may
    play either role in the reaction template.

    Args:
        substrates: sequence of exactly two RDKit molecules.

    Returns:
        A list of RDKit molecules, one per unique product SMILES, ordered by
        SMILES string so the result is reproducible between runs. Products whose
        SMILES cannot be parsed back into a molecule are left out, so the list
        never contains None.
    """
    first, second = substrates[0], substrates[1]

    product_sets = []
    for ordering in ((first, second), (second, first)):
        product_sets.extend(da_rxn_smarts.RunReactants(ordering))

    # The template yields a single product per match, hence product_set[0].
    # Sorting (rather than list(set(...))) removes the dependence on string
    # hash randomization, which otherwise changes the order on every new kernel.
    unique_smiles = sorted({Chem.MolToSmiles(product_set[0]) for product_set in product_sets})

    # MolFromSmiles returns None when a product cannot be parsed/sanitized;
    # callers feed the result straight into MolToSmiles, so drop those here.
    candidates = (Chem.MolFromSmiles(smiles) for smiles in unique_smiles)
    return [mol for mol in candidates if mol is not None]

In [91]:
# Drop every row that carries multi-step details.
df = df[df['Multi-step Details'].isna()]
print(len(df))

# Drop rows without a reaction string.
df = df[df['Reaction'].notna()]
print(len(df))

# Keep only two-reactant / one-product reactions for which the reaction
# template produces at least one product.
filtered_reaction_ids = []
for reaction, reaction_id in zip(df['Reaction'].values, df['Reaction ID'].values):
    reactants, products = reaction.split('>>')
    reactant_parts = reactants.split('.')
    if len(reactant_parts) != 2 or len(products.split('.')) != 1:
        continue

    reactant1 = Chem.MolFromSmiles(reactant_parts[0])
    reactant2 = Chem.MolFromSmiles(reactant_parts[1])
    if reactant1 is None or reactant2 is None:
        # unparsable reactant SMILES
        continue

    da_products = simulate_da_reaction([reactant1, reactant2])
    if len(da_products) > 0:
        filtered_reaction_ids.append(reaction_id)

df = df[df['Reaction ID'].isin(filtered_reaction_ids)]
print(len(df))

# Collapse rows that share the same reaction string.
df = df.drop_duplicates(subset=['Reaction'])
print(len(df))

29241
28349
6519
4467


In [92]:
# Persist the filtered reactions; with the default arguments the DataFrame
# index is written as the first (unnamed) column of the CSV.
df.to_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_raw.csv')

## Regioselectivity filter

In [None]:
# Reload the filtered reactions saved above so this section can be run on its
# own. The original line called an undefined `to_csv` function (NameError).
# index_col=0 restores the index that DataFrame.to_csv wrote as the first column.
df = pd.read_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_raw.csv', index_col=0)

In [97]:
# Reaction strings from the 'Reaction' column. Stereochemistry is removed from
# these below to avoid having stereoisomer duplicates.
reaction_smiles_list = df['Reaction'].values

def normalize_mol(mol):
    """Round-trip a molecule through SMILES: write it out, then parse it again."""
    smiles = Chem.MolToSmiles(mol)
    return Chem.MolFromSmiles(smiles)

# Rewrite every reaction without stereochemistry, skipping reactions in which
# any of the three species fails to parse.
new_reaction_smiles_list = []

for reaction_smiles in reaction_smiles_list:
    reactants, product = reaction_smiles.split('>>')
    reactant1, reactant2 = reactants.split('.')

    mols = [Chem.MolFromSmiles(smiles) for smiles in (reactant1, reactant2, product)]
    if any(mol is None for mol in mols):
        continue

    # RemoveStereochemistry modifies the molecule in place.
    for mol in mols:
        Chem.RemoveStereochemistry(mol)

    reactant1, reactant2, product = (Chem.MolToSmiles(mol) for mol in mols)
    new_reaction_smiles_list.append(f'{reactant1}.{reactant2}>>{product}')

print(len(new_reaction_smiles_list))
# Deduplicate while keeping first-seen order. list(set(...)) would order the
# reactions by randomized string hashes, so the reaction indices derived from
# this list would change on every kernel restart.
new_reaction_smiles_list = list(dict.fromkeys(new_reaction_smiles_list))
print(len(new_reaction_smiles_list))

# Keep only the reactions for which the template gives at least two distinct
# products once the products have been normalized.
new_new_reaction_smiles_list = []
for item in new_reaction_smiles_list:
    reactant_side = item.split('>>')[0]
    reactant1, reactant2 = reactant_side.split('.')
    reactants = [Chem.MolFromSmiles(reactant1), Chem.MolFromSmiles(reactant2)]

    output = simulate_da_reaction(reactants)
    if len(output) < 2:
        # fewer than two candidates: nothing to choose between
        continue

    output = [Chem.MolToSmiles(normalize_mol(mol)) for mol in output]
    if len(set(output)) >= 2:
        new_new_reaction_smiles_list.append(item)

print(len(new_new_reaction_smiles_list))

KeyError: 'Reaction'

In [98]:
# Products with this many heavy atoms or more are excluded from the dataset.
MAX_HEAVY_ATOMS = 25

# Parallel column lists: one entry per (reaction, candidate product) pair.
reaction_idx = []
substrates = []
products = []
reaction_smiles_list = []
labels = []

for idx, regio_reaction in enumerate(new_new_reaction_smiles_list):
    reactants, product = regio_reaction.split('>>')
    reactant1_smiles, reactant2_smiles = reactants.split('.')

    reactant1 = Chem.MolFromSmiles(reactant1_smiles)
    reactant2 = Chem.MolFromSmiles(reactant2_smiles)
    product = Chem.MolFromSmiles(product)
    reactants = [reactant1, reactant2]
    output = simulate_da_reaction(reactants)

    product_smiles = Chem.MolToSmiles(normalize_mol(product))
    # Distinct template outputs can collapse to the same SMILES once normalized.
    # Deduplicate (keeping order) so that such a candidate is emitted only once,
    # and so that a recorded product matching two equivalent candidates does not
    # cause the whole reaction to be discarded by the single-match check below.
    output = list(dict.fromkeys(Chem.MolToSmiles(normalize_mol(mol)) for mol in output))

    # The recorded product must be exactly one of the candidates.
    matches = output.count(product_smiles)
    if matches != 1 or product.GetNumHeavyAtoms() >= MAX_HEAVY_ATOMS:
        continue

    substrate_smiles = f"{reactant1_smiles}.{reactant2_smiles}"
    for mol in output:
        substrates.append(substrate_smiles)
        products.append(mol)
        reaction_smiles_list.append(f"{substrate_smiles}>>{mol}")
        reaction_idx.append(idx)
        # label 1 for the recorded product, 0 for every other candidate
        labels.append(1 if mol == product_smiles else 0)
    
# Assemble the per-candidate columns into one table: `uid` numbers the rows
# 0..n-1 and `simulation_idx` is filled with zeros.
n_rows = len(reaction_idx)
dataset_columns = {
    'reaction_idx': reaction_idx,
    'uid': np.arange(n_rows),
    'substrates': substrates,
    'products': products,
    'reaction_smiles': reaction_smiles_list,
    'labels': labels,
    'simulation_idx': np.zeros(n_rows),
}
df = pd.DataFrame(dataset_columns)

df.to_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_regio_dataset_v1.csv')