In [24]:
import pandas as pd
import os
from rxnmapper import RXNMapper
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

In [31]:
path = '/home/ruard/Documents/datasets/DA_reaxys_export/'
dfs = []
for name in ['DA_1.tsv', 'DA_2.tsv', 'DA_3.tsv', 'DA_4.tsv']:
    dfs.append(pd.read_csv(os.path.join(path, name), sep='\t'))
    
df = pd.concat(dfs)
print(len(df))

df.columns

29261


Index(['Reaction ID', 'Reaction: Links to Reaxys', 'Data Count',
       'Number of Reaction Details', 'Reaction Rank', 'Record Type',
       'Reactant', 'Product', 'Bin', 'Reaction',
       'Reaction Details: Reaction Classification', 'Example label',
       'Example title', 'Fulltext of reaction', 'Number of Reaction Steps',
       'Multi-step Scheme', 'Multi-step Details', 'Number of Stages',
       'Solid Phase', 'Time (Reaction Details) [h]',
       'Temperature (Reaction Details) [C]',
       'Pressure (Reaction Details) [Torr]', 'pH-Value (Reaction Details)',
       'Other Conditions', 'Reaction Type', 'Subject Studied',
       'Prototype Reaction', 'Named Reaction',
       'Type of reaction description (Reaction Details)', 'Location',
       'Comment (Reaction Details)', 'Product.1', 'Yield', 'Yield (numerical)',
       'Yield (optical)', 'Stage Reactant', 'Reagent', 'Catalyst',
       'Solvent (Reaction Details)', 'References', 'Links to Reaxys',
       'Unnamed: 41'],
      dt

In [45]:
da_rxn_smarts = AllChem.ReactionFromSmarts(
    '[#6:1]=[#6:2].[#6:3]=[#6:4][#6:5]=[#6:6]>>[#6:1]1[#6:2][#6:3][#6:4]=[#6:5][#6:6]1'
)

# diene_smarts = Chem.MolFromSmarts('[C,c,N,n,O]=[C,c,N,n][C,c,N,n]=[C,c,N,n,O]')
# dienophile_smarts = Chem.MolFromSmarts('[C,c,N,n]=[C,c,N,n]')
diene_smarts = Chem.MolFromSmarts('[C,c]=[C,c][C,c]=[C,c]')
dienophile_smarts = Chem.MolFromSmarts('[C,c]=[C,c]')

def simulate_da_reaction(substrates):
    products = []
    products += da_rxn_smarts.RunReactants(substrates)
    substrates = [substrates[1], substrates[0]]
    products += da_rxn_smarts.RunReactants(substrates)
    
    products = [Chem.MolToSmiles(product[0]) for product in products]
    products = list(set(products))
    return [Chem.MolFromSmiles(product) for product in products]

products = simulate_da_reaction([Chem.MolFromSmiles(smi) for smi in ["C=C", "C=CC=C"]])
Chem.MolToSmiles(products[0])

'C1=CCCCC1'

In [91]:
# no multistep reactions
df = df[df['Multi-step Details'].isnull()]
print(len(df))

# no NaN reactions 
df = df[~df['Reaction'].isna()]
print(len(df))

# only bimolecular reactions involving dienes and dienophiles 
filtered_reaction_ids = []
for reaction, reaction_id in zip(df['Reaction'].values, df['Reaction ID'].values):
    reactants, products = reaction.split('>>')
    if len(reactants.split('.')) == 2 and len(products.split('.')) == 1: 
        reactant1, reactant2 = reactants.split('.')
        reactant1 = Chem.MolFromSmiles(reactant1)
        reactant2 = Chem.MolFromSmiles(reactant2)
        
        if None not in [reactant1, reactant2]:
            da_products = simulate_da_reaction([reactant1, reactant2])
            if len(da_products) > 0: 
                filtered_reaction_ids.append(reaction_id)
        
df = df[df['Reaction ID'].isin(filtered_reaction_ids)]
print(len(df))

# no duplicate reaction smiles
df = df.drop_duplicates(subset=['Reaction'])
print(len(df))

29241
28349
6519
4467


In [92]:
df.to_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_raw.csv')

## Regioselectivity filter

In [32]:
df = pd.read_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_raw.csv')

In [None]:
# step 1. remove stereochemistry
df.insert()

In [40]:
for val in df['Yield (numerical)'].values:
    if str(val) != 'nan':
        try:
            print(float(val))
        except:
            print(float(val.split(';')[0]))

97.0
98.0
99.0
97.0
99.0
98.0
84.0
98.0
87.0
76.0
75.0
96.0
97.9
94.0
98.0
100.0
100.0
90.0
99.0
100.0
99.0
72.0
88.0
80.0
72.0
97.0
98.0
91.0
85.0
96.0
98.0
93.0
98.0
91.0
99.0
99.0
99.0
80.0
83.0
76.0
83.0
98.0
97.0
70.0
98.0
96.0
73.0
86.0
100.0
93.0
100.0
92.0
98.0
90.0
91.0
99.0
68.0
66.0
94.0
94.0
85.0
96.1
100.0
97.0
99.0
99.0
97.0
97.0
99.0
85.0
100.0
99.0
99.0
88.0
95.0
63.0
98.0
76.0
81.0
90.0
61.0
79.0
75.0
92.0
100.0
99.0
99.0
100.0
97.0
100.0
93.0
99.0
98.0
96.0
98.0
72.0
72.0
58.0
98.0
98.0
72.0
73.0
66.0
80.0
70.0
76.0
98.0
86.0
96.0
88.0
92.0
100.0
98.0
95.0
99.0
99.0
99.0
93.0
99.0
90.0
85.0
86.0
89.0
97.0
99.0
81.0
86.0
92.0
99.0
85.0
57.0
99.0
64.0
64.0
99.0
99.0
99.0
94.0
99.0
89.0
79.0
82.0
90.0
100.0
100.0
86.0
93.0
51.0
50.0
94.2
84.0
50.0
53.0
50.0
83.0
68.0
75.0
84.0
91.0
93.0
88.0
86.0
91.0
92.0
93.0
88.0
86.0
88.0
97.0
90.0
90.0
86.0
94.0
98.0
96.0
95.0
94.0
99.0
77.0
90.0
96.0
80.0
92.0
87.0
87.0
91.0
91.0
96.0
99.0
99.0
96.0
97.0
93.0
95.0
97.0
100.0
94.0
8

In [8]:
# first let's remove stereochemistry to avoid having stereoisomer duplicates
reaction_smiles_list = df['Reaction'].values
solvents = df['Solvent (Reaction Details)'].values

def normalize_mol(mol):
    return Chem.MolFromSmiles(Chem.MolToSmiles(mol))

new_reaction_smiles_list = []
solvent_dict = {}

for reaction_smiles, solvent in zip(reaction_smiles_list, solvents):
    reactants, product = reaction_smiles.split('>>')
    reactant1, reactant2 = reactants.split('.')
    reactant1 = Chem.MolFromSmiles(reactant1)
    reactant2 = Chem.MolFromSmiles(reactant2)
    product = Chem.MolFromSmiles(product)
    
    if None not in [reactant1, reactant2, product]:
        Chem.RemoveStereochemistry(reactant1)
        Chem.RemoveStereochemistry(reactant2)
        Chem.RemoveStereochemistry(product)

        new_smiles = f'{Chem.MolToSmiles(reactant1)}.{Chem.MolToSmiles(reactant2)}>>{Chem.MolToSmiles(product)}'
        
        # add solvent to dict
        if new_smiles not in solvent_dict.keys():
            solvent_dict[new_smiles] = solvent
        else:
            if str(solvent_dict[new_smiles]) == 'nan':
                solvent_dict[new_smiles] = solvent 
        
        # add reaction smiles
        new_reaction_smiles_list.append(new_smiles)

print(len(new_reaction_smiles_list))
new_reaction_smiles_list = list(set(new_reaction_smiles_list))
print(len(new_reaction_smiles_list))

# now we have to figure out if both motifs are symmetrically substituted
new_new_reaction_smiles_list = []
for item in new_reaction_smiles_list:
    reactant1, reactant2 = item.split('>>')[0].split('.')
    reactant1 = Chem.MolFromSmiles(reactant1)
    reactant2 = Chem.MolFromSmiles(reactant2)
    reactants = [reactant1, reactant2]
    output = simulate_da_reaction(reactants)
    if len(output) >= 2:
        output = [Chem.MolToSmiles(normalize_mol(mol)) for mol in output]
        if len(set(output)) >= 2:
            new_new_reaction_smiles_list.append(item)
            
print(len(new_new_reaction_smiles_list))

4467
4175
1691


In [None]:
# test to see if there are reactions with same substrates
for 

In [21]:
from autode.solvent.solvents import solvents

SOLVENTS = solvents

def get_solvent(reaction):
    return str(solvent_dict[reaction])

def check_if_solvent_available_in_xtb(solvent_string):
    if ';' in solvent_string:
        solvent_strings = solvent_string.split(';')
    else:
        solvent_strings = [solvent_string]
    
    for string in solvent_strings:
        for solvent in SOLVENTS:
            if solvent_string in solvent.aliases:
                if hasattr(solvent, 'xtb'):
                    return True
    return False

def get_xtb_solvent(solvent_string):
    for solvent in SOLVENTS:
        if solvent_string in solvent.aliases:
            return solvent.xtb

In [23]:
reaction_idx = []
substrates = []
products = []
solvents = []
reaction_smiles_list = []
labels = []

for idx, regio_reaction in enumerate(new_new_reaction_smiles_list):
    reactants, product = regio_reaction.split('>>')
    reactant1_smiles, reactant2_smiles = reactants.split('.')
    
    reactant1 = Chem.MolFromSmiles(reactant1_smiles)
    reactant2 = Chem.MolFromSmiles(reactant2_smiles)
    product = Chem.MolFromSmiles(product)
    reactants = [reactant1, reactant2]
    output = simulate_da_reaction(reactants)
    
    product_smiles = Chem.MolToSmiles(normalize_mol(product))
    output = [Chem.MolToSmiles(normalize_mol(mol)) for mol in output]
    
    matches = sum([mol == product_smiles for mol in output])
    if matches == 1 and product.GetNumHeavyAtoms() < 25 and check_if_solvent_available_in_xtb(get_solvent(regio_reaction)):
        for mol in output:
            reaction_smiles = f"{reactant1_smiles}.{reactant2_smiles}>>{mol}"
            substrates.append(f"{reactant1_smiles}.{reactant2_smiles}")
            products.append(mol)
            solvents.append(get_xtb_solvent(get_solvent(regio_reaction)))
            reaction_smiles_list.append(reaction_smiles)
            reaction_idx.append(idx)
            if mol == product_smiles:
                labels.append(1)
            else:
                labels.append(0)   
    
print(len(reaction_idx), len(set(reaction_idx)))    
    
df = pd.DataFrame({
    'reaction_idx': reaction_idx,
    'uid': np.arange(len(reaction_idx)),
    'substrates': substrates,
    'products': products,
    'solvent': solvents,
    'reaction_smiles': reaction_smiles_list,
    'label': labels,
    'simulation_idx': np.zeros(len(reaction_idx))
})    
    
df.to_csv('/home/ruard/Documents/datasets/DA_reaxys_export/DA_regio_dataset_v1.csv')

714 326
