In [None]:
!pip install rdkit-pypi==2022.3.1
!pip install pip install OpenNMT-py==2.2.0

Collecting rdkit-pypi==2022.3.1
  Downloading rdkit_pypi-2022.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.5 MB)
[K     |████████████████████████████████| 22.5 MB 1.5 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.3.1
Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting OpenNMT-py==2.2.0
  Downloading OpenNMT_py-2.2.0-py3-none-any.whl (216 kB)
[K     |████████████████████████████████| 216 kB 3.6 MB/s 
[?25hCollecting configargparse
  Downloading ConfigArgParse-1.5.3-py3-none-any.whl (20 kB)
Collecting torchtext==0.5.0
  Downloading torchtext-0.5.0-py3-none-any.whl (73 kB)
[K     |████████████████████████████████| 73 kB 901 kB/s 
[?25hCollecting waitress
  Downloading waitress-2.1.1-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 4.6 MB/s 
Collecting pyonmttok<2,>=1.23
  Downloading pyonmttok-1.31.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.6 MB)

In [None]:
import gdown
import os
import random
import re

import pandas as pd

from tqdm import tqdm
from rdkit import Chem

# to display molecules
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG=True

In [None]:
def download_data():
  """Download the six USPTO_480k text files into the ``USPTO_480k`` directory.

  The directory is created when missing. A file that already exists at its
  destination is not downloaded again; a short message is printed instead.
  """
  # links from https://github.com/coleygroup/Graph2SMILES/blob/main/scripts/download_raw_data.py
  sources = [
      ("https://drive.google.com/uc?id=1RysNBvB2rsMP0Ap9XXi02XiiZkEXCrA8", "src-train.txt"),
      ("https://drive.google.com/uc?id=1CxxcVqtmOmHE2nhmqPFA6bilavzpcIlb", "tgt-train.txt"),
      ("https://drive.google.com/uc?id=1FFN1nz2yB4VwrpWaBuiBDzFzdX3ONBsy", "src-val.txt"),
      ("https://drive.google.com/uc?id=1pYCjWkYvgp1ZQ78EKQBArOvt_2P1KnmI", "tgt-val.txt"),
      ("https://drive.google.com/uc?id=10t6pHj9yR8Tp3kDvG0KMHl7Bt_TUbQ8W", "src-test.txt"),
      ("https://drive.google.com/uc?id=1FeGuiGuz0chVBRgePMu0pGJA4FVReA-b", "tgt-test.txt"),
  ]
  out_dir = 'USPTO_480k'
  os.makedirs(out_dir, exist_ok=True)
  for link, filename in sources:
    destination = os.path.join(out_dir, filename)
    # Guard clause: skip files that are already on disk.
    if os.path.exists(destination):
      print(f"{destination} already exists")
      continue
    gdown.download(link, destination, quiet=False)

def canonicalize_smiles(smiles):
  """Parse ``smiles`` with RDKit and write it back out as a SMILES string.

  The parsed molecule is handed to ``Chem.MolToSmiles`` without any check,
  so an invalid SMILES is expected to end in an exception rather than a
  return value.
  """
  parsed_mol = Chem.MolFromSmiles(smiles)
  return Chem.MolToSmiles(parsed_mol)


In [None]:
# Delete the `sample_data` directory (no error if it is absent), then fetch
# the dataset files; files already on disk are only reported, not re-downloaded.
!rm -rf sample_data
download_data()

USPTO_480k/src-train.txt already exists
USPTO_480k/tgt-train.txt already exists
USPTO_480k/src-val.txt already exists
USPTO_480k/tgt-val.txt already exists
USPTO_480k/src-test.txt already exists
USPTO_480k/tgt-test.txt already exists


In [None]:
# Peek at the first lines of the training inputs (the precursors side).
!head USPTO_480k/src-train.txt

C 1 C C O C 1 . C C ( C ) C [Mg+] . C O N ( C ) C ( = O ) c 1 c c c ( O ) n c 1 . [Cl-]
C N . O . O = C ( O ) c 1 c c c ( Cl ) c ( [N+] ( = O ) [O-] ) c 1
C C n 1 c c ( C ( = O ) O ) c ( = O ) c 2 c c ( F ) c ( - c 3 c c c ( N ) c c 3 ) c c 2 1 . O = C O
C C ( C ) = C ( Cl ) N ( C ) C . C O C C ( C ) O c 1 c c ( O c 2 c n c ( C ( = O ) N 3 C C C 3 ) c n 2 ) c c ( C ( = O ) O ) c 1 . C c 1 c n c ( N ) c n 1 . Cl C Cl . c 1 c c n c c 1
Cl c 1 c c 2 c ( Cl ) n c ( - c 3 c c n c c 3 ) n c 2 s 1 . N C c 1 c c c ( Cl ) c ( Cl ) c 1
C C ( = O ) O . C c 1 c ( Cl ) n n c ( C ( C # N ) c 2 c c c ( F ) c ( C # N ) c 2 ) c 1 C . Cl . O
C C ( N ) c 1 c c c ( F ) c ( Cl ) c 1 . N C 1 C C c 2 c c ( F ) c c c 2 1 . O = C ( N 1 C C c 2 c c c ( Cl ) c ( O S ( = O ) ( = O ) C ( F ) ( F ) F ) c 2 C C 1 ) C ( F ) ( F ) F
C C ( C ) N 1 C C N ( C ( = O ) c 2 c c c 3 [nH] c ( C ( = O ) N 4 C C N ( S ( C ) ( = O ) = O ) C C 4 ) c c 3 c 2 ) C C 1 . C C O C ( = O ) N 1 C C N C C 1
C C ( C ( = O ) O ) C ( = O ) N

In [None]:
# Peek at the first lines of the training targets (the products side).
!head USPTO_480k/tgt-train.txt

C C ( C ) C C ( = O ) c 1 c c c ( O ) n c 1
C N c 1 c c c ( C ( = O ) O ) c c 1 [N+] ( = O ) [O-]
C C n 1 c c ( C ( = O ) O ) c ( = O ) c 2 c c ( F ) c ( - c 3 c c c ( N C = O ) c c 3 ) c c 2 1
C O C C ( C ) O c 1 c c ( O c 2 c n c ( C ( = O ) N 3 C C C 3 ) c n 2 ) c c ( C ( = O ) N c 2 c n c ( C ) c n 2 ) c 1
Cl c 1 c c 2 c ( N C c 3 c c c ( Cl ) c ( Cl ) c 3 ) n c ( - c 3 c c n c c 3 ) n c 2 s 1
C c 1 c ( Cl ) n n c ( C c 2 c c c ( F ) c ( C # N ) c 2 ) c 1 C
C C ( N c 1 c ( Cl ) c c c 2 c 1 C C N ( C ( = O ) C ( F ) ( F ) F ) C C 2 ) c 1 c c c ( F ) c ( Cl ) c 1
C C O C ( = O ) N 1 C C N ( C ( = O ) c 2 c c 3 c c ( C ( = O ) N 4 C C N ( C ( C ) C ) C C 4 ) c c c 3 [nH] 2 ) C C 1
C C ( C ( = O ) N C c 1 c c c ( F ) c c 1 ) C ( = O ) N C 1 C ( = O ) N ( C ) c 2 c c c c c 2 - c 2 c c c c c 2 1
C C ( = O ) N 1 C C N ( c 2 c c c ( N C ( = O ) C c 3 c c c ( Br ) c ( C ( F ) ( F ) F ) c 3 ) n c 2 ) C C 1


In [None]:
# load data into dataframes

In [None]:
# Count the training reactions in Python rather than with `cat ... | wc -l`:
# `wc -l` counts newline characters, so it reports one line too few when the
# last line of the file has no trailing newline.
with open('USPTO_480k/src-train.txt', 'r') as f:
  total = sum(1 for _line in f)

In [None]:
# Display the number of lines counted in src-train.txt.
total

409034

In [None]:
# Ideally, every SMILES would be canonicalized here. To save time we skip this step and assume that the SMILES in the dataset are already canonical.
```
def _load_canonical_smiles(path):
  """Read a space-tokenized SMILES file and canonicalize every line.

  The lines are read first so that tqdm can take its total from the list
  length. The previous `cat ... | wc -l` total counts newline characters
  (one short when the file has no trailing newline) and was computed on the
  src file only, then reused for the matching tgt file.

  :param path: path of the text file, one tokenized SMILES per line.
  :return: list with one canonical SMILES string per line.
  """
  with open(path, 'r') as f:
    raw_smiles = [line.strip().replace(' ', '') for line in f]
  return [canonicalize_smiles(raw) for raw in tqdm(raw_smiles)]

precursors_train = _load_canonical_smiles('USPTO_480k/src-train.txt')
products_train = _load_canonical_smiles('USPTO_480k/tgt-train.txt')
precursors_val = _load_canonical_smiles('USPTO_480k/src-val.txt')
products_val = _load_canonical_smiles('USPTO_480k/tgt-val.txt')
precursors_test = _load_canonical_smiles('USPTO_480k/src-test.txt')
products_test = _load_canonical_smiles('USPTO_480k/tgt-test.txt')
# Keep `total` defined as before: the number of lines of the last src file read.
total = len(precursors_test)
```

  2%|▏         | 9710/409034 [00:05<07:35, 877.42it/s]

KeyboardInterrupt: ignored

In [None]:
def _read_smiles_lines(path):
  """Return the lines of ``path`` with surrounding whitespace and all spaces removed.

  No canonicalisation is done here: the strings are used exactly as they
  appear in the file once the token-separating spaces are dropped.
  """
  with open(path, 'r') as handle:
    return [raw_line.strip().replace(' ', '') for raw_line in handle]

# Reaction inputs (src-*) and outputs (tgt-*) for each of the three splits.
precursors_train = _read_smiles_lines('USPTO_480k/src-train.txt')
products_train = _read_smiles_lines('USPTO_480k/tgt-train.txt')
precursors_val = _read_smiles_lines('USPTO_480k/src-val.txt')
products_val = _read_smiles_lines('USPTO_480k/tgt-val.txt')
precursors_test = _read_smiles_lines('USPTO_480k/src-test.txt')
products_test = _read_smiles_lines('USPTO_480k/tgt-test.txt')

In [None]:
# Canonicalize only the validation precursors (a small file) so they can be
# compared with the strings stored in the dataset.
with open('USPTO_480k/src-val.txt', 'r') as f:
  raw_precursors_val = [line.strip().replace(' ', '') for line in f]
# Take the total from the lines actually read: `wc -l` counts newline
# characters and is one short when the last line has no trailing newline,
# which makes the tqdm bar overrun its total.
total = len(raw_precursors_val)
can_precursors_val = [canonicalize_smiles(raw) for raw in tqdm(raw_precursors_val, total=total)]

30000it [00:12, 2445.51it/s]                           


In [None]:
# Compare the dataset strings with the output of canonicalize_smiles and print
# the first pair that differs. A difference does not mean the dataset is wrong:
# there is no single standard canonical SMILES, so the result depends on the
# toolkit (and possibly its version) that produced it.
#
# A plain comparison is used instead of assert/except AssertionError, because
# assert statements are skipped when Python runs with -O.
#
# NOTE: `smiles` and `can_smiles` stay bound to the first differing pair after
# the break; do not rename them without checking the cells that follow.
for smiles, can_smiles in zip(precursors_val, can_precursors_val):
  if smiles != can_smiles:
    print(smiles)
    print(can_smiles)
    break

C1COCCO1.COc1cc2sc3ccc(Br)cc3n3cc(Cc4cccnc4)c(=O)c(c1)c23.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O
C1COCCO1.COc1cc2c3c(c1)c(=O)c(Cc1cccnc1)cn3-c1cc(Br)ccc1S2.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O


'O=C1C(C#N)=C(C#N)C(=O)C(Cl)=C1Cl.C1COCCO1.COc1cc2c3c(c1)c(=O)c(Cc1cccnc1)cn3-c1cc(Br)ccc1S2'

In [None]:
# One row per training reaction: input precursors and the expected product.
train_df = pd.DataFrame(dict(precursors=precursors_train, products=products_train))
print(f"The training set contains {train_df.shape[0]} reactions.")
train_df.head()

In [None]:
# One row per validation reaction: input precursors and the expected product.
val_df = pd.DataFrame(dict(precursors=precursors_val, products=products_val))
print(f"The validation set contains {val_df.shape[0]} reactions.")
val_df.head()

In [None]:
# One row per test reaction: input precursors and the expected product.
test_df = pd.DataFrame(dict(precursors=precursors_test, products=products_test))
print(f"The test set contains {test_df.shape[0]} reactions.")
test_df.head()

In [None]:
# What if now we wanted to do some data augmentation on the training set

def randomize_smiles(smiles, random_type="rotated"):
    """
    Returns a random (non-canonical) SMILES given the SMILES of a molecule.

    Adapted from: https://github.com/rxn4chemistry/rxn_yields/blob/master/nbs/06_data_augmentation.ipynb
    Inspired from: https://github.com/undeadpixel/reinvent-randomized and https://github.com/GLambard/SMILES-X

    Randomization types:
      - unrestricted: RDKit writes the SMILES with ``doRandom=True``.
      - restricted: the atoms are renumbered in a shuffled order, then written without canonicalisation.
      - rotated: the atom numbering is cyclically shifted by a random offset, then written without canonicalisation.

    :param smiles: The SMILES string to randomize.
    :param random_type: The type (unrestricted, restricted, rotated) of randomization performed.
    :return : A random SMILES string of the same molecule or None if the SMILES is invalid
        (a message is printed in that case).
    :raises ValueError: If random_type is not one of the three supported types.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        print(f"{smiles} not valid.")
        return None

    if random_type == "unrestricted":
        return Chem.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=True)
    elif random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
        return Chem.MolToSmiles(random_mol, canonical=False, isomericSmiles=True)
    elif random_type == 'rotated':
        n_atoms = mol.GetNumAtoms()
        if n_atoms == 0:
            # Nothing to rotate; random.randint(0, -1) would raise an opaque
            # "empty range" ValueError for a molecule without atoms.
            return Chem.MolToSmiles(mol, canonical=False, isomericSmiles=True)
        # rotation_index is always in [0, n_atoms - 1], so no modulo is needed.
        rotation_index = random.randint(0, n_atoms - 1)
        atoms = list(range(n_atoms))
        new_atoms_order = atoms[rotation_index:] + atoms[:rotation_index]
        rotated_mol = Chem.RenumberAtoms(mol, new_atoms_order)
        return Chem.MolToSmiles(rotated_mol, canonical=False, isomericSmiles=True)
    raise ValueError("Type '{}' is not valid".format(random_type))

In [None]:
# Example molecule (caffeine) used to illustrate the randomization types.
example_smi = 'O=C1C2=C(N=CN2C)N(C(=O)N1C)C'
mol = Chem.MolFromSmiles(example_smi)
print(f"The canonical SMILES of this caffeine molecule is: {Chem.MolToSmiles(mol)}")
# Last expression of the cell: displays the molecule object.
mol

In [None]:
# 'rotated' is the default random_type: the SMILES starts from a different atom.
# Draw 500 samples, then show how many distinct strings came out and list them.
rotated_random_smiles = [randomize_smiles(example_smi) for _draw in range(500)]
print(len(set(rotated_random_smiles)))
set(rotated_random_smiles)

In [None]:
# 500 samples with the 'restricted' randomization; print the number of
# distinct strings and show five of them.
restricted_random_smiles = [randomize_smiles(example_smi, 'restricted') for _draw in range(500)]
print(len(set(restricted_random_smiles)))
list(set(restricted_random_smiles))[:5]

In [None]:
# 10000 samples with the 'unrestricted' randomization; print the number of
# distinct strings and show five of them.
unrestricted_random_smiles = [
    randomize_smiles(example_smi, random_type='unrestricted') for _draw in range(10000)
]
print(len(set(unrestricted_random_smiles)))
list(set(unrestricted_random_smiles))[:5]

In [None]:
# Sanity check: every randomized string must collapse back to one and the
# same SMILES once it is parsed and written out again by RDKit.
recanonicalised_smiles = {
    Chem.MolToSmiles(Chem.MolFromSmiles(random_smi))
    for random_smi in unrestricted_random_smiles
}
assert len(recanonicalised_smiles) == 1
recanonicalised_smiles

In [None]:
# Print five rotated variants of a multi-molecule precursor string.
# NOTE(review): `can_smiles` is not defined in this cell; it appears to be the
# loop variable left over from the earlier comparison loop over
# `can_precursors_val`. Run that cell first.
for i in range(5):
  print(randomize_smiles(can_smiles))

C1COCCO1.COc1cc2c3c(c1)c(=O)c(Cc1cccnc1)cn3-c1cc(Br)ccc1S2.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O
c1ncc(Cc2cn3c4c(cc(OC)cc4c2=O)Sc2c-3cc(Br)cc2)cc1.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O.C1COCCO1
C1(C#N)=C(C#N)C(=O)C(Cl)=C(Cl)C1=O.C1COCCO1.COc1cc2c3c(c1)c(=O)c(Cc1cccnc1)cn3-c1cc(Br)ccc1S2
c1cncc(Cc2cn3c4c(cc(OC)cc4c2=O)Sc2c-3cc(Br)cc2)c1.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O.C1COCCO1
n12c3c(cc(OC)cc3c(=O)c(Cc3cccnc3)c1)Sc1c-2cc(Br)cc1.N#CC1=C(C#N)C(=O)C(Cl)=C(Cl)C1=O.C1COCCO1


In [None]:
# we will include a rotated copy of all the training reactions

# randomize_smiles draws the rotation offset from Python's `random` module.
# Seed it here so that re-running the cell produces the same augmented set
# (the later shuffle is seeded as well).
random.seed(42)
rotated_train_precursors = [randomize_smiles(precursors) for precursors in tqdm(train_df.precursors)]

In [None]:
# The augmented frame pairs each rotated precursor string with the unchanged
# product of the same reaction.
rotated_train_df = pd.DataFrame({'precursors': rotated_train_precursors, 'products': products_train})
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# every index label would appear twice (once per source frame), which makes
# label-based lookups ambiguous.
total_train_df = pd.concat([train_df, rotated_train_df], ignore_index=True)
total_train_df.shape


In [None]:
# To be able to train a language model, we need to split the strings into tokens

# We take the regex pattern introduced in the Molecular Transformer paper
# (https://pubs.acs.org/doi/abs/10.1021/acscentsci.9b00576).
SMI_REGEX_PATTERN =  r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
# Compiled once here instead of on every tokenizer call.
SMI_REGEX = re.compile(SMI_REGEX_PATTERN)

def smiles_tokenizer(smiles):
  """Split a SMILES string into tokens and join them with single spaces.

  Tokens are the non-overlapping matches of SMI_REGEX_PATTERN, in order.
  Characters that the pattern does not match are silently left out of the
  result. An empty input gives an empty string.

  :param smiles: SMILES (or reaction SMILES) string to tokenize.
  :return: the tokens separated by single spaces.
  """
  return ' '.join(SMI_REGEX.findall(smiles))

In [None]:
# remember to shuffle your training data :)

# frac=1. samples every row, i.e. returns the whole frame in a new random
# order; random_state=42 fixes that order.
shuffled_total_train_df = total_train_df.sample(frac=1., random_state=42)