# MolGAN: An implicit generative model for small molecular graphs

ABSTRACT: Deep generative models for graph-structured data
offer a new angle on the problem of chemical
synthesis: by optimizing differentiable models
that directly generate molecular graphs, it is possible to side-step expensive search procedures in
the discrete and vast space of chemical structures.
We introduce MolGAN, an implicit, likelihood-free generative model for small molecular graphs
that circumvents the need for expensive graph
matching procedures or node ordering heuristics of previous likelihood-based methods. Our
method adapts generative adversarial networks
(GANs) to operate directly on graph-structured
data. We combine our approach with a reinforcement learning objective to encourage the generation of molecules with specific desired chemical
properties. In experiments on the QM9 chemical database, we demonstrate that our model is
capable of generating close to 100% valid compounds. MolGAN compares favorably both to
recent proposals that use string-based (SMILES)
representations of molecules and to a likelihood-based method that directly generates graphs, albeit being susceptible to mode collapse.

Link to paper: https://arxiv.org/pdf/1805.11973.pdf

Credit: https://github.com/nicola-decao/MolGAN

Google Colab: https://colab.research.google.com/drive/1ERjtWDQa5lEF8a_JY-gvXAvCn7h09lat?usp=sharing

In [1]:
# Clone the repository and cd into directory 
!git clone https://github.com/nicola-decao/MolGAN.git
%cd MolGAN

/content/MolGAN


In [None]:
# Download the dataset used in the paper
!./data/download_dataset.sh

In [7]:
# Move files into the data folder
!mv NP_score.pkl.gz data/
!mv SA_score.pkl.gz data/

!mv gdb9.sdf data/
!mv gdb9.sdf.csv data/ 

In [None]:
# Convert the dataset into the graph format used by the MolGAN models
!python ./utils/sparse_molecular_dataset.py

In [None]:
# Install RDKit
!pip install rdkit-pypi==2021.3.1.5

In [2]:
import tensorflow as tf

from utils.sparse_molecular_dataset import SparseMolecularDataset
from utils.trainer import Trainer
from utils.utils import *

from models.gan import GraphGANModel
from models import encoder_rgcn, decoder_adj, decoder_dot, decoder_rnn

from optimizers.gan import GraphGANOptimizer

In [3]:
# --- Optimisation settings ---
batch_dim = 128   # batch size requested from data.next_train_batch
la = 1            # fed to optimizer.la; the reward branches only run when la < 1
dropout = 0       # fed to model.dropout_rate during training
n_critic = 5      # train_step_G runs on every n_critic-th iteration, train_step_D otherwise
z_dim = 8         # passed to GraphGANModel as its fourth positional argument
epochs = 10       # forwarded to trainer.train
save_every = 1    # May lead to errors if left as None

# --- Reward / evaluation settings ---
metric = 'validity,sas'   # comma-separated metric names consumed by reward()
n_samples = 5000          # number of latent vectors sampled in _eval_update / _test_update

# --- Dataset ---
data = SparseMolecularDataset()
data.load('data/gdb9_9nodes.sparsedataset')

# Training iterations per epoch: whole batches only.
steps = len(data) // batch_dim

In [4]:
def train_fetch_dict(i, steps, epoch, epochs, min_epochs, model, optimizer):
    """Pick the optimizer ops to run at training iteration ``i``.

    Iterations that are not a multiple of the global ``n_critic`` run only
    ``optimizer.train_step_D``. Every ``n_critic``-th iteration runs
    ``optimizer.train_step_G`` and, when the global ``la`` is below 1, also
    ``optimizer.train_step_V``.

    Only ``i`` and ``optimizer`` are used; the other parameters are part of
    the callback signature.

    Returns:
        A list of ops, in the order they should be fetched.
    """
    if i % n_critic != 0:
        return [optimizer.train_step_D]

    ops = [optimizer.train_step_G]
    if la < 1:
        ops.append(optimizer.train_step_V)
    return ops

def train_feed_dict(i, steps, epoch, epochs, min_epochs, model, optimizer, batch_dim):
    """Build the feed dict for training iteration ``i``.

    A training batch is drawn from the global ``data`` and a batch of latent
    vectors from ``model.sample_z``. When the global ``la`` is below 1 and
    ``i`` is a multiple of ``n_critic``, rewards are also computed for the
    batch molecules and for molecules decoded from the model's current
    output, and both are fed as ``model.rewardR`` / ``model.rewardF``.

    ``optimizer.la`` is fed ``la`` only when ``la < 1`` and ``epoch > 0``;
    in every other case it is fed ``1.0``.

    Returns:
        dict mapping model/optimizer placeholders to their values.
    """
    real_mols, _, _, adjacency, node_labels, _, _, _, _ = data.next_train_batch(batch_dim)
    z = model.sample_z(batch_dim)

    feed = {model.edges_labels: adjacency,
            model.nodes_labels: node_labels,
            model.embeddings: z}

    rl_enabled = la < 1
    if rl_enabled and i % n_critic == 0:
        reward_real = reward(real_mols)

        # Decode a batch of molecules from the model with training disabled.
        nodes_out, edges_out = session.run(
            [model.nodes_gumbel_argmax, model.edges_gumbel_argmax],
            feed_dict={model.training: False, model.embeddings: z})
        node_ids = np.argmax(nodes_out, axis=-1)
        edge_ids = np.argmax(edges_out, axis=-1)
        fake_mols = [data.matrices2mol(nodes, edges, strict=True)
                     for nodes, edges in zip(node_ids, edge_ids)]

        feed[model.rewardR] = reward_real
        feed[model.rewardF] = reward(fake_mols)

    feed[model.training] = True
    feed[model.dropout_rate] = dropout
    feed[optimizer.la] = la if (rl_enabled and epoch > 0) else 1.0

    return feed

In [5]:
def eval_fetch_dict(i, epochs, min_epochs, model, optimizer):
    """Return the named optimizer tensors to fetch during validation.

    Only ``optimizer`` is used; the other parameters are part of the
    callback signature.

    Returns:
        dict mapping a display label to the matching optimizer attribute.
    """
    labelled_attributes = (
        ('loss D', 'loss_D'),
        ('loss G', 'loss_G'),
        ('loss RL', 'loss_RL'),
        ('loss V', 'loss_V'),
        ('la', 'la'),
    )
    fetches = {}
    for label, attribute in labelled_attributes:
        fetches[label] = getattr(optimizer, attribute)
    return fetches

def eval_feed_dict(i, epochs, min_epochs, model, optimizer, batch_dim):
    """Build the feed dict used to evaluate the losses on the validation batch.

    The validation batch is taken from the global ``data``. One latent vector
    is sampled per row of the batch's edge-label array, molecules are decoded
    from the model output, and rewards are computed for both the batch
    molecules and the decoded ones.

    Only ``model`` is used among the parameters; the rest are part of the
    callback signature.

    Returns:
        dict mapping model placeholders to their values, with
        ``model.training`` set to ``False``.
    """
    real_mols, _, _, adjacency, node_labels, _, _, _, _ = data.next_validation_batch()
    z = model.sample_z(adjacency.shape[0])

    reward_real = reward(real_mols)

    # Run the model with training disabled and turn its outputs into class indices.
    outputs = session.run(
        [model.nodes_gumbel_argmax, model.edges_gumbel_argmax],
        feed_dict={model.training: False, model.embeddings: z})
    node_ids, edge_ids = (np.argmax(out, axis=-1) for out in outputs)

    fake_mols = [data.matrices2mol(nodes, edges, strict=True)
                 for nodes, edges in zip(node_ids, edge_ids)]
    reward_fake = reward(fake_mols)

    return {model.edges_labels: adjacency,
            model.nodes_labels: node_labels,
            model.embeddings: z,
            model.rewardR: reward_real,
            model.rewardF: reward_fake,
            model.training: False}

In [6]:
def test_fetch_dict(model, optimizer):
    return {'loss D': optimizer.loss_D, 'loss G': optimizer.loss_G,
            'loss RL': optimizer.loss_RL, 'loss V': optimizer.loss_V,
            'la': optimizer.la}

def test_feed_dict(model, optimizer, batch_dim):
    """Build the feed dict used to evaluate the losses on the test batch.

    The test batch is taken from the global ``data``. One latent vector is
    sampled per row of the batch's edge-label array, molecules are decoded
    from the model output, and rewards are computed for both the batch
    molecules and the decoded ones.

    ``optimizer`` and ``batch_dim`` are unused; they are part of the callback
    signature.

    Returns:
        dict mapping model placeholders to their values, with
        ``model.training`` set to ``False``.
    """
    batch = data.next_test_batch()
    real_mols, _, _, adjacency, node_labels, _, _, _, _ = batch
    latent = model.sample_z(adjacency.shape[0])

    reward_real = reward(real_mols)

    # Run the model with training disabled.
    sampling_feed = {model.training: False, model.embeddings: latent}
    node_out, edge_out = session.run(
        [model.nodes_gumbel_argmax, model.edges_gumbel_argmax],
        feed_dict=sampling_feed)
    node_ids = np.argmax(node_out, axis=-1)
    edge_ids = np.argmax(edge_out, axis=-1)

    generated = []
    for nodes, edges in zip(node_ids, edge_ids):
        generated.append(data.matrices2mol(nodes, edges, strict=True))
    reward_fake = reward(generated)

    feed = {model.edges_labels: adjacency,
            model.nodes_labels: node_labels,
            model.embeddings: latent,
            model.rewardR: reward_real,
            model.rewardF: reward_fake,
            model.training: False}
    return feed

In [7]:
def reward(mols):
    """Compute the per-molecule reward as a product of the selected metrics.

    The metrics are named by the global ``metric`` string (comma-separated);
    the special value ``'all'`` expands to ``'logp,sas,qed,unique'``. Each
    named score is looked up on ``MolecularMetrics`` and multiplied into a
    running product.

    Args:
        mols: the molecules to score, passed unchanged to each scorer.

    Returns:
        The product of the scores, reshaped to a single column.

    Raises:
        RuntimeError: if a metric name is not one of the supported names.
    """
    # Scorers are wrapped in lambdas so only the requested ones are evaluated.
    scorers = {
        'np': lambda: MolecularMetrics.natural_product_scores(mols, norm=True),
        'logp': lambda: MolecularMetrics.water_octanol_partition_coefficient_scores(mols, norm=True),
        'sas': lambda: MolecularMetrics.synthetic_accessibility_score_scores(mols, norm=True),
        'qed': lambda: MolecularMetrics.quantitative_estimation_druglikeness_scores(mols, norm=True),
        'novelty': lambda: MolecularMetrics.novel_scores(mols, data),
        'dc': lambda: MolecularMetrics.drugcandidate_scores(mols, data),
        'unique': lambda: MolecularMetrics.unique_scores(mols),
        'diversity': lambda: MolecularMetrics.diversity_scores(mols, data),
        'validity': lambda: MolecularMetrics.valid_scores(mols),
    }

    selected = 'logp,sas,qed,unique' if metric == 'all' else metric

    product = 1.
    for name in selected.split(','):
        if name not in scorers:
            raise RuntimeError('{} is not defined as a metric'.format(name))
        product *= scorers[name]()

    return product.reshape(-1, 1)

In [8]:
def _nonzero_mean(values):
    """Return the mean of the non-zero entries of ``values``, or NaN if none.

    Taking ``mean()`` of an empty selection already yields NaN, but it also
    makes NumPy emit a RuntimeWarning. The empty case is handled explicitly
    so the result is the same without the warning.
    """
    scores = np.array(values)
    kept = scores[np.nonzero(scores)]
    return kept.mean() if kept.size else np.nan


def _sampled_metrics(model):
    """Sample ``n_samples`` molecules from ``model`` and summarise their scores.

    ``all_scores`` returns two dicts. Each value of the first is reduced to
    the mean of its non-zero entries; the second is merged in unchanged.
    """
    mols = samples(data, model, session, model.sample_z(n_samples), sample=True)
    per_molecule, aggregate = all_scores(mols, data, norm=True)
    summary = {name: _nonzero_mean(values) for name, values in per_molecule.items()}
    summary.update(aggregate)
    return summary


def _eval_update(i, epochs, min_epochs, model, optimizer, batch_dim, eval_batch):
    """Return extra validation metrics computed on freshly sampled molecules.

    Only ``model`` is used; the other parameters are part of the callback
    signature.

    Returns:
        dict mapping metric names to values. A metric is NaN when none of
        the sampled molecules has a non-zero score for it.
    """
    return _sampled_metrics(model)


def _test_update(model, optimizer, batch_dim, test_batch):
    """Return extra test metrics computed on freshly sampled molecules.

    Only ``model`` is used; the other parameters are part of the callback
    signature.

    Returns:
        dict mapping metric names to values. A metric is NaN when none of
        the sampled molecules has a non-zero score for it.
    """
    return _sampled_metrics(model)

In [None]:
# Roll back to older version of TensorFlow
!pip install tensorflow==1.15

In [10]:
# Model: sized from the dataset, with decoder_adj as the decoder and
# encoder_rgcn as the discriminator.
model = GraphGANModel(
    data.vertexes,
    data.bond_num_types,
    data.atom_num_types,
    z_dim,
    decoder_units=(128, 256, 512),
    discriminator_units=((128, 64), 128, (128, 64)),
    decoder=decoder_adj,
    discriminator=encoder_rgcn,
    soft_gumbel_softmax=False,
    hard_gumbel_softmax=False,
    batch_discriminator=False,
)

# Optimizer wrapping the model's losses.
optimizer = GraphGANOptimizer(model, learning_rate=1e-3, feature_matching=False)

# TensorFlow 1.x session; all variables are initialised before training.
session = tf.Session()
session.run(tf.global_variables_initializer())

# Trainer that drives the training loop for this model/optimizer/session.
trainer = Trainer(model, optimizer, session)

# Report the total number of trainable parameters.
trainable_values = session.run(tf.trainable_variables())
n_parameters = np.sum([np.prod(value.shape) for value in trainable_values])
print('Parameters: {}'.format(n_parameters))




Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.dropout instead.







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Parameters: 575556


In [14]:
# Create a saved_models folder
!mkdir MolGAN/saved_models

In [17]:
# Collect the training configuration and the notebook-level callbacks, then
# launch training.
train_kwargs = {
    'batch_dim': batch_dim,
    'epochs': epochs,
    'steps': steps,
    'train_fetch_dict': train_fetch_dict,
    'train_feed_dict': train_feed_dict,
    'eval_fetch_dict': eval_fetch_dict,
    'eval_feed_dict': eval_feed_dict,
    'test_fetch_dict': test_fetch_dict,
    'test_feed_dict': test_feed_dict,
    'save_every': save_every,
    # Users must create this folder first and then pass it here as the
    # location where the model is saved.
    'directory': 'saved_models',
    '_eval_update': _eval_update,
    '_test_update': _test_update,
}
trainer.train(**train_kwargs)

2021-05-18 23:08:28 Epochs          0/10 in 0:00:00 (last epoch in 0:00:00), ETA: -:--:-


  return MolecularMetrics.novel_scores(MolecularMetrics.valid_filter(mols), data).mean()
  ret = ret.dtype.type(ret / rcount)
  after removing the cwd from sys.path.


2021-05-18 23:09:02 Validation --> {'NP score': nan,
 'QED score': nan,
 'SA score': nan,
 'diversity score': nan,
 'drugcandidate score': 0.11783951472333935,
 'la': 1.0,
 'logP score': nan,
 'loss D': -1.1388904,
 'loss G': 1.1496494,
 'loss RL': -0.8075736,
 'loss V': 0.75656223,
 'novel score': nan,
 'unique score': 0,
 'valid score': 0.0}
2021-05-18 23:09:04 Model saved in saved_models!
 100% (1040/1040) [################################################################################] ETA: 0:00:00
2021-05-18 23:09:21 Epochs          1/10 in 0:00:52 (last epoch in 0:00:17), ETA: 0:07:52
2021-05-18 23:09:56 Validation --> {'NP score': 0.8877842135154173,
 'QED score': 0.563191330854385,
 'SA score': 0.4661472380122662,
 'diversity score': 0.7769213300011615,
 'drugcandidate score': 0.13984802858869488,
 'la': 1.0,
 'logP score': 0.3655359621306015,
 'loss D': -66.68398,
 'loss G': 30.091413,
 'loss RL': -0.46301308,
 'loss V': 0.31661555,
 'novel score': 100.0,
 'unique score': 23.