In [1]:
! pip install selfies tensorflow_probability

Collecting selfies
  Using cached selfies-2.1.1-py3-none-any.whl (35 kB)
Collecting tensorflow_probability
  Downloading tensorflow_probability-0.19.0-py2.py3-none-any.whl (6.7 MB)
     ---------------------------------------- 6.7/6.7 MB 15.9 MB/s eta 0:00:00
Collecting dm-tree
  Downloading dm_tree-0.1.8-cp310-cp310-win_amd64.whl (101 kB)
     -------------------------------------- 101.3/101.3 kB 5.7 MB/s eta 0:00:00
Installing collected packages: dm-tree, tensorflow_probability, selfies
Successfully installed dm-tree-0.1.8 selfies-2.1.1 tensorflow_probability-0.19.0


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

import deepchem as dc
from deepchem.models.normalizing_flows import NormalizingFlow, NormalizingFlowModel
from deepchem.models.optimizers import Adam
from deepchem.data import NumpyDataset
from deepchem.splits import RandomSplitter
from deepchem.molnet import load_qm9

import selfies as sf

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw

from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp

# Short aliases for the TensorFlow Probability / Keras namespaces.
# NOTE(review): `tdf` looks like a transposition of the conventional `tfd`;
# the name is kept because other cells may reference it.
tdf, tfb = tfp.distributions, tfp.bijectors
tfk = tf.keras

# Switch the Keras backend's default float type to double precision.
tf.keras.backend.set_floatx('float64')

In [5]:
# Load QM9 through the MoleculeNet loader using the ECFP featurizer.
tasks, datasets, transformers = load_qm9(featurizer="ECFP")

# Only the ids of the first returned dataset are kept; they are stored under
# the "smiles" column (presumably the training split -- confirm against
# load_qm9's return order).
df = pd.DataFrame({"smiles": datasets[0].ids})

df.head()

Unnamed: 0,smiles
0,[H]O[C@@]1(C([H])([H])[H])C(=O)N(C([H])([H])[H...
1,[H]C([H])([H])C([H])([H])C([H])([H])[C@@]1([H]...
2,[H]C1=C([H])[C@]2([H])[N@H+]3C([H])([H])[C@]2(...
3,[H]C(=O)C([H])([H])[C@]1([H])[N@H+](C2([H])C([...
4,[H]OC([H])([H])C1([C@@]2([H])C([H])([H])[C@@]2...


In [6]:
# Start from the library defaults so re-running this cell is idempotent.
sf.set_semantic_constraints()

# Take the current constraint table and override the "?" entry with 3
# (presumably the fallback bond capacity for atoms not listed explicitly --
# confirm against the selfies documentation).
constraints = {**sf.get_semantic_constraints(), "?": 3}
sf.set_semantic_constraints(constraints)

constraints

{'H': 1,
 'F': 1,
 'Cl': 1,
 'Br': 1,
 'I': 1,
 'B': 3,
 'B+1': 2,
 'B-1': 4,
 'O': 2,
 'O+1': 3,
 'O-1': 1,
 'N': 3,
 'N+1': 4,
 'N-1': 2,
 'C': 4,
 'C+1': 5,
 'C-1': 3,
 'P': 5,
 'P+1': 6,
 'P-1': 4,
 'S': 6,
 'S+1': 7,
 'S-1': 5,
 '?': 3}

In [7]:
def process_smiles(smiles):
    """Convert a SMILES string to SELFIES.

    Thin wrapper that forwards ``smiles`` to ``sf.encoder`` and returns
    whatever the encoder returns.
    """
    selfies_string = sf.encoder(smiles)
    return selfies_string

def keys_int(symbol_to_int):
    """Build an index -> key lookup from a mapping.

    The i-th key of ``symbol_to_int`` (in iteration order) is stored under the
    integer ``i``. The mapping's values are never consulted, so the result only
    inverts ``symbol_to_int`` when its values are already 0..n-1 in order.

    Returns:
        dict: ``{0: first_key, 1: second_key, ...}``; empty for an empty mapping.
    """
    return dict(enumerate(symbol_to_int.keys()))

# SELFIES encoding of every SMILES string.
df["selfies"] = df["smiles"].apply(process_smiles)

# Character count of the SMILES string (note: not of the SELFIES string).
df["len"] = df["smiles"].apply(len)

# Preview the rows with the shortest SMILES strings.
df.sort_values("len").head()

Unnamed: 0,smiles,selfies,len
13309,[H]C#N,[H][C][#N],6
18573,[H]O[H],[H][O][H],7
53523,n1nnon1,[N][N][=N][O][N][=Ring1][Branch1],7
5210,[H]C#C[H],[H][C][#C][H],9
58883,N#CC#CC#N,[N][#C][C][#C][C][#N],9
