In [2]:
import joblib, argparse, uuid, sigopt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs

from sklearn import preprocessing
from utils.sklearn_utils import *
from utils.selfies_util import *

import selfies as sf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,regularizers

import seaborn as sns

# Switch every GPU that TensorFlow can see to on-demand memory growth.
# A RuntimeError raised while configuring is printed instead of aborting the
# notebook. Iterating an empty device list is a no-op, so no guard is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

In [1]:
#from spektral.layers import GraphAttention, GlobalAttentionPool
from spektral.data import BatchLoader, Graph, Dataset, Loader, utils, DisjointLoader, MixedLoader, SingleLoader
from spektral.utils import label_to_one_hot, load_sdf, load_csv
from spektral.layers.ops import sp_matrix_to_sp_tensor

from spektral.datasets import QM9
from spektral.data import Dataset, Graph
from spektral.utils import label_to_one_hot, sparse
from spektral.layers import AGNNConv, ECCConv, GlobalSumPool, GATConv, GeneralConv, GlobalAttentionPool
from spektral.models import GeneralGNN, GCN
import os
import numpy as np
import matplotlib.pyplot as plt 

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input



from tqdm import tqdm
from joblib import Parallel, delayed
from tensorflow.keras.utils import get_file 
from rdkit.Chem import PandasTools, SDMolSupplier, Descriptors
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale
from sklearn.metrics import r2_score

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Traceback (most recent call last):
  File "/home/santiagovargas/anaconda3/envs/tf_gpu/lib/python3.7/site-packages/rdkit/Chem/PandasTools.py", line 131, in <module>
    if 'display.width' in pd.core.config._registered_options:
AttributeError: module 'pandas.core' has no attribute 'config'


In [None]:
# Load the raw records. NOTE(review): `sdf` is not defined in this file —
# presumably it comes from one of the `utils` star imports; confirm.
# The same call is repeated inside `dataset.read`, which uses `ret` and `diff`.
names, ret, homo, homo1, diff = sdf()
# Atomic numbers (H, C, N, O, F, Cl, Br) used as the label set when
# `atom_to_feature` one-hot encodes `atom["atomic_num"]`.
ATOM_TYPES = [1, 6, 7, 8, 9, 17, 35]
# Bond-type codes used as the label set when `mol_to_adj` one-hot encodes
# each bond's `type`.
BOND_TYPES = [1, 2, 3]

def atom_to_feature(atom):
    """Build the feature vector of one parsed atom.

    The result is the one-hot encoding of ``atom["atomic_num"]`` over
    ``ATOM_TYPES``, followed by ``atom["coords"]`` and then the two scalars
    ``atom["charge"]`` and ``atom["iso"]``, concatenated along the last axis.

    :param atom: mapping with the keys ``atomic_num``, ``coords``, ``charge``
        and ``iso``.
    :return: the concatenated array produced by ``np.concatenate``.
    """
    element_one_hot = label_to_one_hot(atom["atomic_num"], ATOM_TYPES)
    position = atom["coords"]
    scalars = [atom["charge"], atom["iso"]]
    return np.concatenate((element_one_hot, position, scalars), -1)

def mol_to_adj(mol):
    """Build the adjacency and edge-feature matrices of a parsed molecule.

    Every entry of ``mol["bonds"]`` is stored twice, once per direction, with
    unit edge weight and with its ``type`` one-hot encoded over ``BOND_TYPES``.

    :param mol: mapping whose ``bonds`` entry is an iterable of mappings with
        the keys ``start_atom``, ``end_atom`` and ``type``.
    :return: the pair ``(a, e)`` returned by ``sparse.edge_index_to_matrix``.
    """
    sources, targets, bond_types = [], [], []
    for bond in mol["bonds"]:
        u, v = bond["start_atom"], bond["end_atom"]
        # Record the bond as u -> v and v -> u so both directions are present.
        sources.extend((u, v))
        targets.extend((v, u))
        bond_types.extend((bond["type"], bond["type"]))

    adjacency, edge_attrs = sparse.edge_index_to_matrix(
        edge_index=np.array((sources, targets)).T,
        edge_weight=np.ones_like(sources),
        edge_features=label_to_one_hot(bond_types, BOND_TYPES),
    )
    return adjacency, edge_attrs

def read_mol(mol):
    """Convert one parsed molecule into the triple ``(x, a, e)``.

    :param mol: parsed molecule mapping with ``atoms`` and ``bonds`` entries.
    :return: ``x`` is the array of per-atom feature vectors from
        ``atom_to_feature``; ``a`` and ``e`` come from ``mol_to_adj``.
    """
    node_features = np.array(list(map(atom_to_feature, mol["atoms"])))
    adjacency, edge_features = mol_to_adj(mol)
    return node_features, adjacency, edge_features



def parse_sdf(sdf):
    """Parse the text of a single SDF record into a dictionary.

    The record is split on newlines and handed to the block parsers of
    ``spektral.utils.io``.

    :param sdf: text of one SDF record (a single string).
    :return: dict with the keys ``name``, ``details``, ``comment``,
        ``n_atoms``, ``n_bonds``, ``atoms``, ``bonds``, ``properties`` and
        ``data``, each holding whatever the corresponding helper returns.
    """
    # The block parsers are underscore-prefixed helpers. A
    # `from spektral.utils.io import *` does not bind such names unless the
    # module lists them in `__all__`, so import them explicitly here to keep
    # this function usable on its own.
    from spektral.utils.io import (
        _parse_atoms_block,
        _parse_bonds_block,
        _parse_counts_line,
        _parse_data_fields,
        _parse_header,
        _parse_properties,
    )

    lines = sdf.split("\n")
    name, details, comment = _parse_header(lines)
    n_atoms, n_bonds = _parse_counts_line(lines)
    return {
        "name": name,
        "details": details,
        "comment": comment,
        "n_atoms": n_atoms,
        "n_bonds": n_bonds,
        "atoms": _parse_atoms_block(lines, n_atoms),
        "bonds": _parse_bonds_block(lines, n_atoms, n_bonds),
        "properties": _parse_properties(lines, n_atoms, n_bonds),
        "data": _parse_data_fields(lines),
    }

def parse_sdf_file(sdf_file, amount=None):
    """Parse every record of an open SDF file.

    The file content is split on the ``$$$$`` record terminator (followed by a
    newline); an empty trailing chunk left after the last terminator is dropped.

    :param sdf_file: readable text file object.
    :param amount: if not ``None``, only the first ``amount`` records are parsed.
    :return: list with one ``parse_sdf`` result per record.
    """
    records = sdf_file.read().split("$$$$\n")
    if records[-1] == "":
        records.pop()
    if amount is not None:
        records = records[:amount]
    # Parsed sequentially (the original author noted parallelism did not help).
    return [parse_sdf(record) for record in records]

from spektral.utils.io import *

def load_sdf(filename, amount=None):
    """
    Read an .sdf file from disk and parse its molecules.

    This definition replaces the ``load_sdf`` name imported from
    ``spektral.utils`` earlier in the notebook.

    :param filename: path of the SDF file to open
    :param amount: if given, parse only the first `amount` molecules
    :return: the list produced by `parse_sdf_file` for the opened file.
    """
    with open(filename) as handle:
        molecules = parse_sdf_file(handle, amount=amount)
    return molecules



class dataset(Dataset):
    """Spektral ``Dataset`` built from the records returned by ``sdf()``.

    Each graph holds the ``(x, a, e)`` arrays produced by ``read_mol`` and a
    standardised ``diff`` value as its target ``y``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):
        """Load, parse and featurise all molecules.

        :return: list of ``Graph`` objects, one per record returned by ``sdf()``.
        """
        names, records, homo, homo1, targets = sdf()

        # Standardise the regression target to zero mean and unit variance.
        target_mean = np.mean(targets)
        target_std = np.std(targets)
        targets_scaled = (targets - target_mean) / target_std

        molecules = [parse_sdf(record) for record in records]
        # Featurise molecules in parallel on all cores, with a progress bar.
        converted = Parallel(n_jobs=-1)(
            delayed(read_mol)(mol) for mol in tqdm(molecules, ncols=80)
        )
        x_list, a_list, e_list = zip(*converted)

        return [
            Graph(x=x, a=a, e=e, y=y)
            for x, a, e, y in zip(x_list, a_list, e_list, targets_scaled)
        ]

# Instantiate the molecule dataset used by the cells below.
# NOTE(review): the Spektral base constructor presumably calls `read()` here,
# which loads and featurises every molecule — confirm.
graph_dataset = dataset()

NameError: name 'data' is not defined

In [2]:
# NOTE: rebinding `dataset` to the instance shadows the `dataset` class defined
# in the previous cell; re-run that cell before instantiating the class again.
dataset = graph_dataset
################################################################################
# PARAMETERS
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 20  # Number of training epochs
batch_size = 1  # Batch size
n_out = 1  # Dimension of the target
################################################################################
# LOAD DATA
################################################################################
# Random 80/20 train/test split over graph indices.
# NOTE(review): no random seed is set, so the split differs on every run.
idxs = np.random.permutation(len(dataset))
split = int(0.8 * len(dataset))
idx_tr, idx_te = np.split(idxs, [split])
idx_tr = [int(i) for i in idx_tr]
idx_te = [int(i) for i in idx_te]
dataset_train = dataset[idx_tr]
dataset_test = dataset[idx_te]

# Disabled reference model (was kept in a bare string literal):
#
# F = dataset.n_node_features  # Dimension of node features
# S = dataset.n_edge_features  # Dimension of edge features
# n_out = dataset.n_labels     # Dimension of the target
#
# X_in = Input(shape=(None, F))
# A_in = Input(shape=(None, None))
# E_in = Input(shape=(None, None, S))
#
# X_1 = ECCConv(256, activation="relu")([X_in, A_in, E_in])
# X_2 = ECCConv(256, activation="relu")([X_1, A_in, E_in])
# X_3 = GlobalSumPool()(X_2)
# output = Dense(n_out)(X_3)
#
# model = Model(inputs=[X_in, A_in, E_in], outputs=output)
#
# encode = GeneralConv(256, dropout=0.2)([])
# encode = Dense(n_out)(encode)
#
# A bare `GeneralConv(256, dropout=0.2)()` call used to sit here; calling a
# layer without inputs always raises, so it was removed.

################################################################################
# LOADERS
################################################################################
# Step counts are whole batches (rounded up), not the float that `/` yields.
steps_all = int(np.ceil(len(dataset) / batch_size))
loader = BatchLoader(dataset, epochs=epochs, batch_size=batch_size)

# The original referenced the undefined names `dataset_tr` / `dataset_te`;
# the splits are called `dataset_train` / `dataset_test`.
steps_train = int(np.ceil(len(dataset_train) / batch_size))
loader_train = BatchLoader(dataset_train, epochs=epochs, batch_size=batch_size)

steps_test = int(np.ceil(len(dataset_test) / batch_size))
loader_test = BatchLoader(dataset_test, batch_size=batch_size)

# Kept for backward compatibility: the original cell's last assignment left
# `steps_per_epoch` bound to the test-set step count.
steps_per_epoch = steps_test

NameError: name 'graph_dataset' is not defined

In [3]:
# Take the feature dimensions from the dataset instead of hard-coding them.
# The previous literals (F = 8, S = 5) did not match the featurisation in this
# notebook: `atom_to_feature` emits 7 one-hot atom types plus coordinates plus
# 2 scalars, and bonds are one-hot encoded over the 3 entries of `BOND_TYPES`.
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = 1     # Dimension of the target

X_in = Input(shape=(None, F))
A_in = Input(shape=(None, None))
E_in = Input(shape=(None, None, S))

# Two edge-conditioned convolutions produce per-node embeddings.
X_1 = ECCConv(256, activation="relu")([X_in, A_in, E_in])
X_2 = ECCConv(256, activation="relu")([X_1, A_in, E_in])
# Targets are one value per graph, so pool the node embeddings into a single
# graph-level vector before the regression head (as the disabled reference
# model in the previous cell does).
X_3 = GlobalSumPool()(X_2)
output = Dense(n_out)(X_3)

model = Model(inputs=[X_in, A_in, E_in], outputs=output)
