In [1]:
import os
import sys
if os.path.abspath('../src') not in sys.path:
    sys.path.append(os.path.abspath('../src'))
import pandas as pd

In [2]:
import torch
import pickle
import numpy as np
import matplotlib.pyplot as plt
from model import GNN
from easydict import EasyDict
import time
from tqdm.auto import tqdm
from collections import defaultdict
import pysmiles
import pandas as pd
import dgl
from dgl.dataloading import GraphDataLoader
from property_pred.pp_data_processing import PropertyPredDataset
from data_processing import SmilesDataset, preprocess, get_feature_encoder, networkx_to_dgl

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# args = EasyDict({'pretrained_model': 'tag_1024', 'batch_size': 1024, 'gpu': '0', 'dataset': 'BBBP'})
# data = PropertyPredDataset(args)
args = EasyDict({'pretrained_model': 'tag_1024', 'batch_size': 1024, 'gpu': '0', 'dataset': 'USPTO-479k'})

In [22]:
# feature_encoder, train_graphs, valid_graphs, test_graphs = preprocess(args.dataset)
# train_dataset = SmilesDataset(args, 'train', feature_encoder, train_graphs)

FileNotFoundError: [Errno 2] No such file or directory: '../data/BBBP/cache/feature_encoder.pkl'

In [None]:

path = '../data/' + args.dataset + '/cache/feature_encoder.pkl'
with open(path, 'rb') as f:
    feature_encoder = pickle.load(f)
train_dataset = SmilesDataset(args, 'train')

In [None]:
path = '../saved/' + args.pretrained_model + '/'
print('loading hyperparameters of pretrained model from ' + path + 'hparams.pkl')
with open(path + 'hparams.pkl', 'rb') as f:
    hparams = pickle.load(f)

In [None]:
print('loading pretrained model from ' + path + 'model.pt')
mole = GNN(hparams['gnn'], hparams['layer'], hparams['feature_len'], hparams['dim'])
if torch.cuda.is_available():
    mole.load_state_dict(torch.load(path + 'model.pt', map_location=torch.device('cpu'), weights_only=True))
    mole = mole.cuda('cuda:'+args.gpu)
else:
    mole.load_state_dict(torch.load(path + 'model.pt', map_location=torch.device('cpu'), weights_only=True))

In [13]:
def time_vs_batch_size(data, model, batch_sizes: list[int] = [1024]) -> pd.DataFrame:
    mean_times = []
    for batch_size in tqdm(batch_sizes, leave=False, desc="batch"):
        dataloader = GraphDataLoader(data, batch_size=batch_size)
        with torch.no_grad():
            model.eval()
            times = []
            for graphs_batch, _ in tqdm(dataloader):
                start_time = time.time()
                _ = model(graphs_batch)
                end_time = time.time()
                times.append(end_time - start_time)
            mean_time = np.mean(times)
        mean_times.append(mean_time)
    return pd.DataFrame({'time': mean_times, 'batch_size': batch_sizes})

In [None]:
time_vs_batch = time_vs_batch_size([128, 256, 512, 1024, 2048, 4096, 8192, 16384])

In [8]:
time_vs_batch['time_per_mol'] = time_vs_batch.time / time_vs_batch.batch_size

In [None]:
fix, ax = plt.subplots(1, 1, figsize=(7, 4))
ax.plot(time_vs_batch.batch_size, time_vs_batch.time_per_mol)
ax.set_xlabel('batch_size')
ax.set_xscale('log')
ax.set_ylabel('time per molecule, s')

time_vs_batch['time_per_mol'].tail(1)*1000000000

In [11]:
from pymilvus import MilvusClient
client = MilvusClient("./milvus_demo.db")

In [12]:
client.create_collection(
    collection_name="chembl",
    dimension=1024  # The vectors we will use in this demo has 384 dimensions
)

In [9]:
DATA_PATH = '../../data/wee2/all.csv'
raw_data = pd.read_csv(DATA_PATH)[:10000]

In [10]:
def smiles_to_graphs(smiles_list):
    attribute_names = ['element', 'charge', 'aromatic', 'hcount']

    # saving all possible values of each attribute (only for training data)
    all_values = defaultdict(set)
    graphs = []

    for smiles in smiles_list:
        # pysmiles.read_smiles() will raise a ValueError: "The atom [se] is malformatted" on USPTO-479k dataset.
        # This is because "Se" is in a aromatic ring, so in USPTO-479k, "Se" is transformed to "se" to satisfy
        # SMILES rules. But pysmiles does not treat "se" as a valid atom and raise a ValueError. To handle this
        # case, I transform all "se" to "Se" in USPTO-479k.
        smiles = smiles.replace('[se]', '[Se]')

        # use pysmiles.read_smiles() to parse SMILES and get graph objects (in networkx format)
        graph = pysmiles.read_smiles(smiles, zero_order_bonds=False)

        for attr in attribute_names:
            for _, value in graph.nodes(data=attr):
                all_values[attr].add(value)

        graphs.append(graph)

    return all_values, graphs

all_values, graphs = smiles_to_graphs(raw_data['smiles'].to_list())
feature_encoder = get_feature_encoder(all_values)

class InMemoryDataset(dgl.data.DGLDataset):
    def __init__(self, feature_encoder, raw_graphs, path='.', gpu=0):
        self.feature_encoder = feature_encoder
        self.raw_graphs = raw_graphs
        self.path = path
        self.gpu = gpu
        self.graphs = []
        self.product_graphs = []
        super().__init__(name='Smiles_data')

    def to_gpu(self):
        if torch.cuda.is_available():
            print('moving data to GPU')
            self.graphs = [graph.to('cuda:' + str(self.gpu)) for graph in self.graphs]

    def save(self):
        print('saving data to ' + self.path + '/' + 'graphs.bin')
        dgl.save_graphs(self.path + 'graphs.bin', self.graphs)

    def load(self):
        print('loading graphs from ' + self.path + '/' + 'graphs.bin')
        # graphs loaded from disk will have a default empty label set: [graphs, labels], so we only take the first item
        self.graphs = dgl.load_graphs(self.path + '/' + 'graphs.bin')[0]
        self.to_gpu()

    def process(self):
        print('transforming data from networkx graphs to DGL graphs')
        for i, raw_graph in enumerate(self.raw_graphs):
            if i % 10000 == 0:
                print('%dk' % (i // 1000))
            # transform networkx graphs to dgl graphs
            graph = networkx_to_dgl(raw_graph, self.feature_encoder)
            self.graphs.append(graph)
        self.to_gpu()

    def has_cache(self):
        return os.path.exists(self.path + '/' + 'graphs.bin')

    def __getitem__(self, i):
        return self.graphs[i]

    def __len__(self):
        return len(self.graphs)

data = InMemoryDataset(feature_encoder, graphs)


Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical informa

transforming data from networkx graphs to DGL graphs
0k
moving data to GPU
saving data to ./graphs.bin


In [11]:
path = '../saved/' + 'tag_1024' + '/'
print('loading hyperparameters of pretrained model from ' + path + 'hparams.pkl')
with open(path + 'hparams.pkl', 'rb') as f:
    hparams = pickle.load(f)
print('loading pretrained model from ' + path + 'model.pt')

model = GNN(hparams['gnn'], hparams['layer'], hparams['feature_len'], hparams['dim'])
if torch.cuda.is_available():
    model.load_state_dict(torch.load(path + 'model.pt', map_location=torch.device('cpu'), weights_only=True))
    model = model.cuda('cuda:'+ '0')
else:
    model.load_state_dict(torch.load(path + 'model.pt', map_location=torch.device('cpu'), weights_only=True))

loading hyperparameters of pretrained model from ../saved/tag_1024/hparams.pkl
loading pretrained model from ../saved/tag_1024/model.pt


In [14]:
results = time_vs_batch_size(data, model, batch_sizes = [1024, 2048])

  0%|          | 0/10 [00:00<?, ?it/s]?it/s]
                                            

DGLError: Invalid key "0". Must be one of the edge types.

defaultdict(set,
            {'element': {'C', 'Cl', 'F', 'N', 'O', 'S'},
             'charge': {0},
             'aromatic': {False, True},
             'hcount': {0, 1, 2, 3}})