In [1]:
import deepchem as dc
import torch
import sklearn
import sklearn.ensemble

In [2]:
loader = dc.data.CSVLoader(["JCHEM_PKA"], feature_field="SMILES", featurizer=dc.feat.CircularFingerprint(1024))
dataset = loader.create_dataset("dwar_pka1.csv")

RDKit ERROR: [15:01:26] Explicit valence for atom # 2 O, 3, is greater than permitted
Failed to featurize datapoint 119, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [15:01:28] Explicit valence for atom # 0 N, 4, is greater than permitted
Failed to featurize datapoint 413, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [15:01:36] Explicit valence for atom # 13 Cl, 5, is greater than permitted
Failed to featurize datapoint 1697, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.r

In [93]:
splitter = dc.splits.ScaffoldSplitter()
(train, valid, test) = splitter.train_valid_test_split(dataset=dataset)
print(train)

<DiskDataset X.shape: (3601, 2048), y.shape: (3601, 1), w.shape: (3601, 1), task_names: ['JCHEM_PKA']>


In [71]:
##Linear net with circular fingerprints
linear_net = torch.nn.Sequential(
    torch.nn.Linear(2048, 2048),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.25),
    torch.nn.Linear(2048, 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.25),
    torch.nn.Linear(512, 1),
)

model1 = dc.models.TorchModel(linear_net, dc.models.losses.L2Loss())

model1.fit(train, nb_epoch=50)

metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', model1.evaluate(train, [metric]))
print('test set score:', model1.evaluate(test, [metric]))

2.363328857421875

In [72]:
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', model1.evaluate(train, [metric]))
print('test set score:', model1.evaluate(test, [metric]))

training set score: {'mean_squared_error': 2.290657878126272}
test set score: {'mean_squared_error': 39.433958518683355}


In [75]:
##Random forest with circular fingerprints
sklearn_model = sklearn.ensemble.RandomForestRegressor(n)
rf_model = dc.models.SklearnModel(sklearn_model)

rf_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', rf_model.evaluate(train, [metric]))
print('test set score:', rf_model.evaluate(test, [metric]))

training set score: {'mean_squared_error': 4.424217812515738}
test set score: {'mean_squared_error': 27.612937567321644}


In [100]:
##Random forest with circular fingerprints
sklearn_model = sklearn.ensemble.RandomForestRegressor(n_estimators=500)
rf_model = dc.models.SklearnModel(sklearn_model)

rf_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', rf_model.evaluate(train, [metric]))
print('test set score:', rf_model.evaluate(test, [metric]))

training set score: {'mean_squared_error': 4.3255389387971075}
test set score: {'mean_squared_error': 27.383245225154788}


In [99]:
##Gradient boost with circular fingerprints
sklearn_model = sklearn.ensemble.GradientBoostingRegressor(n_estimators=1000)
gboost_model = dc.models.SklearnModel(sklearn_model)

gboost_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', gboost_model.evaluate(train, [metric]))
print('test set score:', gboost_model.evaluate(test, [metric]))

training set score: {'mean_squared_error': 8.585048931240335}
test set score: {'mean_squared_error': 62.47056628527392}


In [79]:
## mol2vec
loader = dc.data.CSVLoader(["JCHEM_PKA"], feature_field="SMILES", featurizer=dc.feat.Mol2VecFingerprint)
dataset = loader.create_dataset("dwar_pka1.csv")

splitter = dc.splits.RandomSplitter()
(train, valid, test) = splitter.train_test_split(dataset=dataset)
print(train)

sklearn_model = sklearn.ensemble.RandomForestRegressor()
rf_model = dc.models.SklearnModel(sklearn_model)

rf_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', rf_model.evaluate(train, [metric]))
print('test set score:', rf_model.evaluate(test, [metric]))

AttributeError: 'Series' object has no attribute 'endswith'

In [85]:
## MPNN
loader = dc.data.CSVLoader(["JCHEM_PKA"], feature_field="SMILES", featurizer=dc.feat.WeaveFeaturizer())
dataset = loader.create_dataset("dwar_pka1.csv")

splitter = dc.splits.RandomSplitter()
(train, test) = splitter.train_test_split(dataset=dataset)

RDKit ERROR: [12:15:05] Explicit valence for atom # 2 O, 3, is greater than permitted
Failed to featurize datapoint 119, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [12:15:09] Explicit valence for atom # 0 N, 4, is greater than permitted
Failed to featurize datapoint 413, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [12:19:16] Explicit valence for atom # 13 Cl, 5, is greater than permitted
Failed to featurize datapoint 1697, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.r

In [88]:
print(train)

<DiskDataset X.shape: (3601,), y.shape: (3601, 1), w.shape: (3601, 1), task_names: ['JCHEM_PKA']>


In [89]:
mpnn_model = dc.models.MPNNModel(n_tasks=1, n_pair_feat=14, n_atom_feat=75, n_hidden=75,
                                T=1, M=1)

mpnn_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', mpnn_model.evaluate(train, [metric]))
print('test set score:', mpnn_model.evaluate(test, [metric]))

ValueError: cannot reshape array of size 16646 into shape (1849,14)

In [84]:
## chemception
loader = dc.data.CSVLoader(["JCHEM_PKA"], feature_field="SMILES", featurizer=dc.feat.SmilesToImage())
dataset = loader.create_dataset("dwar_pka1.csv")

splitter = dc.splits.RandomSplitter()
(train, test) = splitter.train_test_split(dataset=dataset)

chemception_model = dc.models.ChemCeption(n_tasks=1)

chemception_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', chemception_model.evaluate(train, [metric]))
print('test set score:', chemception_model.evaluate(test, [metric]))

RDKit ERROR: [11:59:39] Explicit valence for atom # 2 O, 3, is greater than permitted
Failed to featurize datapoint 119, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 207, C[C@@H]1NC(=O)[C@@H](N)CNC(=O)[C@H]([C@H]2CCN=C(N)N2)NC(=O)/C(=C\NC(N)=O)NC(=O)[C@H](CNC(=O)C[C@@H](N)CCCN)NC1=O.NCCC[C@H](N)CC(=O)NC[C@@H]1NC(=O)[C@H](CO)NC(=O)[C@@H](N)CNC(=O)[C@H]([C@H]2CCN=C(N)N2)NC(=O)/C(=C\NC(N)=O)NC1=O. Appending empty array
Exception message: index 82 is out of bounds for axis 1 with size 80
Failed to featurize datapoint 407, CCC(C)CC(C)CCCCCCCCC(=O)N[C@H]1C[C@@H](O)[C@@H](NCCN)NC(=O)[C@@H]2[C@@H](O)CCN2C(=O)[C@H]([C@H](O)CCN)NC(=O)[C@H]([C@H](O)[C@@H](O)c2ccc(O)cc2)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@H]([C@@H](C)O)NC1=O. Appending empty arra

training set score: {'mean_squared_error': 25.441616215847095}
test set score: {'mean_squared_error': 23.810130556234252}


In [91]:
## graphconvmodel
loader = dc.data.CSVLoader(["JCHEM_PKA"], feature_field="SMILES", featurizer=dc.feat.ConvMolFeaturizer())
dataset = loader.create_dataset("dwar_pka1.csv")

splitter = dc.splits.RandomSplitter()
(train, test) = splitter.train_test_split(dataset=dataset)

graphconv_model = dc.models.GraphConvModel(n_tasks=1, mode='regression')

graphconv_model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
print('training set score:', graphconv_model.evaluate(train, [metric]))
print('test set score:', graphconv_model.evaluate(test, [metric]))

RDKit ERROR: [12:33:00] Explicit valence for atom # 2 O, 3, is greater than permitted
Failed to featurize datapoint 119, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [12:33:00] Explicit valence for atom # 0 N, 4, is greater than permitted
Failed to featurize datapoint 413, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [12:33:02] Explicit valence for atom # 13 Cl, 5, is greater than permitted
Failed to featurize datapoint 1697, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.r









training set score: {'mean_squared_error': 22.85586826675404}
test set score: {'mean_squared_error': 26.714683626877356}
