In [None]:
# default_exp train

# esol
> Using `molmapnets` for regression, tested on the [`eSOL`](http://www.tanpaku.org/tp-esol/index.php?lang=en) dataset.

In [None]:
#all_slow

In [None]:
%config Completer.use_jedi = False

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(palette='Set2')
colors = sns.color_palette()
colors

In [None]:
#export
import torch
from torch import nn, optim
import torch.nn.functional as F
torch.set_default_dtype(torch.float64)

from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
from chembench import dataset
from molmap import MolMap

from molmapnets.models import MolMapRegression



## Feature extraction 

The `chembench` package collected several different datasets for benchmarking the models. Here we'll use the [`eSOL`](http://www.tanpaku.org/tp-esol/index.php?lang=en) dataset, which collects the solubility of all E.coli proteins. The data can be loaded with

In [None]:
data = dataset.load_ESOL()

total samples: 1128


We have the SMILES (Simplified Molecular Input Line Entry System) strings for the different proteins and their corresponding solubility measurements:

In [None]:
data.df.head()

Unnamed: 0,smiles,measured log solubility in mols per litre
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,CC(C)=CCCC(C)=CC(=O),-2.06
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,c1ccsc1,-1.33


Using MolMap we can extract features using the SMILES strings as input. We can specify the feature type `ftype`, the pairwise feature distance calculation method `metric`, and the feature grid arrangement method `fmap_type`:

In [None]:
MolMap?

[0;31mInit signature:[0m
[0mMolMap[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mftype[0m[0;34m=[0m[0;34m'descriptor'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflist[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfmap_type[0m[0;34m=[0m[0;34m'grid'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfmap_shape[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit_channels[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric[0m[0;34m=[0m[0;34m'cosine'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvar_thr[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      <no docstring>
[0;31mInit docstring:[0m
paramters
-----------------
ftype: {'fingerprint', 'descriptor'}, feature type
flist: feature list, if you want use some of the features instead of all features, each element in flist should be the id of a feature
fmap_shape: 

In [None]:
descriptor = MolMap(ftype='descriptor', metric='cosine',)

In [None]:
fingerprint = MolMap(ftype='fingerprint', metric='cosine')

After setting up the feature extraction method, we can then use the `.fit` method of the feature object to extract the features. During this step we need to specify the algorithm (`method`) used to embed the higher-dimensional features into a 2D representation:

In [None]:
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)

2021-07-22 21:38:23,335 - INFO - [bidd-molmap] - Applying grid feature map(assignment), this may take several minutes(1~30 min)
2021-07-22 21:38:26,510 - INFO - [bidd-molmap] - Finished


In [None]:
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=10,)

  "Using precomputed metric; transform will be unavailable for new data"


2021-07-22 21:38:54,356 - INFO - [bidd-molmap] - Applying grid feature map(assignment), this may take several minutes(1~30 min)
2021-07-22 22:11:46,397 - INFO - [bidd-molmap] - Finished


And we can then visualise the feature maps

In [None]:
descriptor.plot_grid()

2021-07-22 22:11:46,423 - INFO - [bidd-molmap] - generate file: ./descriptor_1344_cosine_umap_molmap
2021-07-22 22:11:46,504 - INFO - [bidd-molmap] - save html file to ./descriptor_1344_cosine_umap_molmap


In [None]:
fingerprint.plot_grid()

2021-07-22 22:11:46,553 - INFO - [bidd-molmap] - generate file: ./fingerprint_15846_cosine_umap_molmap
2021-07-22 22:11:46,866 - INFO - [bidd-molmap] - save html file to ./fingerprint_15846_cosine_umap_molmap


In [None]:
descriptor.plot_scatter()

2021-07-22 22:11:47,061 - INFO - [bidd-molmap] - generate file: ./descriptor_1344_cosine_umap_scatter
2021-07-22 22:11:47,117 - INFO - [bidd-molmap] - save html file to ./descriptor_1344_cosine_umap_scatter


In [None]:
fingerprint.plot_scatter()

2021-07-22 22:11:47,149 - INFO - [bidd-molmap] - generate file: ./fingerprint_15846_cosine_umap_scatter
2021-07-22 22:11:47,523 - INFO - [bidd-molmap] - save html file to ./fingerprint_15846_cosine_umap_scatter


## Regression using the descriptor map

In [None]:
X = descriptor.batch_transform(data.x)

100%|##########| 1128/1128 [08:45<00:00,  2.35it/s]


In [None]:
X.shape

(1128, 37, 37, 13)

In PyTorch the training data for computer vision problems takes the shape `(n_channels, height, width)`, while the features extracted from `MolMap` take the shape `(height, width, n_channels)`, so we'll first correct it by moving the channels dimension before the feature map dimensions.

In [None]:
torch.movedim(torch.from_numpy(X), -1, 1).shape

torch.Size([1128, 13, 37, 37])

In [None]:
Y = data.y

In [None]:
Y.shape

(1128, 1)

Now from these feature maps we can create the dataset suitable for training models in PyTorch

In [None]:
#export
class SingleFeatureData(Dataset):
    """Process a single feature map for model training.

    y: target, a numpy array indexed by sample along its first axis
    X: feature map, a numpy array indexed by sample along its first axis and
       with the channels on its last axis; the channel axis is moved to
       position 1 so that every sample is returned channels-first
    transform: optional callable applied to each feature map on access
    target_transform: optional callable applied to each target on access

    Raises `ValueError` if `y` and `X` hold a different number of samples.
    """
    def __init__(self, y, X, transform=None, target_transform=None):
        self.y = torch.from_numpy(y)
        self.X = torch.movedim(torch.from_numpy(X), -1, 1)
        # `__len__` is driven by `y`, so a shorter/longer `X` would otherwise
        # only surface later as an IndexError or as silently ignored samples.
        if self.y.shape[0] != self.X.shape[0]:
            raise ValueError(
                f"y and X must have the same number of samples, "
                f"got {self.y.shape[0]} and {self.X.shape[0]}"
            )
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        x = self.X[idx]
        t = self.y[idx]
        # Compare against None rather than relying on truthiness: a callable
        # that happens to be falsy (e.g. an empty container module) must
        # still be applied.
        if self.transform is not None:
            x = self.transform(x)
        if self.target_transform is not None:
            t = self.target_transform(t)
        return x, t

In [None]:
esol = SingleFeatureData(data.y, X)

In [None]:
train, val, test = random_split(esol, [904,112,112], generator=torch.Generator().manual_seed(7))

In [None]:
len(train), len(val), len(test)

(904, 112, 112)

In [None]:
# Only the training batches need to be reshuffled every epoch; the evaluation
# loaders keep a fixed order so that running validation/test losses are
# comparable between runs.
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=False)
test_loader = DataLoader(test, batch_size=8, shuffle=False)

And we can get one batch of data by wrapping the data loader in an iterator

In [None]:
x, t = next(iter(train_loader))

In [None]:
t

tensor([[-2.3560],
        [-2.4840],
        [-1.0600],
        [ 0.3900],
        [ 0.9600],
        [-2.1700],
        [-2.7300],
        [-0.6200]])

In [None]:
x.shape

torch.Size([8, 13, 37, 37])

Finally with the data prepared we can train the models. These are tests to show that the models work as expected, but we can certainly fine tune the model to achieve better results.

In [None]:
# Run settings: number of passes over the training set and the device to
# train on (first GPU when available, CPU otherwise).
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the network on the chosen device, then the loss and the optimiser
# over its parameters.
model = MolMapRegression().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

And the training loop

In [None]:
for epoch in range(epochs):

    # Put the model in training mode explicitly, so that re-running this cell
    # after an evaluation pass does not train with mode-dependent layers
    # still switched to inference behaviour.
    model.train()

    # running sum of the batch losses within the current epoch
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):

        xb, yb = xb.to(device), yb.to(device)

        # zero gradients
        optimizer.zero_grad()

        # forward propagation
        pred = model(xb)

        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        # print statistics: mean batch loss so far in this epoch
        running_loss += loss.item()
        if (i+1) % 50 == 0:    
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))

print('Training finished')

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[Epoch: 1, Iter:    50] Training loss: 6.567
[Epoch: 1, Iter:   100] Training loss: 5.444
[Epoch: 2, Iter:    50] Training loss: 3.304
[Epoch: 2, Iter:   100] Training loss: 2.579
[Epoch: 3, Iter:    50] Training loss: 2.061
[Epoch: 3, Iter:   100] Training loss: 1.807
[Epoch: 4, Iter:    50] Training loss: 1.179
[Epoch: 4, Iter:   100] Training loss: 1.294
[Epoch: 5, Iter:    50] Training loss: 1.191
[Epoch: 5, Iter:   100] Training loss: 1.397
Training finished


Loss on validation data set

In [None]:
# Evaluate in inference mode and put the model back in whatever mode it was
# in afterwards, so that later training cells are unaffected.
# NOTE(review): assumes `model` is a torch `nn.Module` (uses `.training`, `.eval()`, `.train()`).
was_training = model.training
model.eval()

running_loss = 0.0
n_batches = 0
try:
    with torch.no_grad():
        for i, (xb, yb) in enumerate(val_loader):

            xb, yb = xb.to(device), yb.to(device)

            # forward propagation
            pred = model(xb)

            # loss calculation
            loss = criterion(pred, yb)
            running_loss += loss.item()
            n_batches += 1
            if (i+1) % 3 == 0:    
                print('[Iter: %5d] Validation loss: %.3f' %
                        (i + 1, running_loss / (i+1)))
finally:
    model.train(was_training)

# The periodic printout above skips the trailing batches whenever the number
# of batches is not a multiple of 3, so also report the mean over all of them.
if n_batches:
    print('Mean validation loss over %d batches: %.3f' %
            (n_batches, running_loss / n_batches))


[Iter:     3] Validation loss: 0.732
[Iter:     6] Validation loss: 0.950
[Iter:     9] Validation loss: 0.959
[Iter:    12] Validation loss: 0.983


## Regression using the fingerprint map

In [None]:
X_fingerprint = fingerprint.batch_transform(data.x)

100%|##########| 1128/1128 [03:37<00:00,  5.55it/s]


In [None]:
X_fingerprint.shape

(1128, 126, 126, 12)

Now from these feature maps we can create the dataset suitable for training models in PyTorch

In [None]:
esol_fingerprint = SingleFeatureData(data.y, X_fingerprint)

In [None]:
train_fingerprint, val_fingerprint, test_fingerprint = random_split(esol_fingerprint, [904,112,112], generator=torch.Generator().manual_seed(7))

In [None]:
# check the sizes of the fingerprint splits created in the previous cell
len(train_fingerprint), len(val_fingerprint), len(test_fingerprint)

(904, 112, 112)

In [None]:
# Only the training batches need to be reshuffled every epoch; the evaluation
# loaders keep a fixed order so that running validation/test losses are
# comparable between runs.
train_loader_fingerprint = DataLoader(train_fingerprint, batch_size=8, shuffle=True)
val_loader_fingerprint = DataLoader(val_fingerprint, batch_size=8, shuffle=False)
test_loader_fingerprint = DataLoader(test_fingerprint, batch_size=8, shuffle=False)

And we can get one batch of data by wrapping the data loader in an iterator

In [None]:
x, t = next(iter(train_loader_fingerprint))

In [None]:
t.shape

torch.Size([8, 1])

In [None]:
x.shape

torch.Size([8, 12, 126, 126])

And regression. Different feature maps have a different number of channels.

In [None]:
# Run settings: number of passes over the training set and the device to
# train on (first GPU when available, CPU otherwise).
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The fingerprint map has 12 channels, so the first convolution is told to
# expect 12 input channels. Build the network on the chosen device, then the
# loss and the optimiser over its parameters.
model_fingerprint = MolMapRegression(conv_in1=12).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model_fingerprint.parameters(), lr=0.001)

And the training loop

In [None]:
for epoch in range(epochs):

    # Put the model in training mode explicitly, so that re-running this cell
    # after an evaluation pass does not train with mode-dependent layers
    # still switched to inference behaviour.
    model_fingerprint.train()

    # running sum of the batch losses within the current epoch
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader_fingerprint):

        xb, yb = xb.to(device), yb.to(device)

        # zero gradients
        optimizer.zero_grad()

        # forward propagation
        pred = model_fingerprint(xb)

        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        # print statistics: mean batch loss so far in this epoch
        running_loss += loss.item()
        if (i+1) % 50 == 0:    
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))

print('Training finished')

[Epoch: 1, Iter:    50] Training loss: 4.537
[Epoch: 1, Iter:   100] Training loss: 3.935
[Epoch: 2, Iter:    50] Training loss: 2.271
[Epoch: 2, Iter:   100] Training loss: 1.954
[Epoch: 3, Iter:    50] Training loss: 1.258
[Epoch: 3, Iter:   100] Training loss: 1.157
[Epoch: 4, Iter:    50] Training loss: 1.027
[Epoch: 4, Iter:   100] Training loss: 0.880
[Epoch: 5, Iter:    50] Training loss: 0.660
[Epoch: 5, Iter:   100] Training loss: 0.649
Training finished


Loss on validation data set

In [None]:
# Evaluate in inference mode and put the model back in whatever mode it was
# in afterwards, so that later training cells are unaffected.
# NOTE(review): assumes `model_fingerprint` is a torch `nn.Module` (uses `.training`, `.eval()`, `.train()`).
was_training = model_fingerprint.training
model_fingerprint.eval()

running_loss = 0.0
n_batches = 0
try:
    with torch.no_grad():
        for i, (xb, yb) in enumerate(val_loader_fingerprint):

            xb, yb = xb.to(device), yb.to(device)

            # forward propagation
            pred = model_fingerprint(xb)

            # loss calculation
            loss = criterion(pred, yb)
            running_loss += loss.item()
            n_batches += 1
            if (i+1) % 3 == 0:    
                print('[Iter: %5d] Validation loss: %.3f' %
                        (i + 1, running_loss / (i+1)))
finally:
    model_fingerprint.train(was_training)

# The periodic printout above skips the trailing batches whenever the number
# of batches is not a multiple of 3, so also report the mean over all of them.
if n_batches:
    print('Mean validation loss over %d batches: %.3f' %
            (n_batches, running_loss / n_batches))


[Iter:     3] Validation loss: 0.442
[Iter:     6] Validation loss: 0.599
[Iter:     9] Validation loss: 0.708
[Iter:    12] Validation loss: 0.851


## Regression using both feature maps

If we want to use both the feature maps, we have to process the training data differently.

In [None]:
#export
class DoubleFeatureData(Dataset):
    """Process a pair of feature maps for model training.

    y: target, a numpy array indexed by sample along its first axis
    X: tuple of two feature maps, each a numpy array indexed by sample along
       its first axis and with the channels on its last axis; the channel
       axis of each map is moved to position 1 so that every sample is
       returned channels-first
    transform: optional callable applied to each of the two feature maps on access
    target_transform: optional callable applied to each target on access

    Each item is returned as `((x1, x2), t)`.
    Raises `ValueError` if `X` is not a pair, or if `y` and the two feature
    maps do not all hold the same number of samples.
    """
    def __init__(self, y, X, transform=None, target_transform=None):
        X1, X2 = X
        self.y = torch.from_numpy(y)
        self.X1 = torch.movedim(torch.from_numpy(X1), -1, 1)
        self.X2 = torch.movedim(torch.from_numpy(X2), -1, 1)
        # `__len__` is driven by `y`, so misaligned inputs would otherwise
        # only surface later as an IndexError or as silently ignored samples.
        n_y, n_1, n_2 = self.y.shape[0], self.X1.shape[0], self.X2.shape[0]
        if not (n_y == n_1 == n_2):
            raise ValueError(
                f"y, X[0] and X[1] must have the same number of samples, "
                f"got {n_y}, {n_1} and {n_2}"
            )
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        x1 = self.X1[idx]
        x2 = self.X2[idx]
        t = self.y[idx]
        # Compare against None rather than relying on truthiness: a callable
        # that happens to be falsy must still be applied.
        if self.transform is not None:
            x1 = self.transform(x1)
            x2 = self.transform(x2)
        if self.target_transform is not None:
            t = self.target_transform(t)
        return (x1, x2), t

Now we can feed both the feature maps to the model as a tuple

In [None]:
double_feature = DoubleFeatureData(data.y, (X, X_fingerprint))

In [None]:
train_double, val_double, test_double = random_split(double_feature, [904,112,112], generator=torch.Generator().manual_seed(7))

In [None]:
len(train_double), len(val_double), len(test_double)

(904, 112, 112)

In [None]:
# Only the training batches need to be reshuffled every epoch; the evaluation
# loaders keep a fixed order so that running validation/test losses are
# comparable between runs.
train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=False)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=False)

And we can get one batch of data by wrapping the data loader in an iterator

In [None]:
x, t = next(iter(train_loader_double))

In [None]:
t.shape

torch.Size([8, 1])

In [None]:
x1, x2 = x
x1.shape, x2.shape

(torch.Size([8, 13, 37, 37]), torch.Size([8, 12, 126, 126]))

And regression. Different feature maps have a different number of channels.

In [None]:
# Run settings: number of passes over the training set and the device to
# train on (first GPU when available, CPU otherwise).
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Two inputs: 13 channels for the descriptor map and 12 for the fingerprint
# map. Build the network on the chosen device, then the loss and the
# optimiser over its parameters.
model_double = MolMapRegression(conv_in1=13, conv_in2=12).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model_double.parameters(), lr=0.001)

And the training loop

In [None]:
for epoch in range(epochs):

    # Put the model in training mode explicitly, so that re-running this cell
    # after an evaluation pass does not train with mode-dependent layers
    # still switched to inference behaviour.
    model_double.train()

    # running sum of the batch losses within the current epoch
    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):

        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)

        # zero gradients
        optimizer.zero_grad()

        # forward propagation: both feature maps are passed as one tuple
        pred = model_double((x1, x2))

        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        # print statistics: mean batch loss so far in this epoch
        running_loss += loss.item()
        if (i+1) % 50 == 0:    
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))

print('Training finished')

[Epoch: 1, Iter:    50] Training loss: 6.050
[Epoch: 1, Iter:   100] Training loss: 4.608
[Epoch: 2, Iter:    50] Training loss: 2.854
[Epoch: 2, Iter:   100] Training loss: 2.403
[Epoch: 3, Iter:    50] Training loss: 1.834
[Epoch: 3, Iter:   100] Training loss: 1.536
[Epoch: 4, Iter:    50] Training loss: 1.027
[Epoch: 4, Iter:   100] Training loss: 0.996
[Epoch: 5, Iter:    50] Training loss: 0.595
[Epoch: 5, Iter:   100] Training loss: 0.677
Training finished


Loss on validation data set

In [None]:
# Evaluate in inference mode and put the model back in whatever mode it was
# in afterwards, so that later training cells are unaffected.
# NOTE(review): assumes `model_double` is a torch `nn.Module` (uses `.training`, `.eval()`, `.train()`).
was_training = model_double.training
model_double.eval()

running_loss = 0.0
n_batches = 0
try:
    with torch.no_grad():
        for i, ((x1, x2), yb) in enumerate(val_loader_double):

            x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)

            # forward propagation: both feature maps are passed as one tuple
            pred = model_double((x1, x2))

            # loss calculation
            loss = criterion(pred, yb)
            running_loss += loss.item()
            n_batches += 1
            if (i+1) % 3 == 0:    
                print('[Iter: %5d] Validation loss: %.3f' %
                        (i + 1, running_loss / (i+1)))
finally:
    model_double.train(was_training)

# The periodic printout above skips the trailing batches whenever the number
# of batches is not a multiple of 3, so also report the mean over all of them.
if n_batches:
    print('Mean validation loss over %d batches: %.3f' %
            (n_batches, running_loss / n_batches))

print('Validation finished')

[Iter:     3] Validation loss: 1.191
[Iter:     6] Validation loss: 1.088
[Iter:     9] Validation loss: 1.002
[Iter:    12] Validation loss: 0.874
Validation finished
