In [1]:
### For automatically reloading import modules... allows you to run changes to code in jupyter without having to reload
%load_ext autoreload
%autoreload 2

In [2]:
# import zipfile
# import h5py
import os
import sys
import time
import timeit
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split

from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.drawOptions.addAtomIndices = True

from photocatalysis.learners_treesearch import get_population_completed
from photocatalysis.deeplearning.helpers import get_charset, smiles_to_onehot, one_hot_to_smile

from photocatalysis.deeplearning.models import VAE, train_epoch
import torch.nn.functional as F

In [3]:
import torch
import torch.optim as optim
import torchinfo

from torch import nn as nn
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(42)

# Report whether a CUDA device is visible; later cells pick `device` with the same check.
print(f'CUDA GPU Available: {torch.cuda.is_available()}')

CUDA GPU Available: False


In [4]:
# Read the dataframe from a JSON file stored in pandas' 'split' orientation.
# NOTE(review): this is an absolute, machine-specific path — the notebook cannot run elsewhere without editing it.
df = pd.read_json('/home/btpq/bt308495/Thesis/frames/DF_COMPLETE.json', orient='split')
# Project helper from photocatalysis.learners_treesearch; presumably restricts/completes the
# population in `df` — TODO confirm against its definition.
df = get_population_completed(df)

In [5]:
# Sequence length handed to smiles_to_onehot and, later, to the VAE as INPUT_SIZE.
input_size = 80
# Start-of-sequence (GO) character passed to get_charset.
sos_token = 'X'

# Build the character set from the SMILES column. The second return value is bound to
# max_smi_len (presumably the longest SMILES length — TODO confirm).
# NOTE(review): max_smi_len is never compared with input_size in this notebook.
char_list, max_smi_len = get_charset(df.molecule_smiles, sos_token=sos_token)
# One-hot encode every SMILES string
# (assumes the result is (n_molecules, input_size, len(char_list)) — TODO confirm).
data = smiles_to_onehot(df.molecule_smiles, char_list, input_size)

In [6]:
# 80/5/15 Train/Val/Test Split
# Step 1 holds out 20% of the rows; step 2 splits that hold-out 25%/75%,
# i.e. 5% of all rows for validation and 15% for testing.
# shuffle=False keeps the original row order, so each split is a contiguous slice of `data`.
data_train, data_test = train_test_split(data, test_size=0.2, shuffle=False)
data_valid, data_test = train_test_split(data_test, test_size=0.75, shuffle=False)

In [7]:
# Wrap each NumPy split as a torch tensor.
data_train_tensor, data_valid_tensor, data_test_tensor = (
    torch.from_numpy(split) for split in (data_train, data_valid, data_test)
)

# data_train_tensor_loader = torch.utils.data.TensorDataset(data_train_tensor)
# One mini-batch iterator per split; all three use batches of 250 and reshuffle on every pass.
train_loader, valid_loader, test_loader = (
    torch.utils.data.DataLoader(split_tensor, batch_size=250, shuffle=True)
    for split_tensor in (data_train_tensor, data_valid_tensor, data_test_tensor)
)

## Teacher forcing testing

In [14]:
# Build the one-hot vector of the start-of-sequence (GO) character.
# The character is taken from `sos_token` (the value the charset was built with) rather than a
# repeated literal, so the lookup cannot drift away from the charset definition.
goindex = np.where(char_list == sos_token)[0][0]
gotoken = torch.zeros(len(char_list))
gotoken[goindex] = 1

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Tensor.to returns a new tensor and leaves the original untouched, so the result must be
# re-bound; otherwise the GO token stays on the CPU while the model moves to `device`.
gotoken = gotoken.to(device)

# NEW MODEL
# VAE with teacher forcing enabled: the GO token seeds the first decoding step and
# probabilistic_sampling is switched on.
model = VAE(INPUT_SIZE=input_size,
            CHARSET_LEN=len(char_list),
            LATENT_DIM=292,
            filter_sizes=(9,9,10),
            kernel_sizes=(5,5,7),
            eps_std=1e-2,
            useTeacher=True,
            gotoken=gotoken,
            probabilistic_sampling=True).to(device)

optimizer = optim.Adam(model.parameters())
# Put the model in evaluation mode; as the last expression this also displays the module summary.
model.eval()

VAE(
  (conv_1): Conv1d(80, 9, kernel_size=(5,), stride=(1,))
  (conv_2): Conv1d(9, 9, kernel_size=(5,), stride=(1,))
  (conv_3): Conv1d(9, 10, kernel_size=(7,), stride=(1,))
  (linear_0): Linear(in_features=80, out_features=435, bias=True)
  (mean_linear_1): Linear(in_features=435, out_features=292, bias=True)
  (var_linear_2): Linear(in_features=435, out_features=292, bias=True)
  (linear_3): Linear(in_features=292, out_features=292, bias=True)
  (stacked_gru): GRU(292, 501, num_layers=3, batch_first=True)
  (terminalGRU): teacherGRU(
    (cell): advGRUCell(523, 501)
    (linear): Linear(in_features=501, out_features=22, bias=True)
  )
  (relu): ReLU()
)

# Teacher Forcing Development

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NEW MODEL
# Same architecture arguments as the teacher-forcing model, but without the
# useTeacher / gotoken / probabilistic_sampling arguments.
# NOTE(review): this rebinds `model` and `optimizer`, replacing the objects from the previous section.
model = VAE(INPUT_SIZE=input_size,
            CHARSET_LEN=len(char_list),
            LATENT_DIM=292,
            filter_sizes=(9,9,10),
            kernel_sizes=(5,5,7),
            eps_std=1e-2).to(device)

optimizer = optim.Adam(model.parameters())
# Evaluation mode; as the last expression this also displays the module summary.
model.eval()

In [None]:
X = data_train_tensor[:2]
z = model.reparameterize(*model.encode(X))

In [None]:
# Pass latent vector through fully connected layer (output shape == input shape)
# NOTE(review): presumably mirrors the first step of model.decode — confirm the activation matches.
zout = F.selu(model.linear_3(z))

# Repeat latent vectors seq_len number of times (batchsize, seq_len, latent_dim)
# so the GRU receives the same latent vector at every one of the INPUT_SIZE time steps.
zprime = zout.view(zout.size(0), 1, zout.size(-1)).repeat(1, model.INPUT_SIZE, 1)

In [None]:
out_ar, har = model.stacked_gru(zprime)
out_ar.shape

Character by character feeding into the GRU

In [None]:
zprime_char = zout.view(zout.size(0), 1, zout.size(-1))
zprime_char.shape

In [None]:
out0, hs0 = model.stacked_gru(zprime_char)
out0.shape, hs0.shape

In [None]:
out0[:, 0, :] == out_ar[:, 0, :]

In [None]:
out1, hs1 = model.stacked_gru(zprime_char, hs0)
out1.shape, hs1.shape

In [None]:
out1[:, 0, :] == out_ar[:, 1, :]

Putting it together

In [None]:
# Run the stacked GRU one time step at a time, threading the hidden state through,
# then join the per-step outputs along the sequence axis.
decoder_outputs = []
hs = None  # no hidden state yet: the first call starts from the GRU's default initial state
for i in range(model.INPUT_SIZE):
    out, hs = model.stacked_gru(zprime_char, hs)
    decoder_outputs.append(out)

out = torch.cat(decoder_outputs, dim=1)

In [None]:
torch.all(out_ar == out)

In [None]:
X.shape

In [None]:
21+292

In [None]:
torch.cat([z, X[:, 0, :]], dim=1).shape

In [None]:
xhat = model.decode(z)

In [None]:
xhat.shape

----

Data reading and GO token for teacher forcing?

In [None]:
import torch.utils.data as data
from torch.autograd import Variable

In [None]:
def replace_double(smi: str) -> str:
    """Swap the two-letter element symbols Br, Cl and Si for one-character placeholders.

    After the substitution ('Br' -> 'Ö', 'Cl' -> 'Ä', 'Si' -> 'Å') each of these elements
    occupies exactly one character of the string.

    Args:
        smi: SMILES string (or any string) to rewrite.

    Returns:
        The string with every occurrence of the three symbols replaced.
    """
    substitutions = (('Br', 'Ö'), ('Cl', 'Ä'), ('Si', 'Å'))
    for symbol, placeholder in substitutions:
        smi = smi.replace(symbol, placeholder)
    return smi

class OnehotEncoder(object):
    """Callable that converts a SMILES string into a (maxlen, alphabetlen) one-hot tensor.

    The vocabulary string is the padding character ' ', then ``sos_token``, then the
    caller's ``alphabet``, with two-letter element symbols collapsed by ``replace_double``.
    """

    def __init__(self, alphabet, sos_token="X", maxlen=120):
        # Start-of-seq token is 'X' by default
        vocabulary = replace_double(''.join([" ", sos_token] + alphabet))
        # character -> column index; a character occurring twice keeps its last position
        self.alphabet = {symbol: position for position, symbol in enumerate(vocabulary)}

        self.maxlen = maxlen
        self.alphabetlen = len(vocabulary)

    def __call__(self, smi: str):
        """Encode one SMILES string; positions past its end stay at index 0 (the ' ' column)."""
        encoded = replace_double(smi.rstrip())
        indices = torch.zeros(self.maxlen, 1, dtype=torch.long)
        for position, symbol in enumerate(encoded):
            indices[position] = self.alphabet[symbol]
        # Put a 1 in the column selected for each row.
        return torch.zeros(self.maxlen, self.alphabetlen).scatter_(1, indices, 1)
    
class OnehotDecoder(object):
    """Turn one-hot (or integer-index) encoded sequences back into strings.

    Index 0 decodes to the padding character ' ' and index 1 (the GO token) to the empty
    string; the remaining indices follow ``alphabet`` in order.
    """

    def __init__(self, alphabet):
        self.alphabet = [" ", ""] + alphabet  # Replace GO token with empty string

    def _indices_to_smiles(self, indices):
        """Decode a batch of index tensors (first axis = batch) into stripped strings."""
        smiles = []
        for row in indices:
            chars = [self.alphabet[index] for index in row.reshape(-1).tolist()]
            smiles.append("".join(chars).strip())
        return smiles

    def decode(self, onehot: torch.FloatTensor):
        """Decode one-hot tensors of shape (batch, seq, chars) or (seq, chars).

        Returns:
            A list with one decoded string per batch entry.
        """
        if onehot.dim() == 2:
            # A single sequence: add the batch axis.
            onehot = onehot[None, :, :]
        _, indices = torch.max(onehot, 2)
        return self._indices_to_smiles(indices)

    def decode_int(self, inds: torch.LongTensor):
        """Decode integer index tensors; each entry along the first axis is one sequence.

        A 2-D input is treated as a single sequence and gets a leading batch axis.

        Returns:
            A list with one decoded string per batch entry (empty list for an empty batch).
        """
        if inds.dim() == 2:
            inds = inds[None, :, :]
        # The list is returned only after every sequence has been decoded.
        return self._indices_to_smiles(inds)

In [None]:
class SMILESReader(data.Dataset):
    """Dataset over a list of SMILES strings that one-hot encodes each item on access.

    Every item is a pair ``(one_hot, cat_hot)``; ``cat_hot`` is always a LongTensor
    holding the single value -1.
    """

    def __init__(self, smiles_list, alphabet, subset=(0, None), maxlen=120):
        self.onehotencoder = OnehotEncoder(alphabet, maxlen=maxlen)
        # Keep only the requested slice of the input list.
        first, last = subset[0], subset[1]
        self.smiles = smiles_list[first:last]
        # Expose the encoder's character -> index mapping.
        self.alphabet = self.onehotencoder.alphabet

    def __getitem__(self, index):
        one_hot = self.onehotencoder(self.smiles[index])
        cat_hot = torch.full((1,), -1, dtype=torch.long)
        return one_hot, cat_hot

    def __len__(self):
        return len(self.smiles)

In [None]:
# Drop the last charset entry before building the reader and decoder
# (presumably the padding character ' ', which OnehotEncoder prepends itself — TODO confirm).
# NOTE(review): if char_list already contains the start token 'X', it appears twice in the
# encoder's vocabulary string — verify against get_charset.
alphabet = list(char_list[:-1])
smi_reader = SMILESReader(df.molecule_smiles.tolist(), alphabet, maxlen=80)
smi_decoder = OnehotDecoder(alphabet)

In [None]:
smi_decoder.decode(smi_reader[1][0])

In [None]:
smi_reader[0][0]

In [None]:
data_train_tensor[0]

In [None]:
smi_reader.alphabet

In [None]:
# One-hot GO token sized to the reader's alphabet mapping.
# NOTE(review): index 1 is hard-coded; it is the position OnehotEncoder gives the start token
# in its vocabulary string, but the mapping keeps the last position of a repeated character —
# confirm that smi_reader.alphabet['X'] == 1.
goindex = 1
batch_size = 256
gotoken = torch.FloatTensor(len(smi_reader.alphabet)).zero_()
gotoken[goindex] = 1

In [None]:
gotoken.repeat(batch_size, 1)

In [None]:
repeat_z = z.view(z.size(0), 1, z.size(-1)).repeat(1, model.INPUT_SIZE, 1)
o, h = model.stacked_gru(repeat_z)

In [None]:
%%timeit
# Flatten (batch, seq, hidden) to (batch*seq, hidden), project to logits, softmax over the
# last axis, then restore the (batch, seq, chars) layout.
# NOTE(review): names assigned inside a %%timeit cell are not kept in the notebook namespace,
# so later cells reading out_independent / logits need them defined elsewhere.
# NOTE(review): model.linear_4 is not among the layers in the VAE summary printed earlier —
# confirm the attribute exists in the current model class.
out_independent = o.contiguous().view(-1, o.size(-1))
logits = model.linear_4(out_independent)

y_test = F.softmax(logits, dim=1)
y = y_test.contiguous().view(o.size(0), -1, y_test.size(-1))

In [None]:
%%timeit
F.softmax(model.linear_4(o), dim=1)

In [None]:
repeat_z.shape

In [None]:
no = nn.Linear(501, 21)(o)

In [None]:
# `no` is (batch, seq, chars): normalise over the last (character) axis, not over sequence positions.
a = F.softmax(no, dim=-1)

In [None]:
a.shape

In [None]:
o.shape, out_independent.shape, logits.shape

In [None]:
logits.shape

# Teacher/Terminal GRU

In [None]:
# Two parallel readers over the same molecules: one on the bare SMILES and one on SMILES
# prefixed with the GO/SOS character 'X'. Both pad or encode to 80 positions.
alphabet = list(char_list[:-1]) #alphabet not including empty space char ' '... to be added in SMILESReader

smiles = df.molecule_smiles.tolist() # bare smiles list
smiles_go = df.molecule_smiles.map(lambda x: 'X'+x).tolist() # all smiles prepended with GO/SOS token
# NOTE(review): the 'X' prefix adds one character; a SMILES of exactly 80 characters would no
# longer fit in maxlen=80 for the GO reader — confirm the longest SMILES is shorter.

smi_reader = SMILESReader(smiles, alphabet, maxlen=80)
smi_reader_go = SMILESReader(smiles_go, alphabet, maxlen=80)

In [None]:
# One-hot GO token: a zero vector with a single 1 at the column the reader assigns to 'X'.
N_CHARS = len(smi_reader.alphabet)
goindex = smi_reader.alphabet['X']
gotoken = torch.zeros(N_CHARS)
gotoken[goindex] = 1

In [None]:
# Encode every molecule eagerly and stack along a new leading (batch) axis.
# X holds the bare SMILES, XGO the GO-prefixed SMILES; both iterate over len(smi_reader) items.
X = torch.stack([smi_reader[i][0] for i in range(len(smi_reader))])
XGO = torch.stack([smi_reader_go[i][0] for i in range(len(smi_reader))])

In [None]:
def sample_gumbel(input):
    """Draw Gumbel noise with the same shape as ``input``.

    Uniform samples u are transformed as g = -log(-log(u + eps) + eps); the noise is moved
    to CUDA when ``input`` lives there.

    Args:
        input: Tensor whose size (and CUDA placement) the noise should match.

    Returns:
        Noise tensor of size ``input.size()``.
    """
    eps = 1e-9
    samples = torch.rand(input.size())
    if input.is_cuda:
        samples = samples.cuda()
    # Applying "-log(x + eps)" twice maps uniform samples to Gumbel samples.
    for _ in range(2):
        samples = -torch.log(samples + eps)
    return Variable(samples)

def gumbel_softmax_sample(input_, hard=True, temperature=1., uselogprop=True):
    """Sample from a categorical distribution with the Gumbel-softmax trick.

    Args:
        input_: Log-probabilities (``uselogprop=True``) or probabilities
            (``uselogprop=False``); the last axis is the category axis.
        hard: If true, the forward value is a one-hot of the arg-max while gradients flow
            through the soft sample (straight-through estimator).
        temperature: Divisor applied to the perturbed scores before the softmax.
        uselogprop: Selects how ``input_`` is interpreted and whether the soft sample is a
            log-softmax (True) or a softmax (False).

    Returns:
        Tensor with the same size as ``input_``.
    """
    size = tuple(input_.size())
    # Work on a 2-D (rows, categories) view so every row is normalised independently.
    if input_.dim() >= 3:
        input_ = input_.view(-1, input_.size(-1))
    noise = sample_gumbel(input_)
    # The category axis is always the last one; pass it explicitly instead of relying on the
    # deprecated implicit-dim behaviour.
    if uselogprop:
        a = F.log_softmax((input_ + noise) / temperature, dim=-1)
    else:
        a = F.softmax((torch.log(input_) + noise) / temperature, dim=-1)

    if hard:
        # keepdim=True keeps the index tensor at the same rank as `a`, which scatter_ requires.
        _, max_inx = torch.max(a, a.dim() - 1, keepdim=True)
        a_hard = torch.zeros_like(a).scatter_(a.dim() - 1, max_inx, 1.0)
        # Straight-through: the detached difference makes the value one-hot while the
        # gradient is that of the soft sample `a`.
        a = (a_hard - a).detach() + a

    return a.view(size)

In [None]:
class advGRUCell(nn.GRUCell):
    """GRU cell with configurable activations that also returns the candidate pre-activation.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden features.
        activation: Function applied to the candidate pre-activation.
        inner_activation: Function applied to the reset and update gates.
    """

    def __init__(self, input_size, hidden_size, activation=F.tanh, inner_activation=F.sigmoid):
        super().__init__(input_size, hidden_size, bias=True)
        self.activation = activation
        self.inner_activation = inner_activation

    def forward(self, input, hx=None):
        """Advance one step.

        Returns:
            ``(hy, preactivation)``: the new hidden state and the candidate value before
            ``activation`` is applied.
        """
        if hx is None:
            # Zero initial state with the same dtype/device as the input.
            hx = input.new_zeros(input.size(0), self.hidden_size)

        # Both projections hold the reset, update and candidate parts stacked along axis 1.
        in_reset, in_update, in_cand = F.linear(input, self.weight_ih, self.bias_ih).chunk(3, 1)
        hid_reset, hid_update, hid_cand = F.linear(hx, self.weight_hh, self.bias_hh).chunk(3, 1)

        resetgate = self.inner_activation(in_reset + hid_reset)
        inputgate = self.inner_activation(in_update + hid_update)
        preactivation = in_cand + resetgate * hid_cand
        newgate = self.activation(preactivation)
        # Blend the candidate with the previous state.
        hy = newgate + inputgate * (hx - newgate)

        return hy, preactivation

In [None]:
class oldteacherGRU(nn.Module):
    """Single-layer GRU decoder that can feed the previous character into each step.

    With ``useTeacher`` the cell input at step i is the concatenation of ``y[:, i, :]`` and a
    one-hot "previous character": the GO token at step 0, afterwards the ground truth at
    step i-1 when given, otherwise the character sampled at the previous step.

    Args:
        input_size: Feature size of ``y``.
        hidden_size: Hidden size of the GRU cell.
        output_size: Number of characters.
        activation: Applied to the linear projection of the hidden state.
        gru_activation, gru_inner_activation: Forwarded to ``advGRUCell``.
        useTeacher: Enables the previous-character input.
        gotoken: One-hot GO token; required when ``useTeacher`` is true.
        multinomial: Sample with ``torch.multinomial`` (True) or take the top-k index (False).

    Raises:
        ValueError: If ``useTeacher`` is true and no ``gotoken`` is given.
    """

    def __init__(self, input_size, hidden_size, output_size, activation=F.log_softmax,
                 gru_activation=F.tanh, gru_inner_activation=F.sigmoid, useTeacher=True,
                 gotoken=None, multinomial=True):
        if useTeacher and gotoken is None:
            raise ValueError("Need to provide a gotoken when using teachers forcing")
        super(oldteacherGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.teacher = useTeacher
        self.gotoken = gotoken
        self.multinomial = multinomial

        # With teacher forcing the previous character is appended to every step's input.
        cell_input_size = input_size + output_size if useTeacher else input_size
        self.cell = advGRUCell(input_size=cell_input_size, hidden_size=hidden_size,
                               activation=gru_activation, inner_activation=gru_inner_activation)

        if self.multinomial:
            self.sample = torch.multinomial
        else:
            # Deterministic alternative: indices of the `top` largest entries.
            self.sample = lambda matrix, top: torch.topk(matrix, top)[1]

        self.linear = nn.Linear(hidden_size, output_size)
        self.activation = activation

    def forward(self, y, groundTruth=None, hx=None, max_length=None, temperature=0.5):
        """Decode ``max_length`` steps (default: the sequence length of ``y``).

        Returns:
            ``(output, preactivation, sampled_output, hx)``: activated projections per step,
            cell pre-activations per step, sampled one-hot characters per step, and the
            final hidden state.
        """
        batch_size, seq_length = y.size(0), y.size(1)
        n_steps = seq_length if max_length is None else max_length

        output, sampled_output, preactivation = [], [], []
        if hx is None:
            # Initialize hidden-state as zeros
            hx = y.data.new(batch_size, self.hidden_size).zero_()

        for i in range(n_steps):
            step_input = y[:, i, :]
            if self.teacher:
                if i == 0:
                    previous = self.gotoken.repeat(batch_size, 1)
                elif groundTruth is not None:
                    previous = groundTruth[:, i - 1, :]
                else:
                    previous = sampled_output[-1]
                step_input = torch.cat([step_input, previous], dim=-1)
            hx, pre = self.cell(step_input, hx=hx)

            # project into charset space with log_softmax activation
            output_ = self.activation(self.linear(hx))
            output.append(output_.view(batch_size, 1, self.output_size))
            preactivation.append(pre.view(batch_size, 1, self.hidden_size))

            ### Gumbel sampling
            # sampled_output.append(gumbel_softmax_sample(output_, hard=True, temperature=temperature, uselogprop=True))

            # Sample one character per row and store it as a one-hot vector.
            indices = self.sample(torch.exp(output_), 1)
            one_hot = output_.data.new(output_.size(0), self.output_size).zero_()
            one_hot.scatter_(1, indices, 1)
            sampled_output.append(one_hot)

        return (
            torch.cat(output, 1),  # log probabilities with the default activation
            torch.cat(preactivation, 1),
            torch.stack(sampled_output, 1),
            hx,
        )

In [None]:
class teacherGRU(nn.Module):
    """Terminal GRU decoder whose input at every step includes the previous character.

    At step i the cell receives ``y[:, i, :]`` concatenated with a one-hot character: the GO
    token at step 0, then the ground truth at the previous step when ``groundTruth`` is
    given (teacher forcing), otherwise the character sampled at the previous step.

    Args:
        input_size: Feature size of ``y``.
        hidden_size: Hidden size of the GRU cell.
        output_size: Number of characters.
        gru_activation, gru_inner_activation: Forwarded to ``advGRUCell``.
        gotoken: One-hot GO token of length ``output_size``; required.
        state_dict: Optional pair ``(cell_state_dict, linear_state_dict)`` to load.
        probabilistic_sampling: Sample with ``torch.multinomial`` (True) or take the
            most probable character (False).

    Raises:
        ValueError: If ``gotoken`` is None.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 gru_activation=F.tanh, gru_inner_activation=F.sigmoid,
                 gotoken=None, state_dict=None, probabilistic_sampling=True):

        if gotoken is None:
            raise ValueError("Need to provide a gotoken when using teachers forcing")

        super(teacherGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.gotoken = gotoken

        # The cell input is the decoder feature plus the one-hot previous character.
        self.cell = advGRUCell(input_size=input_size + output_size, hidden_size=hidden_size,
                               activation=gru_activation, inner_activation=gru_inner_activation)

        self.linear = nn.Linear(hidden_size, output_size)

        if state_dict is not None:
            self.cell.load_state_dict(state_dict[0])
            self.linear.load_state_dict(state_dict[1])

        if probabilistic_sampling:
            self.sample = torch.multinomial
        else:
            def topi(matrix, top):
                return torch.topk(matrix, top)[1]
            self.sample = topi

    def forward(self, y, groundTruth=None, hx=None):
        """Decode every time step of ``y``.

        Args:
            y: Tensor indexed as (batch, step, feature).
            groundTruth: Optional one-hot targets indexed as (batch, step, character).
            hx: Optional initial hidden state; zeros when omitted.

        Returns:
            ``(output, preactivation, sampled_output, hx)``: per-step character
            probabilities (not log-probabilities), per-step cell pre-activations, per-step
            sampled one-hot characters, and the final hidden state.
        """
        batch_size = y.size(0)
        seq_length = y.size(1)

        output = []
        sampled_output = []
        preactivation = []

        # The GO token is a plain attribute, so it does not follow the module through
        # .to()/.cuda(); align it with the input before concatenating.
        target = self.gotoken.to(device=y.device, dtype=y.dtype).repeat(batch_size, 1)

        if hx is None:
            hx = y.new_zeros(batch_size, self.hidden_size)

        for i in range(seq_length):
            input_ = torch.cat([y[:, i, :], target], dim=-1)
            hx, pre = self.cell(input_, hx=hx)
            output_ = F.log_softmax(self.linear(hx), dim=1)

            # Sampling: draw one character per row and encode it as a one-hot vector.
            probs = torch.exp(output_)
            indices = self.sample(probs, 1)
            one_hot = torch.zeros_like(output_).scatter_(1, indices, 1)

            # Construct output lists
            output.append(probs.view(batch_size, 1, self.output_size))
            preactivation.append(pre.view(batch_size, 1, self.hidden_size))
            sampled_output.append(one_hot)

            if groundTruth is not None:
                # Teacher force actual ground-truth
                target = groundTruth[:, i, :]
            else:
                # Feed in own prediction
                target = one_hot

        output = torch.cat(output, 1)  # probabilities (exp of the log-softmax)
        preactivation = torch.cat(preactivation, 1)
        sampled_output = torch.stack(sampled_output, 1)

        return output, preactivation, sampled_output, hx

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NEW MODEL
# CHARSET_LEN is len(char_list)+1 here
# (presumably to match the width of the one-hot tensors in X — TODO confirm).
model = VAE(INPUT_SIZE=input_size,
            CHARSET_LEN=len(char_list)+1,
            LATENT_DIM=292,
            filter_sizes=(9,9,10),
            kernel_sizes=(5,5,7),
            eps_std=1e-2).to(device)

optimizer = optim.Adam(model.parameters())
model.eval()

# Manually reproduce the decoder up to the stacked GRU for the first 10 molecules:
# encode -> sample z -> fully connected layer -> repeat over INPUT_SIZE steps -> stacked GRU.
# NOTE(review): x is not moved to `device`; this only lines up when device == 'cpu'.
x = X[:10]
z = model.reparameterize(*model.encode(x))
zout = F.selu(model.linear_3(z))
zprime = zout.view(zout.size(0), 1, zout.size(-1)).repeat(1, model.INPUT_SIZE, 1)
o, h = model.stacked_gru(zprime)

In [None]:
model.training

In [None]:
# NOTE(review): `teachergru` is not defined in any visible cell of this notebook (presumably an
# instance created in a since-removed cell) — this cell fails on a fresh kernel.
output, preactivation, sampled_output, hx = teachergru.forward(o, groundTruth=x)

In [None]:
# NOTE(review): `terminalgru` is not defined in any visible cell of this notebook — this cell
# fails on a fresh kernel. With multinomial sampling the two decoders also draw different
# random samples unless the RNG is re-seeded before each call.
outputt, preactivationt, sampled_outputt, hxt = terminalgru.forward(o, groundTruth=x)

In [None]:
sampled_output == sampled_outputt

In [None]:
output[:, 0, :].data.new(output[:, 0, :].size(0), terminalgru.output_size).zero_().scatter_(1, topi, 1)

In [None]:
one_hot.scatter_(1, indices, 1)

In [None]:
torch.all(output == outputt)

In [None]:
teachergru.output == terminalgru.output

In [None]:
output[0], outputt[0]

Loss

In [None]:
def categorical_crossentropy(y_pred, y_true, batch_average=True, timestep_average=True):
    """Cross-entropy between predicted class scores and one-hot targets.

    ``y_pred`` is first rescaled so that its last axis sums to 1, then clipped to
    [1e-7, 1 - 1e-7] before taking the logarithm. The input tensor is left unmodified.

    Args:
        y_pred: Non-negative scores, indexed as (batch, timestep, class).
        y_true: Targets with the same shape as ``y_pred``; treated as constants.
        batch_average: Divide the summed loss by ``y_pred.size(0)``.
        timestep_average: Divide the summed loss by ``y_pred.size(1)``.

    Returns:
        Scalar loss tensor.
    """
    # scale preds so that the class probas of each sample sum to 1
    # (out of place: an in-place division would rescale the caller's tensor and is rejected
    # by autograd for leaf tensors that require grad)
    y_pred = y_pred / torch.sum(y_pred, dim=-1, keepdim=True)
    # manual computation of crossentropy
    epsilon = 1E-7
    output = F.hardtanh(y_pred, min_val=epsilon, max_val=1. - epsilon)
    loss = -torch.sum(y_true.detach() * torch.log(output))

    if batch_average:
        loss = loss / y_pred.size()[0]
    if timestep_average:
        loss = loss / y_pred.size()[1]
    return loss

In [None]:
# Convert log-probabilities to probabilities.
# NOTE(review): this is only correct if `output` holds log-probabilities; the teacherGRU class
# defined in this notebook already returns probabilities — confirm which object produced `output`.
probs = torch.exp(output)


In [None]:
categorical_crossentropy(probs, x)

In [None]:
probs_shaped = probs.view(-1, probs.size(-1))
x_shaped = x.view(-1, x.size(-1))

In [None]:
# NOTE(review): F.cross_entropy applies log_softmax to its input internally, so it expects
# unnormalised logits; probs_shaped holds probabilities — confirm this comparison is intended.
F.cross_entropy(probs_shaped, x_shaped, reduction='mean')

In [None]:
F.binary_cross_entropy(probs_shaped, x_shaped, reduction='mean')

----

In [None]:
from photocatalysis.deeplearning.models import cyclical_annealing, linear_annealing

In [None]:
# Total number of steps and the step indices 0 .. T-1 at which the schedules are evaluated.
T = 200
steps = list(range(T))

In [None]:
betas = [cyclical_annealing(s, T, 4) for s in steps]
betas_lin = [linear_annealing(s, T) for s in steps]

In [None]:
# Plot both annealing schedules on labelled axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(steps, betas, label='cyclical_annealing')
ax.plot(steps, betas_lin, label='linear_annealing')
ax.set(xlabel='step', ylabel='beta', title='Annealing schedules')
ax.legend();

In [None]:
def perturb_z(z, noise_norm, num_samples=1, constant_norm=False):
    """Sample points around a single latent vector.

    Each sample is ``z`` plus a random direction (a normalised standard-normal draw) scaled
    by a radius: exactly ``noise_norm`` when ``constant_norm`` is true, otherwise a radius
    drawn uniformly from [0, noise_norm) for every sample.

    Args:
        z: 1-D array, the latent vector to perturb.
        noise_norm: Radius (or maximum radius) of the perturbation.
        num_samples: Number of perturbed copies to generate.
        constant_norm: Place all samples at distance ``noise_norm`` instead of within it.

    Returns:
        Array of shape (num_samples, len(z)).
    """
    assert z.ndim == 1, 'Can only process one latent vector z of shape 1xLATENT_DIM'
    centers = np.tile(z, (num_samples, 1))
    # Random unit directions: normalise standard-normal draws row by row.
    directions = np.random.normal(0, 1, size=centers.shape)
    directions /= np.linalg.norm(directions, axis=1)[:, None]
    if constant_norm:
        # at noise_norm std deviations away...
        radii = noise_norm
    else:
        # upto noise_norm std deviations away (one radius per sample, R < noise_norm)
        radii = np.random.uniform(0, noise_norm, size=(centers.shape[0], 1))
    return centers + radii * directions

In [None]:
Z = np.random.randn(292)

In [None]:
new_Z = perturb_z(Z, 5)

-----

In [None]:
from photocatalysis.deeplearning.models import VAE, train_epoch

In [None]:
# Model and training definitions
torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 120

# NEW MODEL
model = VAE(INPUT_SIZE=input_size,
            CHARSET_LEN=len(char_list),
            LATENT_DIM=292,
            filter_sizes=(5,5,6),
            kernel_sizes=(5,5,7)).to(device)

# LOAD PREV MODEL
# LOAD PREV MODEL
# model = VAE()
# model.load_state_dict(torch.load('/content/drive/MyDrive/VAE_model_parmas.pt', map_location=torch.device(device)))
# model.to(device)

optimizer = optim.Adam(model.parameters())

In [None]:
# batch_size = 1
# torchinfo.summary(VAE(), input_size=(batch_size, 120, 33))
# torchinfo.summary(model, input_size=(batch_size, input_size, len(char_list)))

In [None]:
# Training
tls, vls = [], []
for epoch in range(1, epochs+1):
    training_losses, validation_loss = train_epoch(train_loader,
                                                   model,
                                                   optimizer,
                                                   validation_data_loader=valid_loader,
                                                   device=device,
                                                   charset=char_list,
                                                   epoch=epoch)
    
    tls.append(training_losses), vls.append(validation_loss)
    break