In [None]:
import os

import torch

from dataset import LogPDataset, LogSDataset
from model import Autoencoder, GNN3DFull

# Loading LogP dataset from the relative path ../data/logp and printing it
# so the cell output shows what was loaded.
logp_dataset = LogPDataset("../data/logp")
print(logp_dataset)

# Loading LogS dataset from the relative path ../data/logs and printing it
# so the cell output shows what was loaded.
logs_dataset = LogSDataset("../data/logs")
print(logs_dataset)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The bare expression on the last line displays the chosen device as the cell output.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Running Tests for LogS

In [None]:
def vary_dataset_size(dataset, dataset_name, n_epochs_autoencoders = 3, printstep = 1000, min_examples = 300, divisions = 3, max_examples = 600):
    """Train the atom/bond autoencoders on `dataset`, then one fresh GNN3D per training-set size.

    Phase 1 trains both autoencoders for `n_epochs_autoencoders` passes over the
    whole dataset (one optimizer step per molecule) and saves their weights.

    Phase 2 trains `divisions + 1` independent GNN3DFull models. Each model makes
    a single pass over the first N molecules of `dataset`, where N starts at
    `min_examples` and grows by ``int((max_examples - min_examples) / divisions) + 1``
    per division. Because of the ``+ 1`` the largest N can slightly exceed
    `max_examples`. The autoencoders are only used as frozen encoders here.

    Uses the notebook-level `device` for all models and tensors.

    Args:
        dataset: Sized iterable of molecules. Each molecule is indexed 0-8:
            [0] atomic vectors and [1] bond vectors (fed to the autoencoders),
            [2]-[7] tensors passed to GNN3DFull unchanged, [8] the target.
            Assumes the atomic/bond vectors match Autoencoder(80, 10) and
            Autoencoder(10, 3) respectively - TODO confirm against model.py.
        dataset_name: Tag embedded in the names of the saved checkpoint files.
        n_epochs_autoencoders: Number of passes over `dataset` for the autoencoders.
        printstep: Print autoencoder progress every `printstep` molecules
            (and on the last molecule of each epoch).
        min_examples: Training-set size of the first GNN3D model.
        divisions: Number of size increments; `divisions + 1` models are trained.
            Must be non-zero.
        max_examples: Upper end of the size range used to derive the step size.

    Returns:
        Tuple ``(atom_rmse_dict, bond_rmse_dict, gnn_rmse_dict)``.
        The first two map ``"ep<k>"`` to the running-average reconstruction RMSE
        after each molecule of autoencoder epoch k. The third maps ``"d<N>"`` to
        the running-average prediction RMSE after each molecule of the GNN3D run
        trained on N molecules.

    Side effects:
        Creates ``./models/poster`` if needed and writes the autoencoder and
        per-division GNN3D state dicts into it.
    """
    # Create the checkpoint directory up front so a missing folder fails fast
    # instead of after all training has finished.
    os.makedirs("./models/poster", exist_ok=True)

    atom_autoencoder = Autoencoder(80, 10).to(device)
    bond_autoencoder = Autoencoder(10, 3).to(device)

    mse_loss_fn = torch.nn.MSELoss()
    atom_autoencoder_optimizer = torch.optim.Adam(atom_autoencoder.parameters())
    bond_autoencoder_optimizer = torch.optim.Adam(bond_autoencoder.parameters())

    # Training autoencoders on dataset.
    # One list per epoch, sized from the argument so any epoch count works.
    atom_rmse_dict = {f"ep{epoch_i}": [] for epoch_i in range(n_epochs_autoencoders)}
    bond_rmse_dict = {f"ep{epoch_i}": [] for epoch_i in range(n_epochs_autoencoders)}
    last_index = len(dataset) - 1
    for epoch_i in range(n_epochs_autoencoders):
        epoch_key = f"ep{epoch_i}"
        atom_rmse = 0
        bond_rmse = 0
        for i, molecule in enumerate(dataset):
            atomic_vectors = molecule[0].to(device)
            bond_vectors = molecule[1].to(device)

            atomic_vectors_reconstructed = atom_autoencoder(atomic_vectors)
            bond_vectors_reconstructed = bond_autoencoder(bond_vectors)

            # MSELoss convention is (input, target); the value is the same either way.
            atom_autoencoder_loss = mse_loss_fn(atomic_vectors_reconstructed, atomic_vectors)
            bond_autoencoder_loss = mse_loss_fn(bond_vectors_reconstructed, bond_vectors)

            # Taking optimization step
            atom_autoencoder_optimizer.zero_grad()
            bond_autoencoder_optimizer.zero_grad()
            atom_autoencoder_loss.backward()
            bond_autoencoder_loss.backward()
            atom_autoencoder_optimizer.step()
            bond_autoencoder_optimizer.step()

            # Incremental mean of the per-molecule RMSE within this epoch
            atom_rmse = (atom_rmse * i + torch.sqrt(atom_autoencoder_loss).item()) / (i + 1)
            bond_rmse = (bond_rmse * i + torch.sqrt(bond_autoencoder_loss).item()) / (i + 1)

            atom_rmse_dict[epoch_key].append(atom_rmse)
            bond_rmse_dict[epoch_key].append(bond_rmse)

            if i % printstep == 0 or i == last_index:
                print(f"epoch.{epoch_i}, ex.{i}, atom rmse: {atom_rmse}, bond rmse: {bond_rmse}")

    # Saving autoencoders
    torch.save(atom_autoencoder.state_dict(), "./models/poster/atom_autoencoder"+ dataset_name +".pth")
    torch.save(bond_autoencoder.state_dict(), "./models/poster/bond_autoencoder"+ dataset_name +".pth")

    # Training GNN3D
    example_step_size = int((max_examples - min_examples) / divisions) + 1

    gnn_rmse_dict = {}

    for division_i in range(divisions + 1):
        n_examples = min_examples + example_step_size * division_i
        division_key = "d" + str(n_examples)

        # Making an instance of the model and an optimizer (fresh weights per division)
        gnn3d = GNN3DFull(atomic_vector_size= 10, bond_vector_size=3, number_of_molecular_features = 200, number_of_targets = 1).to(device)
        gnn3d_optimizer = torch.optim.Adam(gnn3d.parameters())

        avg_rmse = 0
        gnn_rmse_dict[division_key] = []

        for i, molecule in enumerate(dataset):

            # Stop after exactly n_examples molecules (indices 0 .. n_examples - 1),
            # so the "d<N>" key matches the number of molecules trained on.
            if i >= n_examples:
                break

            target = molecule[8].to(device)

            # The autoencoders are frozen here (only gnn3d is optimized), so skip
            # building their autograd graph and accumulating unused gradients.
            with torch.no_grad():
                encoded_atoms = atom_autoencoder.encode(molecule[0].to(device))
                encoded_bonds = bond_autoencoder.encode(molecule[1].to(device))

            input_representation = [
                    encoded_atoms,
                    encoded_bonds,
                    molecule[2].to(device),
                    molecule[3].to(device),
                    molecule[4].to(device),
                    molecule[5].to(device),
                    molecule[6].to(device),
                    molecule[7].to(device)]

            # Making prediction
            prediction = gnn3d(input_representation)

            # Computing losses
            loss = mse_loss_fn(prediction, target)

            # Taking optimization step
            gnn3d_optimizer.zero_grad()
            loss.backward()
            gnn3d_optimizer.step()

            # Updating average losses (incremental mean of per-molecule RMSE)
            avg_rmse = (avg_rmse * i + torch.sqrt(loss).item()) / (i + 1)
            gnn_rmse_dict[division_key].append(avg_rmse)

            if i % 20 == 0:
                print(f"Ep. {division_i + 1}/{divisions + 1}, Ex. {i}/{n_examples}, avg rmse: {avg_rmse}, immediate mse: {loss.item()}, target: {target.item()}, pred: {prediction.item()}")

        # Saving model in this division
        torch.save(gnn3d.state_dict(), "./models/poster/gnn3d_"+dataset_name+"_div"+str(n_examples)+".pth")

    return atom_rmse_dict, bond_rmse_dict, gnn_rmse_dict

In [None]:
# LogS experiment: GNN3D training-set sizes start at 100 and grow over 4 divisions.
logs_atom_rmse_dict, logs_bond_rmse_dict, logs_gnn_rmse_dict = vary_dataset_size(
    logs_dataset,
    "logs",
    min_examples = 100,
    divisions = 4,
)

In [None]:
import pickle

# Saving dictionaries
# Create the output folder first so the results of the long LogS run are not
# lost to a FileNotFoundError when the files are opened for writing.
loss_dict_dir = './models/poster/loss_dicts'
os.makedirs(loss_dict_dir, exist_ok=True)

# File name -> loss dictionary returned by the LogS run
logs_loss_dicts = {
    'logs_atom_rmse_dict2.pkl': logs_atom_rmse_dict,
    'logs_bond_rmse_dict2.pkl': logs_bond_rmse_dict,
    'logs_gnn_rmse_dict2.pkl': logs_gnn_rmse_dict,
}
for file_name, loss_dict in logs_loss_dicts.items():
    with open(loss_dict_dir + '/' + file_name, 'wb') as f:
        pickle.dump(loss_dict, f)

In [None]:
# LogP experiment, run with the function's default settings.
(logp_atom_rmse_dict,
 logp_bond_rmse_dict,
 logp_gnn_rmse_dict) = vary_dataset_size(logp_dataset, "logp")

In [None]:
# Plots
from matplotlib import pyplot as plt

# Bond-autoencoder RMSE plot for LogP: one curve per dictionary entry, labelled by its key
for epoch_label, rmse_curve in logp_bond_rmse_dict.items():
    plt.plot(rmse_curve, label = epoch_label)
plt.legend()
plt.title("Bond Autoencoder Loss for LogP")
plt.savefig('./models/poster/plots/bond_rmse_logp.svg', format='svg')
plt.show()


[Download the bond autoencoder RMSE plot for LogP (SVG)](./models/poster/plots/bond_rmse_logp.svg)

In [None]:
# GNN3D plots for LogP: one SVG per entry of logp_gnn_rmse_dict.
# The current figure is cleared before each curve so every file holds a single run;
# the legend shows the dictionary key with its leading character stripped.
for plot_index, (size_key, rmse_curve) in enumerate(logp_gnn_rmse_dict.items()):
    plt.clf()
    plt.plot(rmse_curve, label = "dataset size = " + size_key[1:])
    plt.legend()
    plt.title("GNN3D loss in first epoch for logp")
    plt.savefig("./models/poster/plots/gnn3d_logp_" + str(plot_index) +".svg", format='svg')

In [None]:
# Sizewise plots
from matplotlib import pyplot as plt

# Start from an empty figure, then plot the last recorded RMSE of every LogS GNN3D
# run against the integer parsed from its dictionary key (key minus first character).
plt.clf()
x = [int(size_key[1:]) for size_key in logs_gnn_rmse_dict]
y = [rmse_curve[-1] for rmse_curve in logs_gnn_rmse_dict.values()]

plt.plot(x, y)
plt.title("RMSE Loss of GNN3D for LogS with increasing dataset size.")
plt.savefig("./models/poster/plots/loss_vary_dataset_logs2.svg", format="svg")
plt.show()