In [1]:
# --- Standard libraries
import pickle as pkl
import numpy as np
import heapq as hq
# --- PyTorch
import torch
from torch_geometric.data import Batch
# --- RDKit
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize = 300,300
# --- Bokeh
from bokeh.io import output_notebook, show, export_png
from bokeh.layouts import row
from bokeh.models import Label
output_notebook()
# --- Science python
from sklearn.metrics import mean_squared_error
from scipy.stats import wasserstein_distance, spearmanr
# --- Modules from local files
from new_gnn import GNN
from utils.model import get_spec_prediction
from utils.utils import bokeh_spectra, calculate_rse, count_funct_group, bokeh_hist

### Coronene

#### Set model parameters

In [2]:
num_tasks = 200
num_layers = 3
emb_dim = 15
in_channels = [int(emb_dim), 64, 128]
out_channels = [64, 128, 256]
gnn_type = 'gcn'
heads = 1
drop_ratio = 0.5
graph_pooling = 'mean'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'model_gnn_atom_test.pt'

deg = torch.tensor((0,20378, 60422, 86692, 12316))

#### Load the saved model

In [3]:
# --- Load the saved model
num_tasks = 200

model = GNN(
    num_tasks,
    num_layers,
    emb_dim,
    in_channels,
    out_channels,
    gnn_type,
    heads,
    drop_ratio,
    graph_pooling,
)
model = model.to(device)
model.load_state_dict(torch.load("ML_models/" + model_name, map_location='cpu'))

<All keys matched successfully>

#### Load the test data

In [4]:
test_dataset = torch.load("./datasets/atom_test_dataset_long_7895.pt")
print(f'Length of test dataset: {len(test_dataset)}')

Length of test dataset: 895


#### Use the model to predict the test data

In [5]:
# --- Create empty dictionary to contain spectra
predict = {}
true = {}

# --- Loop over all molecules in test dateset and assign to dictionary index
for index in range(len(test_dataset)):
    predict[index], true[index] = get_spec_prediction(model, index, test_dataset, device)

# --- Parse spectra into dictionary
model_dict = [predict, true]

name = 'spectra_ml_atom_test.pkl'

with open('model_results/' + name, "wb") as file:
    pkl.dump(model_dict, file)

#### View predictions

In [6]:
file = open('model_results/' + name, 'rb')
data = pkl.load(file)

predict = data[0]
true = data[1]

#### Perform analysis of predictions

In [7]:
wasser = []
mse = []
rse = []

for x in range(len(predict)):
    # Wasserstein metric
    wass_temp = wasserstein_distance(predict[x], true[x])
    wasser.append(wass_temp)
    # Mean squared error
    mse_temp = mean_squared_error(predict[x], true[x])
    mse.append(mse_temp)
    # RSE
    rse_temp = calculate_rse(predict[x], true[x])
    rse.append(rse_temp)

print(f"Average Wasserstein distance = {sum(wasser) / len(wasser)}")
print(f"Average MSE = {sum(mse) / len(mse)}")
print(f'Average RSE = {sum(rse) / len(rse)}')

Average Wasserstein distance = 0.0450256200867876
Average MSE = 0.010294521528728015
Average RSE = 0.1919560448250957


In [8]:
rank_rse = hq.nsmallest(789, rse)

rank_graph = []

for x in range(789):
    rank_idx = rse.index(rank_rse[x])
    rank_graph.append(rank_idx)

print('The 5 best RSE values are:')
for x in range(5):
    print(f'RSE = {rank_rse[x]:.3f}, graph number = {rank_graph[x]}')

print('')
print('The 5 worst RSE values are:')
for x in range(-1, -6, -1):
    print(f'RSE = {rank_rse[x]:.3f}, graph number = {rank_graph[x]}')

The 5 best RSE values are:
RSE = 0.082, graph number = 251
RSE = 0.094, graph number = 63
RSE = 0.097, graph number = 303
RSE = 0.103, graph number = 429
RSE = 0.109, graph number = 760

The 5 worst RSE values are:
RSE = 0.245, graph number = 383
RSE = 0.244, graph number = 699
RSE = 0.243, graph number = 291
RSE = 0.241, graph number = 721
RSE = 0.241, graph number = 877


#### View and compare predictions

In [9]:
# --- Plot best spectra prediction
p1 = bokeh_spectra(predict[rank_graph[0]], true[rank_graph[0]])
p2 = bokeh_spectra(predict[rank_graph[1]], true[rank_graph[1]])
p3 = bokeh_spectra(predict[rank_graph[2]], true[rank_graph[2]])
p = row(p1, p2, p3)
show(p)
#export_png(p2, filename='ave.png')

In [10]:
# --- Plot worst spectra prediction
p1 = bokeh_spectra(predict[rank_graph[-1]], true[rank_graph[-1]])
p2 = bokeh_spectra(predict[rank_graph[-2]], true[rank_graph[-2]])
p3 = bokeh_spectra(predict[rank_graph[-3]], true[rank_graph[-3]])
p = row(p1, p2, p3)
show(p)
#export_png(p, filename='worst.png')

In [13]:
bins = np.linspace(0.05, 0.3, 35)
hist, edges = np.histogram(rank_rse, density=True, bins=bins)
average_rse = sum(rse) / len(rse)
p_hist = bokeh_hist(hist, edges, average_rse, 2)
l1 = Label(x=0.19, y=11, x_units='data', y_units='data', text='RSE = 0.166', text_font_size='24px')
p_hist.add_layout(l1)

show(p_hist)
#export_png(p_hist, filename='GO_atom_hist.png')