## Initialization

In [11]:
from plapt import Plapt
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import numpy as np

# Build the PLAPT predictor once; this single instance is reused by every
# benchmark section below via plapt.predict_affinity(...).
# NOTE(review): the recorded output of this cell shows onnxruntime graph
# warnings; they appear to be emitted while the model is loaded - confirm.
plapt = Plapt()

[0;93m2024-09-04 21:45:41.691906617 [W:onnxruntime:, graph.cc:1312 Graph] Initializer ProtOnly/Slice_Starts appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.[m
[0;93m2024-09-04 21:45:41.691917603 [W:onnxruntime:, graph.cc:1312 Graph] Initializer ProtOnly/Slice_Ends appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.[m
[0;93m2024-09-04 21:45:41.691920223 [W:onnxruntime:, graph.cc:1312 Graph] Initi

## Test_2016_290

In [12]:
# Load the Test2016_290 benchmark table from disk.
benchmark_data = pd.read_csv("data/Test2016_290.csv")

# Pull the three columns the evaluation needs out as plain Python lists:
# protein sequences, ligand SMILES strings, and the experimental affinity labels.
prot_seqs, mol_smiles, experimental_pKd = (
    benchmark_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [13]:
# Score each (protein sequence, SMILES) pair with the PLAPT predictor.
# The next cell reads 'neg_log10_affinity_M' from every returned entry.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [14]:
# Collect the predicted affinity from every prediction record.
predicted_pKd = [entry['neg_log10_affinity_M'] for entry in predictions]

# Guard: predictions and labels must pair up one-to-one before any metric is computed.
if len(experimental_pKd) != len(predicted_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the arithmetic below is element-wise.
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Per-sample error, shared by the MSE and MAE computations.
residuals = predicted_pKd - experimental_pKd

# Error metrics: mean squared error, mean absolute error, and root of the MSE.
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank correlation between predictions and labels; the second return value is discarded.
spearman_corr, _ = spearmanr(predicted_pKd, experimental_pKd)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"Spearman's Correlation: {spearman_corr}")

MSE: 1.43082343160439
MAE: 0.9060019385568028
RMSE: 1.1961703188109918
Spearman's Correlation: 0.8314992191087003


## CSAR HiQ 36

In [15]:
# Load the CSAR-HiQ_36 benchmark table from disk.
benchmark_data = pd.read_csv("data/CSAR-HiQ_36.csv")

# Pull the three columns the evaluation needs out as plain Python lists:
# protein sequences, ligand SMILES strings, and the experimental affinity labels.
prot_seqs, mol_smiles, experimental_pKd = (
    benchmark_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [16]:
# Score each (protein sequence, SMILES) pair with the PLAPT predictor.
# The next cell reads 'neg_log10_affinity_M' from every returned entry.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [17]:
# Evaluate the predictions for this benchmark against the experimental labels.
# Reads `predictions` and `experimental_pKd` from the preceding cells and
# prints MSE, MAE, RMSE, Spearman and Pearson correlation.

# Collect the predicted affinity from every prediction record.
predicted_pKd = [d['neg_log10_affinity_M'] for d in predictions]

# Ensure the lengths of the lists are the same
if len(predicted_pKd) != len(experimental_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Convert lists to numpy arrays for easier calculations
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Calculate MSE
mse = np.mean((predicted_pKd - experimental_pKd) ** 2)

# Calculate MAE
mae = np.mean(np.abs(predicted_pKd - experimental_pKd))

# Calculate RMSE
rmse = np.sqrt(mse)

# Calculate Spearman's (rank) and Pearson's (linear) correlation.
# Both calls also return a p-value, which is discarded here.
spearman_corr, _ = spearmanr(predicted_pKd, experimental_pKd)
pearson_corr, _ = pearsonr(predicted_pKd, experimental_pKd)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"Spearman's Correlation: {spearman_corr}")
# Label corrected from the misspelled "Pearsons's".
print(f"Pearson's Correlation: {pearson_corr}")

MSE: 1.8194395087681423
MAE: 1.1572850341267054
RMSE: 1.3488660084560447
Spearman's Correlation: 0.7423423423423423
Pearsons's Correlation: 0.7314860081349226


## Custom Benchmark

In [19]:
from datasets import load_dataset, Dataset
import random
# Fixed seed so the same row indices are drawn on every run.
random.seed(2101)

# Load the 'train' split, then keep 1000 distinct rows drawn from
# indices 10001..20000 (range end is exclusive).
binding_affinity_train = load_dataset("jglaser/binding_affinity")['train']
sampled_row_ids = random.sample(range(10001, 20001), 1000)
benchmark_data = binding_affinity_train.select(sampled_row_ids)

In [20]:
# Persist the sampled subset to disk under data/.
# Left as a bare expression on purpose: the cell displays to_csv's return value
# (recorded as 440114 below) - presumably the number of bytes/characters written; confirm.
benchmark_data.to_csv("data/benchmark1k2101.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 85.47ba/s]


440114

In [21]:
# Pull the three columns the evaluation needs straight from the sampled dataset:
# protein sequences, ligand SMILES strings, and the experimental affinity labels.
prot_seqs, mol_smiles, experimental_pKd = (
    benchmark_data[column]
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [22]:
# Score each (protein sequence, SMILES) pair with the PLAPT predictor.
# The next cell reads 'neg_log10_affinity_M' from every returned entry.
predictions = plapt.predict_affinity(prot_seqs, mol_smiles)

In [23]:
# Collect the predicted affinity from every prediction record.
predicted_pKd = [entry['neg_log10_affinity_M'] for entry in predictions]

# Guard: predictions and labels must pair up one-to-one before any metric is computed.
if len(experimental_pKd) != len(predicted_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the arithmetic below is element-wise.
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Per-sample error, shared by the MSE and MAE computations.
residuals = predicted_pKd - experimental_pKd

# Error metrics: mean squared error, mean absolute error, and root of the MSE.
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank correlation between predictions and labels; the second return value is discarded.
spearman_corr, _ = spearmanr(predicted_pKd, experimental_pKd)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"Spearman's Correlation: {spearman_corr}")

MSE: 0.8505419047962388
MAE: 0.6883422367572785
RMSE: 0.9222482880419127
Spearman's Correlation: 0.8821429045140452
