In [1]:
import sys
sys.path.insert(0, '/mnt/shenwanxiang/Research/ACANet/')
import torch
from clsar.dataset import LSSNS, HSSMS
from clsar.feature import Gen39AtomFeatures
from clsar.model.model import ACANet_PNA, get_deg, _fix_reproducibility # model
from clsar.model.loss import ACALoss, get_best_cliff,get_label_mask
_fix_reproducibility(42)
from clsar import ACANet
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from torch_geometric.loader import DataLoader

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Metadata table of the HSSMS / MoleculeACE benchmark, fetched over HTTP.
# Its `Dataset` column is used as the list of available dataset names.
meta = 'https://bidd-group.github.io/MPCD/dataset/HSSMS/MoleculeACE_benchmark/metadata/datasets.csv'
meta_table = pd.read_csv(meta)
datasets = meta_table['Dataset'].tolist()

In [3]:
def y_to_pIC50(y):
    """Map a benchmark label `y` onto the pIC50 scale.

    Computes ``-log10(10**(-y) * 1e-9)``, which is mathematically ``y + 9``.
    NOTE(review): the 1e-9 factor suggests `y` is -log10 of a nanomolar
    value being rescaled to molar -- confirm against the dataset definition.

    Parameters
    ----------
    y : float or numpy array of floats
        Label(s) on the original scale.

    Returns
    -------
    Same shape as `y`, values on the pIC50 scale.
    """
    linear = 10 ** -y          # undo the negative log
    rescaled = linear * 1e-9   # apply the 1e-9 scale factor
    return -np.log10(rescaled)

def pIC50_to_y(pIC50):
    """Map a pIC50 value back onto the benchmark label scale.

    Computes ``-log10(10**(-pIC50) * 1e9)``, which is mathematically
    ``pIC50 - 9``; this is the inverse of ``y_to_pIC50``.

    Parameters
    ----------
    pIC50 : float or numpy array of floats
        Value(s) on the pIC50 scale.

    Returns
    -------
    Same shape as `pIC50`, values on the original label scale.
    """
    linear = 10 ** -pIC50     # undo the negative log
    rescaled = linear * 1e9   # apply the 1e9 scale factor
    return -np.log10(rescaled)

def count_valid_triplets(labels: torch.Tensor,
                         cliff_lower: float = 0.2,
                         cliff_upper: float = 1.0) -> int:
    """Count (i, j, k) triplets without building a [B, B, B] mask.

    For an anchor i, j is a positive when ``|y_i - y_j| < cliff_lower`` and
    k is a negative when ``|y_i - y_k| >= cliff_upper`` (i itself is excluded
    from both). The result is the sum over anchors of
    ``n_positives(i) * n_negatives(i)``, so only [B, B] intermediates are
    allocated.

    Parameters
    ----------
    labels : torch.Tensor
        Label values; flattened to 1-D with ``view(-1)``.
    cliff_lower : float
        Strict upper bound on the label gap of a positive pair.
    cliff_upper : float
        Inclusive lower bound on the label gap of a negative pair.

    Returns
    -------
    int
        Total number of counted triplets.
    """
    flat = labels.view(-1)
    n_samples = flat.size(0)

    # Pairwise absolute label gaps |y_i - y_j|, shape [B, B].
    gap = (flat[:, None] - flat[None, :]).abs()

    # True everywhere except on the diagonal (drops the i == j pairs).
    off_diagonal = ~torch.eye(n_samples, dtype=torch.bool, device=flat.device)

    positives = (gap < cliff_lower) & off_diagonal
    negatives = (gap >= cliff_upper) & off_diagonal

    # Row i holds the candidates for anchor i; multiply the two row counts.
    per_anchor = positives.sum(dim=1) * negatives.sum(dim=1)

    return int(per_anchor.sum().item())


def get_n(y_train_pIC50, cl, cu, device):
    """Count triplets over the whole label set in one pass.

    The labels are turned into a tensor on `device` and handed to
    ``count_valid_triplets`` with `cl` as ``cliff_lower`` and `cu` as
    ``cliff_upper``.

    Returns
    -------
    int
        The triplet count reported by ``count_valid_triplets``.
    """
    label_tensor = torch.tensor(y_train_pIC50)
    label_tensor = label_tensor.to(device)
    return count_valid_triplets(label_tensor, cl, cu)

def get_n1(Xs_train, y_train_pIC50, device, cl, cu):
    """Return the largest per-batch mask sum seen over one pass of the data.

    The inputs are wrapped into a dataset via ``ACANet()._Xy_to_dataset`` and
    iterated in shuffled batches of 1024. For every batch the mask returned
    by ``get_label_mask(y, device, cl, cu)`` is summed (presumably the number
    of mined triplets in that batch -- confirm against ``clsar.model.loss``),
    and the maximum over all batches is returned.

    Because the loader uses ``shuffle=True``, batch composition -- and hence
    the result when there is more than one batch -- depends on the RNG state.

    Returns
    -------
    int
        The maximum summed mask value across batches (0 if there are none).
    """
    dataset = ACANet()._Xy_to_dataset(Xs_train, y_train_pIC50)
    batches = DataLoader(dataset, batch_size=1024, shuffle=True)

    largest = 0
    for batch in batches:
        # batch.y is assumed to be a [batch_size] tensor of activity values
        batch_labels = batch.y.to(device)
        triplet_mask = get_label_mask(batch_labels, device, cl, cu)
        largest = max(largest, int(triplet_mask.sum().item()))
    return largest

In [26]:
# Use the first GPU when one is available; otherwise fall back to CPU so the
# cell also runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# One summary DataFrame per processed dataset.
final_res = []
for dataset_name in ['CHEMBL287_Ki']:  # alternative: ['CHEMBL4203_Ki']

    # Directory that holds the previously computed threshold sweep results.
    save_dir = '../benchmark_performance/mae_aca_opt_cliff/%s' % dataset_name

    df = pd.read_csv('https://raw.githubusercontent.com/bidd-group/MPCD/main/dataset/HSSMS/MoleculeACE_benchmark/%s.csv' % dataset_name)
    df_train = df[df.split == 'train']
    df_test = df[df.split == 'test']
    Xs_train = df_train.smiles.values
    y_train = df_train.y.values

    # Convert the training labels to the pIC50 scale.
    y_train_pIC50 = y_to_pIC50(y_train)

    # One row per (cl, cu) threshold pair with its recorded rmse / rmse_err.
    dfcliff = pd.read_csv(os.path.join(save_dir,
                                       'cliff_performance.csv'),
                          index_col=0)

    res = []
    for idx, ts in tqdm(dfcliff.iterrows(), total=len(dfcliff), ascii=True):
        cl, cu, rmse, rmse_err = ts.cl, ts.cu, ts.rmse, ts.rmse_err

        # Triplet count over the full training set. This must be computed
        # for every row: leaving it out makes `n` either undefined on a
        # fresh kernel or a stale value left over from an earlier run.
        n = get_n(y_train_pIC50, cl, cu, device)

        # Largest per-batch count when the data is served in batches of 1024.
        n1 = get_n1(Xs_train, y_train_pIC50, device, cl, cu)

        res.append({'dataset': dataset_name, 'cl': cl, 'cu': cu,
                    'rmse': rmse, 'rmse_err': rmse_err, 'n': n, 'n1': n1})

    df1 = pd.DataFrame(res)
    final_res.append(df1)

15it [00:35,  2.39s/it]


In [27]:
# Stack the per-dataset summaries into one table. ignore_index gives the
# combined frame a unique 0..N-1 row index instead of repeating each
# per-dataset frame's own 0..k labels.
df2 = pd.concat(final_res, ignore_index=True)

In [33]:
# Persist the combined summary table; the row index is written as the first
# CSV column (pandas' default, spelled out here).
df2.to_csv('data.csv', index=True)