In [2]:
from math import sqrt
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from rdkit import Chem
from scipy.stats import pearsonr
from tqdm import tqdm

import random

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'white', font_scale=2)

In [84]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances

In [85]:
def label_incoherence_index(X, y, k=10):
    """Average absolute label gap between each sample and its k nearest neighbours.

    For every row of ``X`` the ``k`` nearest *other* rows are looked up with
    scikit-learn's ``NearestNeighbors`` (default metric), the absolute
    differences between the sample's label and its neighbours' labels are
    averaged, and those per-sample averages are averaged once more.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix in which neighbours are searched. Must expose ``.shape``.
    y : array-like with ``n_samples`` entries along its first axis
        Numeric labels, row-aligned with ``X``. Converted with ``np.asarray`` so
        that a pandas Series is indexed by position rather than by its index.
    k : int, default=10
        Number of neighbours per sample; must satisfy ``1 <= k < n_samples``.

    Returns
    -------
    numpy.float64
        Mean over samples of the mean absolute label difference to the
        sample's ``k`` neighbours.

    Raises
    ------
    ValueError
        If ``y`` does not have one entry per row of ``X`` or ``k`` is out of
        range.
    """
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X has {n_samples} rows but y has {y.shape[0]} entries; they must be row-aligned"
        )
    if not 1 <= k < n_samples:
        raise ValueError(f"k must satisfy 1 <= k < n_samples ({n_samples}), got {k}")

    nbrs = NearestNeighbors(n_neighbors=k).fit(X)
    # Calling kneighbors() without a query returns the neighbours of the fitted
    # points with each point excluded from its own neighbour list. Dropping
    # "column 0" of kneighbors(X) instead is only correct when no other row is
    # at distance zero (duplicate rows can displace the sample itself).
    indices = nbrs.kneighbors(return_distance=False)

    # Row i holds |y[neighbour] - y[i]| for the k neighbours of sample i.
    label_gaps = np.abs(y[indices] - y[:, None])
    per_sample = label_gaps.reshape(n_samples, -1).mean(axis=1)
    return per_sample.mean()

def feature_incoherence_index(X, k=5, metric="minkowski"):
    """Average cosine distance between each sample and its k nearest neighbours.

    Neighbours are selected with scikit-learn's ``NearestNeighbors`` using
    ``metric``; the incoherence itself is always measured as the cosine
    distance between a sample and each selected neighbour. The per-sample
    means are then averaged.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray`` so that DataFrame input
        is indexed by position.
    k : int, default=5
        Number of neighbours per sample; must satisfy ``1 <= k < n_samples``.
    metric : str or callable, default="minkowski"
        Metric forwarded to ``NearestNeighbors`` for the neighbour search only.
        The default reproduces the previous behaviour (scikit-learn's own
        default); pass ``"cosine"`` to select neighbours with the same
        distance that is being averaged.

    Returns
    -------
    numpy.float64
        Mean over samples of the mean cosine distance to the sample's ``k``
        neighbours.

    Raises
    ------
    ValueError
        If ``k`` is out of range for the number of rows in ``X``.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if not 1 <= k < n_samples:
        raise ValueError(f"k must satisfy 1 <= k < n_samples ({n_samples}), got {k}")

    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(X)
    # kneighbors() without a query excludes every fitted point from its own
    # neighbour list, which stays correct even when X contains duplicate rows.
    indices = nbrs.kneighbors(return_distance=False)

    per_sample = [
        # X[i:i + 1] keeps the sample 2-D, as cosine_distances expects.
        np.mean(cosine_distances(X[i:i + 1], X[neighbor_ids])[0])
        for i, neighbor_ids in enumerate(indices)
    ]
    return np.mean(per_sample)

In [86]:
# Disabled alternative inputs: the latent_space_*.csv files restricted to their
# 'dim1' and 'dim2' columns. Kept for reference; swap with the two active
# read_csv lines below to analyse those instead.
# embd_aca = pd.read_csv('./results/latent_space_with_aca.csv', index_col= 0)[['dim1', 'dim2']]
# embd_no_aca = pd.read_csv('./results/latent_space_no_aca.csv', index_col= 0)[['dim1', 'dim2']]

# Active inputs: the `embeddings_with_aca` / `embeddings_no_aca` tables with all
# of their columns (no ['dim1', 'dim2'] subset). The first CSV column becomes
# the index; the index of embd_aca is later parsed as SMILES strings.
embd_aca = pd.read_csv('./results/embeddings_with_aca.csv', index_col= 0)
embd_no_aca = pd.read_csv('./results/embeddings_no_aca.csv', index_col= 0)

# Table that provides the `pChEMBL` column used as the label vector.
# NOTE(review): all three tables are consumed positionally afterwards, so they
# are assumed to share the same row order -- nothing here enforces it; confirm
# against the code that wrote these CSVs.
label = pd.read_csv('./results/chemical_space.csv', index_col= 0)

In [87]:
# Feature matrices as NumPy arrays: X1 from the with-aca embedding table,
# X2 from the no-aca embedding table (one row per index entry).
X1 = embd_aca.values
X2 = embd_no_aca.values

In [88]:
# Label vector: the `pChEMBL` column of the label table as a NumPy array.
# NOTE(review): y is paired row-by-row with X1, X2 and ecfps below, so the label
# table must be in the same row order as the embedding tables -- TODO confirm.
y = label.pChEMBL.values

In [89]:
# Label incoherence of the with-aca embeddings, using the function's default k.
# The printed prefix is Chinese for roughly:
# "Label incoherence in the model (with-aca) latent space:".
lii = label_incoherence_index(X1, y)
print("基于模型（with-aca）隐空间的标签不连贯性:", lii)

基于模型（with-aca）隐空间的标签不连贯性: 0.199171875


In [90]:
# Label incoherence of the no-aca embeddings, using the function's default k.
# Note that `lii` is reassigned here, replacing the with-aca value.
# The printed prefix is Chinese for roughly:
# "Label incoherence in the model (non-aca) latent space:".
lii = label_incoherence_index(X2, y)
print("基于模型（non-aca）隐空间的标签不连贯性:", lii)

基于模型（non-aca）隐空间的标签不连贯性: 0.28993749999999996


In [97]:
# Feature incoherence of the with-aca embeddings, using the function's default k.
# The bare `fii` on the last line makes the notebook display the value.
fii = feature_incoherence_index(X1)
fii

0.008533438520324624

In [98]:
# Feature incoherence of the no-aca embeddings, using the function's default k.
# `fii` is reassigned here, replacing the with-aca value computed earlier.
fii = feature_incoherence_index(X2)
fii

0.029359810330929898

In [100]:
# Build Morgan bit-vector fingerprints (radius 2, `dim` bits) for every SMILES
# in the index of embd_aca, then stack them into a dense 0/1 matrix `ecfps`
# with one row per molecule and one column per bit.
dim = 2048
from rdkit.Chem import AllChem
mols = [Chem.MolFromSmiles(s) for s in embd_aca.index]

# MolFromSmiles returns None for SMILES it cannot parse; fail early with the
# offending strings instead of an opaque argument error inside the
# fingerprint call.
unparsed_smiles = [s for s, mol in zip(embd_aca.index, mols) if mol is None]
if unparsed_smiles:
    raise ValueError(f"RDKit could not parse {len(unparsed_smiles)} SMILES: {unparsed_smiles[:5]}")

ECFP4_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dim) for mol in tqdm(mols, ascii=True)]
# list(fp) expands each bit vector into its individual 0/1 entries.
ecfps = np.array([list(fp) for fp in ECFP4_fps])

100%|##########| 128/128 [00:00<00:00, 8390.97it/s]


In [95]:
# Label incoherence with neighbours searched in fingerprint space (`ecfps`)
# instead of a model embedding space.
# The printed prefix is Chinese for roughly: "Structure-based label incoherence:".
# NOTE(review): the recorded execution counter of this cell (95) is lower than
# that of the cell that builds `ecfps` (100), so the stored output may predate
# the current fingerprints -- re-run the notebook top to bottom to confirm.
lii = label_incoherence_index(ecfps, y)
print("基于结构的标签不连贯性:", lii)

基于结构的标签不连贯性: 0.48994531249999995


In [101]:
# Feature incoherence of the fingerprint matrix, using the function's default k.
# The bare `fii` on the last line makes the notebook display the value.
fii = feature_incoherence_index(ecfps)
fii

0.14218594244112037