In [28]:
import pandas as pd 
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

def get_fingerprint(mol):
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)

def get_fingerprint_list(mol_list):
    result = []
    for m in mol_list:
        result.append(get_fingerprint(Chem.MolFromSmiles(m)))
    return result

def cal_sim(fp1, fp2):
    return DataStructs.TanimotoSimilarity(fp1, fp2)

dataset = 'ALDH1'
data = pd.read_csv(f'../data/{dataset}/original/screen.csv')
positive = data[data['y'] == 1]
negative = data[data['y'] == 0]
mol_list = positive['smiles']
negative_mol_list = negative['smiles']
fp_list = get_fingerprint_list(positive['smiles'])
negative_fp_list = get_fingerprint_list(negative['smiles'])


195 4690 C[NH+](C)C1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C([O-])=C3C(=O)c4c([O-])cccc4[C@@](C)(O)[C@H]3[C@H](O)[C@H]12
195 4691 O=C(CC1(O)C(=O)N(C/C=C/c2ccccc2)c2ccccc21)c1ccccn1
195 4692 CCOc1ccc(NS(=O)(=O)c2ccc(NC(C)=O)c(OC)c2)cc1
195 4693 COc1ccc(CNC(=O)CN(C(=O)c2ccc(C[NH+]3CCOCC3)o2)C2CCCC2)cc1
195 4694 Oc1oc(-c2ccccc2Br)nc1/C=N/c1ccccn1
195 4695 CCOC(=O)c1c(C)c(P(=S)(N2CCOCC2)N2CCOCC2)n(C)c1C
195 4696 CCOC(=O)c1c(C)[nH]c(C2=N[NH+]=C(NC3CC3)SC2)c1C
195 4697 COc1cccc(C2(O)CCN(C(=O)C3CC3)CC2)c1
195 4698 CCN(CC)S(=O)(=O)c1ccc(N2CCOCC2)c(NC(=O)c2cc(O)nc3ccccc23)c1
195 4699 O=C(CCl)N(c1ccc(Cl)cc1)c1c[nH]c2ccccc12
195 4700 O=C(CSc1nc2ccccc2s1)N1CCCCC1
195 4701 Cc1cccc(NC(=O)CSc2nnc(CNC(=O)c3ccc(S(=O)(=O)N(C)C)cc3)n2-c2ccccc2)c1
195 4702 C=CCN1C(=O)C2(C(C(=O)c3ccc4c(c3)OCCO4)=C(O)C(=O)N2CCCOC)c2ccccc21
195 4703 CN(C)c1ccc(/C=N/CCC2(O)CCOC(C)(C)C2)cc1
195 4704 C=CCN1C(=O)N/C(=C/c2ccc(C)o2)C1=O
195 4705 CCOC(=O)c1ccc([N-]S(=O)(=O)c2cc(-c3nnnn3C(C)C)ccc2OC)cc1
195 4706 CCN(CC)S(=O)(=O)c1ccc(N2CCCC

KeyboardInterrupt: 

In [34]:
sim = [0 for _ in range(11)]

# for i in range(len(fp_list)):
#     for j in range(i+1, len(fp_list)):
#         similarity = cal_sim(fp_list[i], fp_list[j])
#         sim[int(similarity*10)] += 1
p_smiles = ['C[NH+](C)[C@@H]1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C([O-])=C3C(=O)c4c([O-])cccc4[C@@](C)(O)[C@H]3[C@H](O)[C@@H]12']
n_smiles = ['C[NH+](C)C1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C([O-])=C3C(=O)c4c([O-])cccc4[C@](C)(O)C3C(O)C12']
for i in range(300, len(fp_list)):
    a = 0
    for j in range(len(negative_fp_list)):
        if i == j:
            continue
        similarity = cal_sim(fp_list[i], negative_fp_list[j])
        a = max(a, similarity)
        if similarity == 1:
            print(i, j, list(positive['smiles'])[i], list(negative['smiles'])[j])
            p_smiles.append(list(positive['smiles'])[i])
            n_smiles.append(list(negative['smiles'])[i])
    sim[int(a*10)] += 1
for idx, s in enumerate(sim):
    print(idx, s)

442 49814 O=C([O-])C/C=C(\NC(=O)OCc1ccccc1)C(=O)OCc1ccccc1 O=C([O-])C/C=C(/NC(=O)OCc1ccccc1)C(=O)OCc1ccccc1
680 83812 C=C1C(=O)O[C@@H]2[C@H]3O[C@]3(C)CC/C=C(/C)CC[C@@H]12 C=C1C(=O)O[C@@H]2C3OC3(C)CC/C=C(\C)CC[C@@H]12
1161 35173 Cn1cccc1/C=C1/SC(=S)N(CC(=O)[O-])C1=O Cn1cccc1/C=C1\SC(=S)N(CC(=O)[O-])C1=O
1193 67362 CCO[C@@H]1OC(C(=O)NCc2ccccc2)=C[C@H](c2ccc3c(c2)OCO3)[C@H]1CCCO CCO[C@H]1OC(C(=O)NCc2ccccc2)=C[C@@H](c2ccc3c(c2)OCO3)[C@@H]1CCCO
1319 78470 O=C(NCc1ccccc1)C1=C[C@@H](C2CC2)C[C@@H](OCc2ccc(CO)cc2)O1 O=C(NCc1ccccc1)C1=C[C@H](C2CC2)C[C@H](OCc2ccc(CO)cc2)O1
1331 6546 O=C1NC2(CCCCC2)Oc2ccccc21 O=C1NC2(CCCCCC2)Oc2ccccc21
1593 12360 C[C@]([NH3+])(C(=O)[O-])c1ccc(-c2nnn[n-]2)cc1 CC([NH3+])(C(=O)[O-])c1ccc(-c2nnn[n-]2)cc1
1915 7239 O=[N+]([O-])c1cc(F)c(N2CCOCC2)cc1N1CCCCC1 O=[N+]([O-])c1cc(F)c(N2CCCCC2)cc1N1CCOCC1
1981 62623 O=C(c1cc([N+](=O)[O-])ccc1N1CCCCC1)N1CCCCC1 O=C(c1cc([N+](=O)[O-])ccc1N1CCCC1)N1CCCCC1
1998 23459 CC(=O)n1cc([C@@H]2C=C(C(=O)N3CC[NH+](Cc4ccc5c(c4)OCO5)CC3)O[C@H](

In [35]:
dataset = 'ALDH1'
for i in range(5):
    data = pd.read_csv(f'../results_0826/{dataset}_exploitation_mlp_ensemble5_seed5/{i}/picked.csv')
    data = data.transpose().reset_index(drop=True)

    mol_list = []
    for x in range(len(data)):
        mol_list.append(data.loc[x,0])
    
    p_dup_list = []
    n_dup_list = []
    for x in mol_list:
        for y in p_smiles:
            if x == y:
                p_dup_list.append(y)
        for y in n_smiles:
            if x == y:
                n_dup_list.append(y)
print(len(p_dup_list), len(n_dup_list))
print(p_dup_list)
print(n_dup_list)

1 1
['O=[N+]([O-])c1cc(F)c(N2CCOCC2)cc1N1CCCCC1']
['O=C(NNC(=S)NCc1ccccc1)c1ccc(Br)o1']


In [None]:
dataset = 'ALDH1'

data = pd.read_csv(f'../results_0826/{dataset}_exploitation_mlp_ensemble5_seed5/0/picked.csv')
data = data.transpose().reset_index(drop=True)

positive_list = []
for x in range(len(data)):
    if data.loc[x, 1] == '1':
        positive_list.append(get_fingerprint(Chem.MolFromSmiles(data.loc[x,0])))
# all_list = []
# for x in range(len(data)):
#     all_list.append(get_fingerprint(Chem.MolFromSmiles(data.loc[x,0])))
sim = [0 for _ in range(11)]
for i in range(len(positive_list)):
    a = 0
    for j in range(len(positive_list)):
        if i == j:
            continue
        similarity = cal_sim(positive_list[i], positive_list[j])
        a = max(a, similarity)
    sim[int(a*10)] += 1
for idx, s in enumerate(sim):
    print(idx, s)

0 0
1 0
2 3
3 10
4 38
5 55
6 61
7 50
8 28
9 6
10 0


In [9]:
sim = [0 for _ in range(11)]
for i in range(len(fp_list)):
    a = 0
    for j in range(len(negative_fp_list)):
        similarity = cal_sim(fp_list[i], negative_fp_list[j])
        a = max(a, similarity)
    sim[int(a*10)] += 1
for idx, s in enumerate(sim):
    print(idx, s)

0 0
1 0
2 0
3 2
4 14
5 67
6 70
7 57
8 13
9 0
10 0


In [7]:
sim = [0 for _ in range(11)]
for i in range(len(fp_list)):
    a = 0
    for j in range(len(fp_list)):
        if i == j:
            continue
        similarity = cal_sim(fp_list[i], fp_list[j])
        a = max(a, similarity)
    sim[int(a*10)] += 1
for idx, s in enumerate(sim):
    print(idx, s)

0 0
1 10
2 172
3 857
4 1251
5 991
6 731
7 598
8 309
9 41
10 26


In [25]:
pick = pd.read_csv('../results_0708/result_ALDH1_exploitation_mlp/0/picked.csv', index_col=0)

pick_pos = []
for i in pick.columns:
    if pick.loc[1, i] == '1':
        pick_pos.append(get_fingerprint(Chem.MolFromSmiles(pick.loc[0, i])))
print(len(pick_pos))
similarity = [0 for _ in range(11)]
for i in range(len(pick_pos)):
    for j in range(i+1, len(pick_pos)):
        sim = cal_sim(pick_pos[i], pick_pos[j])
        similarity[int(sim*10)] += 1
for idx, s in enumerate(similarity):
    print(idx, s*589)

206
0 1751686
1 6886588
2 2497360
3 865241
4 303335
5 83049
6 28272
7 15314
8 5301
9 589
10 0
