In [1]:
import pandas as pd

data = pd.read_csv('iGB3externel.csv')


In [2]:
import numpy as np
data['Smiles'] = np.nan

In [3]:
data.head()

Unnamed: 0,Peptides,Sequences,Smiles,Pubchem_CID,residue,IC50 (μmol/L)
0,Pep2,R-G-D-X183-V,,,,
1,Pep3,R-G-D-X71-V,,176873.0,N-methyl-d-phenylalanine,
2,Pep4,R-G-D-X183-K,,,,
3,Pep5,R-G-D-X718-K,,,,
4,Pep6,R-G-D-X183-C,,,,


In [5]:
from utils.util import build_peptide_from_smiles
# from utils.getsmiles_f import compute_features

In [6]:
GPepT_monomer = pd.read_csv('../../dictionary.tsv', sep = '\t')
GPepT_monomer.head()

Unnamed: 0,ID,SMILES,Pubchem,ChEMBL,Tautomers,Bond sites,Functional Groups,Weight (g/mol),ClogP,Topological Polar Surface Area (Å),...,Tanimoto Similarity with L-Glycine,Fraction sp3,Frequency,Number of bonds,MolLogP,Fraction Aromatic Rings,Fraction Rotatable Bonds,Fraction Hydrogen Bond Acceptors,Fraction Hydrogen Bond Donors,RULE OF FIVE
0,L,CC(C)C[C@@H](C(=O)O)N,Leucine,"['CHEMBL1232258', 'CHEMBL291962', 'CHEMBL46575']",,,"defaultdict(<class 'int'>, {'CN': 1, 'CC(=O)O'...",131.094629,0.4444,63.32,...,0.269231,0.833333,24257,8,0.4444,0.0,0.375,0.375,0.375,True
1,R,C(C[C@@H](C(=O)O)N)CNC(=N)N,Arginine,"['CHEMBL1485', 'CHEMBL212301']",,,"defaultdict(<class 'int'>, {'CC(=O)O': 1, 'CN'...",174.111676,-1.33843,125.22,...,0.2,0.666667,24052,11,-1.33843,0.0,0.454545,0.545455,0.636364,False
2,K,C(CCN)C[C@@H](C(=O)O)N,Lysine,"['CHEMBL319497', 'CHEMBL28328', 'CHEMBL8085']",,,"defaultdict(<class 'int'>, {'CN': 2, 'CC(=O)O'...",146.105528,-0.4727,89.34,...,0.275862,0.833333,20373,9,-0.4727,0.0,0.555556,0.444444,0.555556,True
3,G,C(C(=O)O)N,Glycine,"['CHEMBL3707233', 'CHEMBL773']",,,"defaultdict(<class 'int'>, {'CC(=O)O': 1, 'CN'...",75.032028,-0.9703,63.32,...,1.0,0.5,18729,4,-0.9703,0.0,0.25,0.75,0.75,True
4,A,C[C@@H](C(=O)O)N,Alanine,"['CHEMBL66693', 'CHEMBL279597', 'CHEMBL12198']",,,"defaultdict(<class 'int'>, {'CC(=O)O': 1, 'CN'...",89.047678,-0.5818,63.32,...,0.333333,0.666667,17523,5,-0.5818,0.0,0.2,0.6,0.6,True


In [7]:
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

def canonical_smiles(smiles: Chem.Mol) -> str:
    mol = Chem.MolFromSmiles(smiles) # copy

    # 1) 温和 sanitize（不 kekulize）
    Chem.SanitizeMol(
        mol,
        Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE
    )

    # 2) 互变异构 canonicalization —— 保留 sp3 手性（关键！）
    te = rdMolStandardize.TautomerEnumerator()
    te.SetRemoveSp3Stereo(False)   # <<< 这行是救命的
    mol = te.Canonicalize(mol)

    # 3) 重新分配立体化学（保险）
    Chem.AssignStereochemistry(
        mol,
        force=True,
        cleanIt=True
    )

    # 4) canonical SMILES（保留手性）
    return Chem.MolToSmiles(
        mol,
        canonical=True,
        isomericSmiles=True,
        kekuleSmiles=False
    )

In [8]:
smiles_map = GPepT_monomer.set_index('ID')['SMILES'].to_dict()
for i, row in data[data['Smiles'].isna()].iterrows():
    print(row['Sequences'])
    peptides_seq = row['Sequences'].split('-')
    smiles_list = [smiles_map.get(seq) for seq in peptides_seq]
    smiles_list = [canonical_smiles(smi) for smi in smiles_list]
    # print(smiles_list)
    data.loc[data['Sequences'] == row['Sequences'], 'Smiles'] = build_peptide_from_smiles(smiles_list)
    

R-G-D-X183-V
R-G-D-X71-V
R-G-D-X183-K
R-G-D-X718-K
R-G-D-X183-C
R-W-X501
R-W-X501-N-M
X166-R-G-D
R-W-X501-N-K
G-R-G-X674-T-P


  data.loc[data['Sequences'] == row['Sequences'], 'Smiles'] = build_peptide_from_smiles(smiles_list)


R-G-D-K-X113-L-P-E-T
X113-G-R-G-D-X1800-X523-X113
X113-G-R-G-D-X1800-X260-X113
X113-G-R-G-D-X1800-X77-X113
X113-G-X109-G-D-X1800-X523-X113
G-G-R-G-D-K-X113
A-D-X23-F
G-H-I
K-L-M-N-X45
X355-Q-R-X20-T
V-W-Y-X101
X202-C-D-E-F
G-H-I-K-X50
M-N-P-Q-R-S
T-V-W-Y-X113-C
X1007-E-F-G-H-I
K-L-M-N-P-X303
Q-R-S-T-V-X2005
X1527-A-C-D-E
F-G-H-I-K-X404
L-M-N-X3009-Q-R
X4187-T-V-W-Y-A
X505-C-D-E-F-G
X5178-I-K-L-M-N-X606
P-Q-R-S-T-V
W-Y-A-C-D-X707
X102-X505-X602-X910-X1176-X707
X5672-X473-X298-X2983-X707
X238-X589-X2133-X3467
X126-X23-X3984-X457-X209-X28
X437-X6587-X9132-X2
X12-A-K-E
R-X45-D
G-X8-P-W
X1000-S-T
H-X22-E-L
X732-D-X15
N-K-X909
S-X5-F-R
X3300-A-X71
D-E-X120
X4-G-X88-P
L-X600-K-S
M-X256-D-F-L
A-X1-G-X45-R
X900-E-L-P-K
S-T-X33-F-G
X5500-A-X6-D
P-H-X1024-S-E
K-X78-M-X901
X23-R-A-D-V
F-X400-S-T-Y
E-L-X10-P-X85
X678-X999-G-A
D-X301-W-X7-F
X12-A-K-E-X555
R-X45-D-F-L-P
G-X8-P-W-M-X2
X1000-S-T-H-E-R
H-X22-E-L-K-X450
X732-D-X15-A-S-G
N-K-X909-P-E-T
S-X5-F-R-X123-W
X3300-A-X71-D-K-L
D-E-X120-V-X88-G
X4

[15:35:46] Tautomer enumeration stopped at 201 tautomers: max transforms reached


R-G-D-X4279-K
R-G-D-X7313-K
R-W-X3252
F-X5791-G-D-S-Q-I-Q-T-R-R-S
F-R-X768-D-S-Q-I-Q-T-R-R-S


[15:35:47] Tautomer enumeration stopped at 610 tautomers: max transforms reached


F-R-G-X11183-S-Q-I-Q-T-R-R-S
R-W-X824-N-M
X2883-R-G-D
R-W-X824-N-K
G-R-G-X8734-T-P
R-G-D-K-X4954-L-P-E-T
X113-G-R-G-D-X1800-X8225-X113
X113-G-R-G-D-X1800-X648-X113
X113-G-R-G-D-X1800-X8537-X113
X113-G-X393-G-D-X1800-X523-X113
G-G-R-G-D-K-X4954


In [9]:
data.head()

Unnamed: 0,Peptides,Sequences,Smiles,Pubchem_CID,residue,IC50 (μmol/L)
0,Pep2,R-G-D-X183-V,CC(C)[C@H](NC(=O)[C@@H](Cc1ccccc1)NC(=O)[C@H](...,,,
1,Pep3,R-G-D-X71-V,CC(C)[C@H](NC(=O)[C@@H](Cc1ccccc1)N(C)C(=O)[C@...,176873.0,N-methyl-d-phenylalanine,
2,Pep4,R-G-D-X183-K,N=C(N)NCCC[C@H](N)C(=O)NCC(=O)N[C@@H](CC(=O)O)...,,,
3,Pep5,R-G-D-X718-K,N=C(N)NCCC[C@H](N)C(=O)NCC(=O)N[C@@H](CC(=O)O)...,,,
4,Pep6,R-G-D-X183-C,N=C(N)NCCC[C@H](N)C(=O)NCC(=O)N[C@@H](CC(=O)O)...,,,


In [10]:
data.to_csv('iGB3externel.csv', index=False)

In [None]:
from rdkit import Chem
mol = Chem.MolFromSequence('GHI')
smi = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
print(smi) #C[C@@H](O)[C@H](NC(=O)C[C@H](NC(=O)CNC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N1CCC[C@H]1C(=O)O)C(=O)O
#C[C@@H](O)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N1CCC[C@H]1C(=O)O
#C[C@@H](O)[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)N1CCC[C@H]1C(=O)O

CC[C@H](C)[C@H](NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)CN)C(=O)O


In [11]:
mol1 = 'Cc1nc(CCC(=O)NCCOc2ccc(OCC(=O)N[C@H](C(=O)O)C(C)(C)C)cc2)ccc1-c1cnc(NCc2c(F)ccc3c2CCO3)n2cnnc12'
#CC(C)[C@@H](C(=O)O)N
print(canonical_smiles(mol1))#N[C@@H](Cc1cnc[nH]1)C(=O)O


Cc1nc(CCC(=O)NCCOc2ccc(OCC(=O)N[C@H](C(=O)O)C(C)(C)C)cc2)ccc1-c1cnc(NCc2c(F)ccc3c2CCO3)n2cnnc12


[15:36:29] Tautomer enumeration stopped at 201 tautomers: max transforms reached


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)CN)C(=O)O')
fp = AllChem.GetMorganFingerprint(mol, radius=2, useChirality=True)
print(fp.GetNonzeroElements())

{29410177: 1, 74537039: 1, 512485612: 1, 584893129: 1, 772927515: 1, 817554650: 1, 847957139: 1, 847961216: 2, 864662311: 1, 864942730: 3, 868576692: 1, 899522707: 1, 1100037548: 1, 1362518133: 2, 1506563592: 1, 1510328189: 3, 1533864325: 1, 1552912706: 1, 1739265633: 1, 1840891614: 1, 2041434490: 1, 2085926208: 1, 2132511834: 1, 2245273601: 3, 2245384272: 3, 2246699815: 3, 2246728737: 2, 2261212172: 1, 2592785365: 1, 2648927651: 1, 2654043257: 1, 2697110228: 1, 2752034647: 1, 3054531336: 1, 3099695679: 1, 3135357859: 1, 3217380708: 1, 3218693969: 2, 3315826729: 1, 3362854265: 1, 3537119515: 1, 3542456614: 1, 3566760038: 1, 3824944396: 1, 3855312692: 1, 4078658161: 1, 4126130471: 1, 4222851645: 1}
