In [2]:
from sklearn.model_selection import train_test_split
from rdkit import Chem
from tqdm import tqdm
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from rdkit.Chem import Draw, PandasTools


In [3]:
# Reaction-type names; these are the same keys used by the
# reaction_type2label lookup tables.
unique_reaction_type = [
    'Hydroxylation', 'Epoxidation', 'Cleavage',
    'N-Oxidation', 'S-Oxidation', 'Oxidation',
    'Rearrangement', 'Reduction', 'Dehydrogenation',
]

# Load the training and test sets from their SDF files into DataFrames.
df = PandasTools.LoadSDF(
    '/home/pjh/workspace/SOM/data/EBoMD_train_240322_v2.sdf'
)
test_df = PandasTools.LoadSDF(
    '/home/pjh/workspace/SOM/data/EBoMD_test_240322_v2.sdf'
)

In [4]:
# Per-CYP-isoform annotation columns, in label order.
cyp_col = [
    'BOM_1A2',
    'BOM_2A6',
    'BOM_2B6',
    'BOM_2C8',
    'BOM_2C9',
    'BOM_2C19',
    'BOM_2D6',
    'BOM_2E1',
    'BOM_3A4',
]

# Column name -> integer label (its position in cyp_col, starting at 0).
cyp2label = {column: position for position, column in enumerate(cyp_col)}


In [5]:
def CYP_REACTION(x):
    """Merge the per-CYP reaction annotations of one row into a single string.

    Parameters
    ----------
    x : row-like object that can be indexed with the module-level list
        ``cyp_col`` and whose selection has ``.tolist()`` (a pandas Series
        when called through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The non-empty annotation strings joined with ``'\\n'``; an empty
        string when the row has none.
    """
    # Keep only non-empty strings. A plain truthiness test would let
    # non-string values through (e.g. a float NaN, which is truthy), and
    # str.join would then raise TypeError.
    return '\n'.join(
        value for value in x[cyp_col].tolist()
        if isinstance(value, str) and value
    )

In [6]:
# Labelling scheme handed to mol2bond_label further down, and the name of the
# column that holds the merged reaction annotations.
class_type = 3
cyp = 'CYP_REACTION'

# Load fresh copies so this cell gives the same result however often it is run.
df = PandasTools.LoadSDF('/home/pjh/workspace/SOM/data/EBoMD_train_240322_v2.sdf')
test_df = PandasTools.LoadSDF('/home/pjh/workspace/SOM/data/EBoMD_test_240322_v2.sdf')

# One newline-separated annotation string per molecule.
df['CYP_REACTION'] = df.apply(CYP_REACTION, axis=1)
test_df['CYP_REACTION'] = test_df.apply(CYP_REACTION, axis=1)

# Fixed seed: without random_state the train/validation split changes on
# every execution, which makes downstream results irreproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, valid_df = train_df.reset_index(drop=True), valid_df.reset_index(drop=True)


In [7]:
def reaction_type2label(x, class_type=1):
    """Map a reaction-type name to an integer class label.

    Parameters
    ----------
    x : str
        Reaction-type name, e.g. ``'Hydroxylation'``.
    class_type : int, default 1
        Labelling scheme:

        * 1 -- 0: non-reaction or others, 1: Cleavage, 2: Hydroxylation,
          3: Oxidation
        * 2 -- 0: non-reaction or others, 1: Cleavage,
          2: Hydroxylation or Oxidation
        * 3 -- binary scheme; always returns 1, whatever ``x`` is

    Returns
    -------
    int
        The class label of ``x`` under the selected scheme.

    Raises
    ------
    KeyError
        If ``x`` is not a known reaction type (schemes 1 and 2 only).
    ValueError
        If ``class_type`` is not 1, 2 or 3.
    """
    if class_type == 3:
        # Binary scheme: the reaction type itself is irrelevant.
        return 1

    if class_type == 1:
        reaction_type2label_dict = {
            'Hydroxylation': 2,
            'Epoxidation': 0,
            'Cleavage': 1,
            'N-Oxidation': 0,
            'S-Oxidation': 0,
            'Oxidation': 3,
            'Rearrangement': 0,
            'Reduction': 0,
            'Dehydrogenation': 0,
        }
    elif class_type == 2:
        reaction_type2label_dict = {
            'Hydroxylation': 2,
            'Epoxidation': 0,
            'Cleavage': 1,
            'N-Oxidation': 0,
            'S-Oxidation': 0,
            'Oxidation': 2,
            'Rearrangement': 0,
            'Reduction': 0,
            'Dehydrogenation': 0,
        }
    else:
        # Previously this case fell through to the lookup below and failed
        # with an UnboundLocalError that hid the real cause.
        raise ValueError(
            'Unsupported class_type: {!r} (expected 1, 2 or 3)'.format(class_type)
        )

    return reaction_type2label_dict[x]


def check_reaction(bond_atom_sidx, bond_atom_eidx, bond_atom_s_symbol, bond_atom_e_symbol, reaction, class_type = 1):
    """Return the class label of ``reaction`` if it involves the given bond, else 0.

    ``reaction`` is an annotation string whose first and last characters are
    stripped and whose remainder has three ``;``-separated fields: the atom
    pair, the reaction-type name, and a third field that is ignored here. The
    atom pair is ``"<s>,<e>"`` where ``<s>`` is an integer atom number and
    ``<e>`` is either another integer atom number or one of the symbols
    ``'S'``, ``'N'`` or ``'H'``.

    Parameters
    ----------
    bond_atom_sidx, bond_atom_eidx : int
        Begin/end atom indices of the bond; they start from 0 and are shifted
        by one before being compared with the annotation's atom numbers.
    bond_atom_s_symbol, bond_atom_e_symbol : str
        Element symbols of the begin/end atoms.
    reaction : str
        Annotation string; a falsy value means "no reaction".
    class_type : int, default 1
        Labelling scheme forwarded to ``reaction_type2label``.

    Returns
    -------
    int
        ``reaction_type2label(reaction_type, class_type)`` on a match, 0 otherwise.
    """
    # Bond indices start from 0, so shift them to line up with the annotation.
    begin_no = bond_atom_sidx + 1
    end_no = bond_atom_eidx + 1

    if not reaction:
        return 0

    atom_field, reaction_type, _ = reaction[1:-1].split(';')
    first_token, second_token = atom_field.split(',')
    first_no = int(first_token)

    if second_token in ('S', 'N'):
        # 'S' / 'N' is a placeholder: the reaction happens on atom number
        # first_no, which must itself be an S (resp. N) atom of this bond.
        matched = (
            (begin_no == first_no and bond_atom_s_symbol == second_token)
            or (end_no == first_no and bond_atom_e_symbol == second_token)
        )
    elif second_token == 'H':
        # Reaction between atom number first_no and a hydrogen: the bond must
        # join that atom to an H atom.
        matched = (
            (first_no == begin_no and bond_atom_e_symbol == second_token)
            or (first_no == end_no and bond_atom_s_symbol == second_token)
        )
    else:
        # Both tokens are atom numbers: the bond must join exactly those two
        # atoms, in either direction.
        second_no = int(second_token)
        matched = (begin_no, end_no) in (
            (first_no, second_no),
            (second_no, first_no),
        )

    return reaction_type2label(reaction_type, class_type) if matched else 0

def mol2bond_label(bonds, bonds_idx, reactions, class_type, return_reaction_type=False):
    """Label every bond of a molecule with the class of the first reaction that hits it.

    Parameters
    ----------
    bonds : sequence of (begin_symbol, end_symbol)
        Element symbols of each bond's two atoms.
    bonds_idx : sequence of (begin_idx, end_idx)
        Atom indices of each bond, aligned position by position with ``bonds``.
    reactions : iterable of str
        Reaction annotation strings; falsy entries are skipped.
    class_type : int
        Labelling scheme forwarded to ``check_reaction``.
    return_reaction_type : bool, default False
        When true, also return the reaction-type name recorded for each bond.

    Returns
    -------
    list, or (list, list)
        Per-bond labels (0 where no reaction matched). With
        ``return_reaction_type`` a second list is returned holding the
        reaction-type name for labelled bonds and 0 for the others.
    """
    n_bonds = len(bonds)
    labels = [0 for _ in range(n_bonds)]
    reaction_names = [0 for _ in range(n_bonds)]

    for reaction in filter(None, reactions):
        # Only the middle field (the reaction-type name) is needed here.
        _, reaction_type, _ = reaction[1:-1].split(';')

        for n in range(n_bonds):
            begin_idx = bonds_idx[n][0]
            end_idx = bonds_idx[n][1]
            begin_symbol = bonds[n][0]
            end_symbol = bonds[n][1]

            hit = check_reaction(begin_idx, end_idx, begin_symbol, end_symbol, reaction, class_type)
            # The first reaction that labels a bond wins; later ones are ignored.
            if hit and not labels[n]:
                labels[n] = hit
                reaction_names[n] = reaction_type

    if not return_reaction_type:
        return labels
    return labels, reaction_names

In [8]:
for idx in range(df.shape[0]):
    # Annotation strings of this molecule, one per line of the `cyp` column.
    reactions = df.loc[idx, cyp].split('\n')

    mol = df.loc[idx, 'ROMol']
    smile = Chem.MolToSmiles(mol)

    # Add explicit hydrogens once and reuse the result. The original code
    # built mol_h, never used it, and then called AddHs three more times.
    # check_reaction matches bonds to 'H', so bonds come from the
    # hydrogen-added molecule.
    mol_h = AllChem.AddHs(mol, addCoords=True)

    atoms = [atom.GetSymbol() for atom in mol_h.GetAtoms()]
    bonds_idx = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in mol_h.GetBonds()]
    bonds = [(bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol()) for bond in mol_h.GetBonds()]

    bond_label = mol2bond_label(bonds, bonds_idx, reactions, class_type)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0