# Feature Engineering with SwissADME's feature list
On the original full dataset

### Imports

In [20]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import FilterCatalog
from rdkit.Chem import rdqueries
from rdkit.Chem import rdchem 
from rdkit.Chem import AllChem

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [23]:
# Import data and remove unnecessary header rows
cyto_assay = pd.read_csv('../data/train_data/cyto_assay_clean.csv')
smiles_merged = pd.read_pickle('../data/conversion_data/smiles_merged.pkl')
smiles_list = pd.read_csv('../data/conversion_data/smiles_only.csv')

In [24]:
mol_list = [Chem.MolFromSmiles(data[0]) for idx, data in smiles_list.iterrows()]
type(mol_list)

list

### Swiss Features (partial)

In [25]:
# adapted from src/cyp_chembl_et.ipynb in this project
class Features(object):
    
    def __init__(self, mol):
        
        if isinstance(mol, str):
            self.mol = Chem.MolFromSmiles(mol)
        else:
            self.mol = mol
            
    def h_bond_donors(self):
        return Chem.Lipinski.NumHDonors(self.mol)

    def h_bond_acceptors(self):
        return Chem.Lipinski.NumHAcceptors(self.mol)

    def molar_refractivity(self):
        return Chem.Crippen.MolMR(self.mol)

    def molecular_weight(self):
        return Descriptors.ExactMolWt(self.mol)

    def n_atoms(self):
        return self.mol.GetNumAtoms()

    def n_carbons(self):
        carbon = Chem.rdqueries.AtomNumEqualsQueryAtom(6)
        return len(self.mol.GetAtomsMatchingQuery(carbon))

    def n_heteroatoms(self):
        return Descriptors.rdMolDescriptors.CalcNumHeteroatoms(self.mol)

    def n_rings(self):
        return Descriptors.rdMolDescriptors.CalcNumRings(self.mol)

    def n_rot_bonds(self):
        return Chem.Lipinski.NumRotatableBonds(self.mol)

    def logp(self):
        return Descriptors.MolLogP(self.mol)

    def tpsa(self):
        return Descriptors.TPSA(self.mol)
    
    def bond_type(bond):
        return Chem.rdchem.Bond.GetBondType(bond)
    
    def is_conjugated(bond):
        return Chem.rdchem.Bond.GetIsConjugated(bond)
        
    def in_ring(bond):
        return Chem.rdchem.Atom.IsInRing(bond)
        
    def stereo(bond):
        return Chem.rdchem.Bond.GetStereo(bond)
    
    def get_atoms(self):
        return Chem.rdchem.Mol.GetAtoms(self.mol)
    
    def get_bonds(self, atom):
        return Chem.rdchem.Atom.GetBonds(atom)
    
    def conformer(self):
        return Chem.rdchem.Conformer()
    
    def n_bonds(self):
        d = defaultdict(int)
        atoms = self.get_atoms()
        for atom in atoms:
            bonds = self.get_bonds(atom)
            for bond in bonds:
                d[Chem.rdchem.Bond.GetBondType(bond)] += 1
        return d
    
    def n_heavy_atoms(self):
        return Chem.rdchem.Mol.GetNumHeavyAtoms(self.mol)
    
    def n_aromatic_atom(self):
        count = 0
        aromatic = Chem.rdchem.Mol.GetAromaticAtoms(self.mol)
        for atom in aromatic:
            count += 1 
        return count
    
#     def w_sum_carb_hal(mol):
#         q = rdqueries.AtomNumEqualsQueryAtom(6)
#         return len(mol.GetAtomsMatchingQuery(q))
    
    

In [26]:
# Function to create features

def swiss_feat(mol_list):

    array = ['h_bond_donors', 'h_bond_acceptors', 'molar_refractivity', 'molecular_weight', 'n_atoms','n_carbons',
                 'n_heteroatoms', 'n_rings', 'n_rot_bonds', 'logp', 'tpsa', 'n_heavy_atoms', 'n_aromatic_atom']
        
    extra_col = ['single_bond', 'double_bond', 'triple_bond', 'aromatic_bond']

    feature_df = pd.DataFrame([], columns=array+extra_col)

    for i in mol_list:
        mol = Features(i)

        feature_arr = []
        for i in array:
            val = getattr(mol, i)()
            feature_arr.append(val)
        num_bonds = mol.n_bonds()
        feature_arr.append(num_bonds[Chem.rdchem.BondType.SINGLE])
        feature_arr.append(num_bonds[Chem.rdchem.BondType.DOUBLE])
        feature_arr.append(num_bonds[Chem.rdchem.BondType.TRIPLE])
        feature_arr.append(num_bonds[Chem.rdchem.BondType.AROMATIC])
        feature_df = feature_df.append(dict(zip(feature_df.columns, feature_arr)), ignore_index=True)
    return feature_df

In [32]:
feature_df = swiss_feat(mol_list)

In [36]:
feature_df.reset_index(inplace=True)

In [37]:
panel_name = pd.DataFrame(cyto_assay[['index', 'Inhibition Observed', 'Panel Name']])
panel_name.reset_index(inplace=True)

In [38]:
feature_df_merged = feature_df.merge(panel_name, how="inner", on="index")
feature_df_merged.to_pickle('../data/feature_df_merged.pkl')

In [40]:
inhibition_obs = cyto_assay[["index", "Inhibition Observed", "Panel Name"]]
swiss_feat_df = feature_df.merge(inhibition_obs, how="inner", on="index")

swiss_feat_df.to_pickle('../data/swiss_feat_test.pkl')

In [42]:
swiss_feat_df.head(10)

Unnamed: 0,index,h_bond_donors,h_bond_acceptors,molar_refractivity,molecular_weight,n_atoms,n_carbons,n_heteroatoms,n_rings,n_rot_bonds,logp,tpsa,n_heavy_atoms,n_aromatic_atom,single_bond,double_bond,triple_bond,aromatic_bond,Inhibition Observed,Panel Name
0,0,1.0,4.0,70.4422,306.995639,18.0,12.0,6.0,2.0,3.0,2.50942,67.49,18.0,11.0,12.0,4.0,0.0,22.0,1,0
1,1,1.0,4.0,85.0662,290.116761,22.0,17.0,5.0,3.0,4.0,2.6362,59.28,22.0,17.0,10.0,4.0,0.0,34.0,1,2
2,2,1.0,3.0,85.8431,298.131742,22.0,17.0,5.0,2.0,6.0,3.86462,72.24,22.0,12.0,18.0,4.0,0.0,24.0,1,4
3,3,1.0,1.0,59.1017,209.121592,15.0,12.0,3.0,1.0,4.0,3.2004,29.1,15.0,6.0,16.0,2.0,0.0,12.0,1,3
4,4,1.0,5.0,72.2607,269.080041,20.0,14.0,6.0,3.0,3.0,1.8572,73.34,20.0,12.0,16.0,4.0,0.0,24.0,1,1
5,5,1.0,4.0,70.5419,253.11365,17.0,13.0,4.0,2.0,3.0,3.1644,52.32,17.0,5.0,24.0,2.0,0.0,10.0,1,0
6,6,0.0,4.0,66.0385,246.089209,18.0,14.0,4.0,2.0,6.0,3.0698,48.67,18.0,11.0,14.0,2.0,0.0,22.0,1,2
7,7,0.0,5.0,74.639,275.072848,19.0,13.0,6.0,3.0,1.0,1.3841,54.79,19.0,6.0,24.0,6.0,0.0,12.0,1,4
8,8,1.0,6.0,74.6722,291.067762,20.0,13.0,7.0,3.0,3.0,2.124,73.34,20.0,11.0,20.0,2.0,0.0,22.0,1,3
9,9,1.0,4.0,78.6186,304.085935,22.0,15.0,7.0,2.0,5.0,3.3849,81.47,22.0,12.0,18.0,4.0,0.0,24.0,1,1
