# Feature engineering

> In this module, we develop tools to extract features from compounds

In [1]:
#| default_exp feature

In [12]:
#| hide
import sys
sys.path.append("/notebooks/tools")
from tools.dataset import *

In [9]:
#| hide
from nbdev.showdoc import *
%matplotlib inline

In [3]:
#| export
import seaborn as sns
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

In [4]:
#| export
from fastbook import *
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
from fairscale.nn.wrap import enable_wrap, wrap
import esm
from tqdm.notebook import tqdm; tqdm.pandas()
import gc

In [5]:
#| export
def smi2prop(df, # df needs to have SMILES an ID columns
             smi_colname = "SMILES", # column name of smiles
             id_colname = "ID", # column name of ID
             remove_duplicate=True, # remove features that are same across compounds
             normalize = False, # normalize features using StandardScaler()
            ):
    "Extract 208 features from smiles via rdkit.Chem.Descriptors, and remove duplicate features"
    
    mols = [Chem.MolFromSmiles(smi) for smi in df[smi_colname]]
    desc_names = [desc_name[0] for desc_name in Descriptors.descList]
    desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)
    desc_values = [desc_calc.CalcDescriptors(mol) for mol in mols]
    compound = pd.DataFrame(np.stack(desc_values).T, index=desc_names,columns=df[id_colname])
    if remove_duplicate:
        compound = compound.loc[~compound.duplicated()] # remove duplicates
        compound = compound.loc[compound.std(axis=1) != 0] # remove compound that has same value across features
    compound = compound.T.reset_index()
    if normalize:
        scaler = StandardScaler()
        transformed = scaler.fit_transform(compound.iloc[:,1:])
        compound.iloc[:,1:] = transformed
    return compound

In [10]:
show_doc(smi2prop)

---

[source](https://github.com/sky1ove/tools/blob/main/tools/feature.py#L18){target="_blank" style="float:right; font-size:smaller"}

### smi2prop

>      smi2prop (df, smi_colname='SMILES', id_colname='ID',
>                remove_duplicate=True, normalize=False)

Extract 208 features from smiles via rdkit.Chem.Descriptors, and remove duplicate features

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | df needs to have SMILES an ID columns |
| smi_colname | str | SMILES | column name of smiles |
| id_colname | str | ID | column name of ID |
| remove_duplicate | bool | True | remove features that are same across compounds |
| normalize | bool | False | normalize features using StandardScaler() |

In [13]:
df = get_g12d()

(722, 10) ['ID', 'SMILES', 'group', 'with_3F', 'racemic_trans', 'mixture_isomer', 'trans', 'Kd', 'IC50', 'erk_IC50']


In [14]:
df.shape

(722, 10)

In [15]:
smi2prop(df,normalize=True)

Unnamed: 0,ID,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO2,fr_C_O,fr_C_O_noCOO,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_aniline,fr_aryl_methyl,fr_bicyclic,fr_ester,fr_ether,fr_guanido,fr_halogen,fr_imidazole,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_para_hydroxylation,fr_piperdine,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_thiazole,fr_unbrch_alkane,fr_urea
0,US_1,-1.932028,0.932171,1.381110,0.495241,-1.183587,-1.219109,-1.182076,-1.046640,-0.203173,-1.070885,0.960805,-0.201299,-0.606818,-0.027786,0.238330,-0.929849,0.169612,-1.291745,0.519884,-1.519547,0.185658,-0.425142,0.608875,0.337035,-0.603426,-1.036679,-0.801448,-0.917005,-0.932184,-0.777188,-0.867430,-0.807099,-0.902348,-0.806015,-0.900582,-0.864011,-0.972370,-0.126939,-0.154995,-1.075533,-0.754664,-0.327962,-0.976340,0.813537,-1.109378,-1.722754,-0.237511,-0.150438,-0.258243,-0.690788,-1.677478,-0.252705,-0.275917,-0.455194,0.586341,-0.695775,0.424056,-0.818465,-0.302768,-0.193272,0.123156,-0.361079,-0.742411,-0.199686,-0.279138,0.655344,-0.36044,-1.709988,0.908283,-0.727266,0.045041,-0.772241,-1.454431,-0.791409,0.740918,-0.673271,0.606634,0.257682,-1.120582,-0.123426,-1.959448,-1.622800,-1.137956,0.581413,0.436907,-0.082673,0.600444,-0.316618,0.410895,-1.477411,-0.713491,-0.020944,0.966588,0.316115,0.706608,1.074237,0.035838,-0.307140,0.688654,-0.495195,-0.975006,0.504157,0.034921,-0.393,-0.446513,-0.613431,0.241193,0.123187,0.298974,0.164340,0.642889,-1.190554,-0.645101,-0.356435,-0.308046,-0.465984,-0.535647,-0.875193,-0.643206,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,1.432247,1.722508,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,-0.342359,-0.052705,-0.316228,-0.052705,-1.583939,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
1,US_2,0.370290,0.282627,-0.324387,0.318811,-0.840874,-0.837744,-0.838700,-0.740120,-0.200922,-1.070885,0.960805,-0.198749,0.203344,0.332533,0.322539,-0.646890,0.160857,-1.217596,0.513626,-0.927388,0.182332,-0.416530,0.608299,0.705937,-0.323952,-0.697765,-0.663958,-0.781950,-0.685520,-0.697979,-0.791224,-0.730912,-0.830814,-0.734399,-0.831915,-0.806103,-0.915471,-0.260409,-0.153546,-0.753847,-0.575009,-0.555061,-0.780717,0.813537,0.471341,0.245113,-0.237511,-0.150438,-0.258243,-0.690788,-0.487264,-0.252705,-0.275917,-0.455194,0.220578,-0.695775,-1.361355,-0.092199,-0.302768,-0.193272,0.123156,-0.361079,-0.742411,-0.199686,-0.300922,0.655344,-0.36044,-0.734650,0.908283,-0.727266,0.045041,-0.772241,-0.312574,-0.791409,0.228520,-0.673271,0.606634,0.257682,-0.402170,-0.123426,0.378935,-0.236918,0.014104,0.042263,-0.821400,0.594253,0.172109,0.004146,-0.934719,-0.446542,-0.713491,-0.224429,0.823311,-0.487655,-0.179828,0.219573,-0.139407,-0.513885,0.654352,-0.495195,-0.696653,0.504157,0.034921,-0.393,-0.446513,-0.613431,0.241193,0.123187,0.298974,0.164340,0.642889,-0.514784,-0.645101,-0.356435,-0.308046,-0.465984,-0.535647,-0.717804,-0.646230,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,1.432247,1.722508,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,-0.342359,-0.052705,-0.316228,-0.052705,-0.686995,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
2,US_3,0.366447,0.255857,-0.483658,-0.364802,-0.631645,-0.556569,-0.629139,-0.637947,-0.201909,-1.070885,0.960805,-0.199867,0.171757,0.365089,0.402430,-0.646890,0.160865,-1.249951,0.529808,-0.965000,0.185125,-0.416498,1.440770,0.497265,0.570095,-0.422385,-0.613488,-0.732374,-0.397038,-0.868573,-0.955352,-1.153696,-1.227779,-1.256683,-1.332690,-1.296645,-1.397467,-1.690448,-0.150834,-0.667184,-0.455833,-0.222323,-0.600051,0.745171,0.361010,0.245113,-0.237511,-0.150438,-0.258243,-0.690788,0.863861,-0.252705,-0.275917,-0.455194,-1.373230,1.166896,-0.309139,-0.092199,-0.302768,-0.193272,1.419923,3.005662,-1.502066,-1.611664,1.291933,0.655344,-0.36044,-0.734650,0.908283,-0.727266,-1.184302,1.233551,-0.312574,-1.264763,1.275358,-0.673271,0.606634,1.165821,-0.402170,-0.123426,0.378935,-0.236918,0.054692,-0.796676,-0.821400,1.271180,1.403276,-0.568357,-0.934719,-0.333341,-0.713491,0.206342,0.809274,-0.530972,0.571471,0.094069,0.531189,-1.115673,0.551571,-1.613182,-0.418300,0.504157,0.968725,-0.393,-1.466710,-1.655543,0.241193,1.867123,1.719101,1.273251,0.642889,0.160987,0.074780,-0.356435,-1.472494,-1.676202,-0.535647,-0.851319,-0.555500,-0.037242,-0.289541,-0.278433,-0.145659,2.584797,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,1.559865,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,1.85971,-0.342359,-0.052705,-0.316228,-0.052705,-0.686995,4.888410,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
3,US_4,0.363396,0.256350,-0.481359,-0.469590,-0.688872,-0.596636,-0.686606,-0.740120,-0.201930,-1.070885,0.960805,-0.199891,-1.428648,-0.872050,-0.356534,-0.646890,0.160863,-1.252099,0.529706,-0.965484,0.185171,-0.416458,1.460675,0.456686,0.451644,-0.485918,-0.747273,-0.863789,-0.345506,-0.849896,-0.937383,-1.259070,-1.326717,-1.398294,-1.468468,-1.374888,-1.474347,-1.804851,-0.150593,-0.687990,-0.213464,0.077968,-0.589911,-0.193009,-0.460241,0.245113,-0.237511,-0.150438,-0.258243,0.662722,-0.487264,-0.252705,-0.275917,0.048826,-0.641703,0.457309,-0.309139,-0.092199,-0.302768,-0.193272,0.146595,-0.361079,-1.502066,-1.611664,2.329913,0.655344,-0.36044,-0.734650,0.908283,-0.727266,-1.559026,0.183978,-0.312574,-1.273564,2.289132,-0.673271,0.606634,0.858748,-0.402170,-0.123426,0.378935,-0.236918,0.054692,-0.804747,-0.821400,1.271180,1.430059,-0.568357,-0.934719,-0.457100,-0.713491,0.204625,0.809023,-0.080902,-0.260102,1.566288,-0.117701,-1.112109,-0.633797,-2.085221,-0.418300,0.504157,0.034921,-0.393,-1.466710,-1.655543,0.241193,1.867123,1.719101,0.164340,0.642889,-0.514784,0.074780,-0.356435,-1.472494,-1.676202,-0.535647,-0.102844,-0.481404,-0.037242,-0.289541,-0.278433,-0.145659,1.284293,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,-0.342359,-0.052705,-0.316228,-0.052705,-0.686995,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,2.527167,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
4,US_5,0.358201,0.249087,-0.513840,-0.138580,-0.898862,-0.797677,-0.896648,-0.944467,-0.193474,-1.070885,0.960805,-0.190311,0.203344,0.332533,0.322539,-0.646890,0.160865,-1.248200,0.530251,-0.947588,0.185958,-0.416506,1.270048,0.953152,0.481607,-0.697765,-0.936893,-1.050051,-0.685520,-1.218476,-1.291990,-1.405083,-1.463814,-1.436607,-1.505204,-1.486580,-1.584093,-1.690448,-0.153546,-1.013359,-0.928842,-0.914902,-0.900445,-0.193009,0.471341,0.245113,-0.237511,-0.150438,-0.258243,0.580659,-0.487264,1.145586,-0.275917,-0.455194,-1.007467,-0.018884,-0.454536,-0.092199,-0.302768,-0.193272,1.483883,3.005662,-1.490724,-2.297985,1.269117,0.655344,-0.36044,-0.734650,0.908283,-0.727266,-1.707565,1.261260,-0.312574,-1.273564,1.264337,-0.673271,0.606634,1.165821,-0.402170,-0.123426,0.378935,0.294776,-1.340548,-0.804747,-0.425119,1.119934,1.078572,-0.558927,-0.934719,-0.349336,-0.713491,-0.786942,1.509353,-0.117188,-0.282391,0.555769,-0.340437,-1.171548,0.490955,-1.879370,-0.696653,0.504157,0.968725,-0.393,-1.466710,-1.655543,0.241193,1.867123,1.719101,1.273251,0.642889,0.160987,-0.645101,-0.356435,-1.472494,-1.676202,-0.535647,-0.899407,-0.905962,-0.037242,-0.289541,-0.278433,-0.145659,2.584797,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,1.559865,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,1.85971,-0.342359,-0.052705,-0.316228,-0.052705,-0.686995,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,paper_34,0.412000,0.282268,-0.641477,-0.021133,-0.306915,-0.295327,-0.304065,-0.229253,-0.200687,0.648214,-0.646127,-0.198483,-1.418645,-1.111746,-0.754989,-0.646747,0.071399,0.355821,0.407910,-0.109332,0.173694,-0.435484,-0.028260,-0.343267,0.130336,-0.291194,-0.218157,-0.344045,-0.091014,0.042768,-0.078558,0.119728,-0.032122,0.314796,0.174071,0.456276,0.324916,-0.012536,-0.142133,-0.445877,-0.585222,-0.835802,-0.259654,-1.242000,0.480888,0.245113,-0.237511,-0.150438,-0.258243,0.639891,0.702949,-0.252705,-0.275917,-0.455194,0.964525,-1.263573,-0.479222,-0.210659,-0.302768,-0.193272,0.123156,-0.361079,0.009690,-0.251928,0.207693,-0.197389,-0.36044,0.240688,-0.663397,-0.727266,-0.456470,-0.772241,0.829282,0.074835,0.228520,-0.673271,0.606634,-1.002376,-0.402170,-0.265067,0.378935,0.214280,-0.056111,0.113930,0.810034,0.594253,-0.666525,-0.575269,-0.934719,0.474419,-0.713491,-0.130687,-0.959483,-0.466569,-0.585922,-0.050176,0.677808,0.062092,-0.633797,-0.086468,-0.139948,-0.940293,-0.898884,-0.393,0.573684,0.428680,0.241193,0.123187,0.298974,-0.944571,-0.991498,-0.514784,-0.645101,-0.356435,0.856401,0.744233,0.880973,0.377130,-0.256438,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,-0.793597,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,1.038552,-0.052705,-0.316228,-0.052705,0.209950,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
718,paper_35,0.415567,-0.134391,-0.846065,-0.202761,0.035799,0.086038,0.039311,0.077267,-0.200687,0.648214,-0.646127,-0.198483,-1.789715,-1.633947,-1.374656,-0.644994,0.071395,0.358620,0.407856,-0.034801,0.173687,-0.427111,-0.028260,-0.220271,0.399656,0.047720,-0.080667,-0.208990,0.155650,0.121977,-0.002352,0.194054,0.037664,0.394111,0.250119,0.511893,0.379564,-0.146006,-0.133985,-0.126361,-0.401449,-0.627865,-0.064031,-1.242000,-0.339351,4.180846,-0.237511,-0.150438,-0.258243,0.639891,1.893163,-0.252705,-0.275917,-0.455194,0.274006,-0.757388,-0.479222,0.515607,-0.302768,-0.193272,0.123156,-0.361079,0.009690,-0.251928,0.185909,-0.197389,-0.36044,1.216026,-0.663397,-0.727266,-0.456470,-0.772241,1.971139,0.074835,-0.283877,-0.673271,0.606634,-1.002376,1.034654,-0.265067,2.717317,0.627318,-0.651945,0.950382,-0.425119,1.256927,-1.505159,-0.575269,-0.934719,1.425648,-0.713491,-0.205823,-1.145568,-0.846163,-2.373451,-0.753307,0.566744,-0.040324,-0.633797,-0.086468,0.138405,-0.940293,-0.898884,-0.393,0.573684,0.428680,0.241193,0.123187,0.298974,-0.944571,-0.991498,0.160987,-0.645101,-0.356435,0.856401,0.744233,0.880973,0.534519,-0.259462,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,-0.793597,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,1.038552,-0.052705,-0.316228,-0.052705,1.106894,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242
719,paper_36,0.482881,0.233261,-0.874320,-0.405659,0.150709,0.186889,0.154009,0.179440,-0.200687,0.648214,-0.646127,-0.198483,-0.285501,-0.408123,-0.555310,-0.646744,0.071216,0.358538,0.407465,-0.009968,0.173688,-0.366278,-0.028260,-0.124154,0.852412,0.323101,0.239206,0.105217,0.466059,0.370103,0.236368,0.356388,0.190084,0.572115,0.420792,0.697584,0.562020,-0.851492,-0.117859,0.090696,-0.114644,-0.662905,0.291172,-1.242000,0.480888,0.245113,-0.237511,-0.150438,-0.258243,0.639891,0.702949,-0.252705,2.451447,0.036701,0.274006,-0.757388,0.406816,-0.210659,-0.302768,-0.193272,0.123156,-0.361079,0.009690,-0.251928,0.163721,1.633368,-0.36044,0.240688,-0.663397,-0.727266,-0.456470,-0.772241,3.252231,0.450123,-0.283877,-0.673271,0.606634,-1.002376,0.316242,-0.265067,0.378935,0.193858,0.520743,0.575119,-0.425119,1.256927,-1.085842,-0.088529,-0.067605,0.520974,-0.713491,-0.134837,-1.010810,-0.549036,1.514278,-0.507364,2.614234,0.016357,-0.633797,-0.382266,0.416758,-0.940293,-0.898884,-0.393,0.573684,0.428680,0.241193,0.123187,0.298974,-0.944571,-0.991498,-0.514784,-0.645101,-0.356435,0.856401,0.744233,0.880973,0.355971,0.314944,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,-0.793597,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,1.038552,-0.052705,-0.316228,-0.052705,0.209950,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,4.888410,-0.091542,-0.16413,-0.037242
720,paper_37,0.462809,0.246761,-0.836028,-0.473964,0.311341,0.356153,0.306039,0.077267,-0.200687,-1.070884,0.960804,-0.198483,0.493790,0.131239,0.069227,1.100147,0.071395,0.357978,0.407492,0.278894,0.173698,0.425537,-0.028260,-0.173040,0.413790,0.047720,-0.048995,0.161731,0.145936,0.142300,0.275220,0.246405,0.389054,0.372909,0.493315,0.558158,0.704271,0.292539,-0.135418,-0.047174,-0.297154,-0.392865,0.253736,-0.193009,0.471341,0.245113,-0.237511,-0.150438,-0.258243,0.639891,-0.487264,-0.252705,-0.275917,-0.499378,1.005533,-0.285373,-0.479222,-0.092199,1.032990,-0.193272,0.123156,-0.361079,0.009690,-0.251928,-0.392178,0.655344,-0.36044,-0.734650,0.908283,1.230756,-0.037486,-0.772241,-0.312574,0.074835,-0.283877,1.289903,0.606634,0.257682,-0.402170,-0.123426,0.378935,0.208831,1.058656,-0.219260,0.413753,1.256927,-1.085842,-0.575269,0.631338,-0.416608,1.313203,-0.129163,0.867298,-0.577751,-0.274387,-0.452078,0.658876,0.043170,-0.633797,-0.086468,0.138405,0.504157,0.034921,-0.393,0.573684,0.428680,0.241193,0.123187,0.298974,0.164340,0.642889,0.160987,-0.645101,-0.356435,0.856401,0.744233,0.880973,0.625942,0.227224,-0.037242,-0.289541,-0.278433,-0.145659,-0.016211,-0.164399,1.260085,-0.052705,-0.321026,-0.31841,-0.087167,-0.037242,-0.098784,-0.261324,-0.215353,-0.051678,-0.640062,-0.580549,-0.091542,-0.14062,-0.508104,-0.070507,-0.309352,-0.493979,-0.51257,1.038552,-0.052705,-0.316228,-0.052705,0.209950,-0.204565,-0.037242,-0.258067,-0.145659,-0.185429,-0.211808,-0.183176,0.327661,-0.052705,0.357590,-0.037242,-0.083507,-0.037242,-0.052705,-0.204565,-0.091542,-0.16413,-0.037242


In [None]:
smi2prop(df,normalize=False)

Unnamed: 0,ID,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_halogen,fr_imidazole,fr_methoxy,fr_morpholine,fr_nitrile,fr_piperdine,fr_piperzine,fr_pyridine,fr_term_acetylene,fr_unbrch_alkane
0,G12D_1,16.725593,-0.912746,0.007777,0.319044,600.645,569.397,600.246059,226.0,0.318621,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,G12D_5A,16.408797,-0.546191,0.013725,0.369319,546.050,516.818,545.210614,200.0,0.318610,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,G12D_5B,16.254417,-0.498575,0.197532,0.426624,507.013,478.789,506.199715,186.0,0.318610,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,G12D_6,16.215868,-0.551045,0.054533,0.441060,488.567,459.335,488.233602,186.0,0.318610,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,G12D_7,16.296115,-0.548440,0.055691,0.424225,502.594,471.346,502.249252,192.0,0.318610,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,13.794964,-1.016664,0.079151,0.358213,604.130,568.850,603.252479,224.0,0.318223,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
58,G12C_21,12.978753,-0.325754,0.071299,0.269545,616.166,577.862,615.272466,230.0,0.318223,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
59,G12C_22,12.777688,-0.705241,0.172589,0.334036,618.157,580.861,617.268129,230.0,0.318223,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
60,G12C_23,13.004729,-0.260666,0.105776,0.318010,630.193,589.873,629.288116,236.0,0.318223,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [7]:
#| export
def smi2morgan(df, # a dataframe contains ID and SMILES columns
               smi_colname = "SMILES", # set smiles columne name
               id_colname = "ID", # set ID column name
              ):
    "Like `smi2prop`, get 2048 morgan feature (0/1) given a dataframe that contains ID&smiles"
    mols = [Chem.MolFromSmiles(smi) for smi in df[smi_colname]]
    morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
    fp_df = pd.DataFrame(np.array(morgan_fps), index=df[id_colname])
    colnames = [f'morgan_{i}' for i in fp_df.columns]
    fp_df.columns = colnames
    fp_df = fp_df.reset_index()
    return fp_df

In [11]:
show_doc(smi2morgan)

---

[source](https://github.com/sky1ove/tools/blob/main/tools/feature.py#L42){target="_blank" style="float:right; font-size:smaller"}

### smi2morgan

>      smi2morgan (df, smi_colname='SMILES', id_colname='ID')

Like `smi2prop`, get 2048 morgan feature (0/1) given a dataframe that contains ID&smiles

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | a dataframe contains ID and SMILES columns |
| smi_colname | str | SMILES | set smiles columne name |
| id_colname | str | ID | set ID column name |

In [None]:
smi2morgan(df)

Unnamed: 0,ID,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,G12D_1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,G12D_5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,G12D_5B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,G12D_6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,G12D_7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
58,G12C_21,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
59,G12C_22,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
60,G12C_23,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()