# Func

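> Helpers that turn a table of SMILES strings into RDKit molecular descriptors (`smi2prop`) and Morgan fingerprint bit columns (`smi2morgan`).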

In [None]:
#| default_exp func

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import seaborn as sns
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
import pandas as pd
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors

In [None]:
#| export
def smi2prop(df, # DataFrame with one compound per row; needs a SMILES column and an ID column
             smi_colname = "SMILES", # name of the column holding the SMILES strings
             id_colname = "ID", # name of the column holding the compound identifiers
             remove_duplicate=True, # drop descriptors that repeat another descriptor or never vary
            ):
    """Compute every descriptor listed in `Descriptors.descList` for each SMILES in `df`.

    Returns a DataFrame with one row per compound: the `id_colname` column
    followed by one numeric column per retained descriptor.

    Raises `ValueError` when an entry of `df[smi_colname]` is missing or cannot
    be parsed; the message lists the IDs of the offending rows.
    """
    # MolFromSmiles gives None for unparsable strings; non-string cells (e.g. NaN)
    # are treated as invalid too instead of being handed to RDKit.
    mols = [Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
            for smi in df[smi_colname]]
    bad_ids = [cid for cid, mol in zip(df[id_colname], mols) if mol is None]
    if bad_ids:
        raise ValueError(f"Could not parse SMILES for {len(bad_ids)} compound(s): {bad_ids}")

    desc_names = [desc_name[0] for desc_name in Descriptors.descList]
    desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)
    desc_values = [desc_calc.CalcDescriptors(mol) for mol in mols]

    # Work in descriptors x compounds orientation so the row-wise filters below act on descriptors.
    if desc_values:
        values = np.stack(desc_values).T
    else:
        # np.stack rejects an empty list; an empty input yields an empty table with all descriptor columns.
        values = np.empty((len(desc_names), 0))
    compound = pd.DataFrame(values, index=desc_names, columns=df[id_colname])

    if remove_duplicate and desc_values:
        # Rows are descriptors here: keep only the first of any set of descriptors
        # whose values are identical across all compounds.
        compound = compound.loc[~compound.duplicated()]
        # Drop descriptors whose value is the same for every compound.
        compound = compound.loc[compound.std(axis=1) != 0]

    # Back to one row per compound, with the ID promoted from index to a regular column.
    compound = compound.T.reset_index()
    return compound

In [None]:
# Load the example compound table; the demo calls below rely on its default `SMILES` and `ID` columns.
df = pd.read_csv('kras_smiles.csv')

In [None]:
# Quick look at the table size as (rows, columns).
df.shape

(62, 8)

In [None]:
# Descriptor table for the loaded compounds; only the first rows are displayed.
smi2prop(df).head()

Unnamed: 0,ID,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_halogen,fr_imidazole,fr_methoxy,fr_morpholine,fr_nitrile,fr_piperdine,fr_piperzine,fr_pyridine,fr_term_acetylene,fr_unbrch_alkane
0,G12D_1,16.725593,-0.912746,0.007777,0.319044,600.645,569.397,600.246059,226.0,0.318621,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,G12D_5A,16.408797,-0.546191,0.013725,0.369319,546.05,516.818,545.210614,200.0,0.31861,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,G12D_5B,16.254417,-0.498575,0.197532,0.426624,507.013,478.789,506.199715,186.0,0.31861,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,G12D_6,16.215868,-0.551045,0.054533,0.44106,488.567,459.335,488.233602,186.0,0.31861,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,G12D_7,16.296115,-0.54844,0.055691,0.424225,502.594,471.346,502.249252,192.0,0.31861,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
#| export
def smi2morgan(df, # DataFrame with one compound per row; needs a SMILES column and an ID column
               smi_colname = "SMILES", # name of the column holding the SMILES strings
               id_colname = "ID", # name of the column holding the compound identifiers
               radius = 2, # Morgan fingerprint radius passed to RDKit
               n_bits = 2048, # length of the bit vector, i.e. number of `morgan_*` columns
              ):
    """Compute a Morgan fingerprint bit vector for each SMILES in `df`.

    Returns a DataFrame with one row per compound: the `id_colname` column
    followed by `n_bits` columns named `morgan_0` ... `morgan_{n_bits-1}`.

    Raises `ValueError` when an entry of `df[smi_colname]` is missing or cannot
    be parsed; the message lists the IDs of the offending rows.
    """
    # MolFromSmiles gives None for unparsable strings; non-string cells (e.g. NaN)
    # are treated as invalid too instead of being handed to RDKit.
    mols = [Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
            for smi in df[smi_colname]]
    bad_ids = [cid for cid, mol in zip(df[id_colname], mols) if mol is None]
    if bad_ids:
        raise ValueError(f"Could not parse SMILES for {len(bad_ids)} compound(s): {bad_ids}")

    # NOTE(review): newer RDKit releases appear to steer users towards rdFingerprintGenerator
    # for Morgan fingerprints - confirm before upgrading; the call is kept as-is here.
    morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits) for mol in mols]

    if morgan_fps:
        bits = np.array(morgan_fps)
    else:
        # Keep the (0, n_bits) shape so an empty input still yields every morgan_* column.
        bits = np.empty((0, n_bits), dtype=int)
    fp_df = pd.DataFrame(bits, index=df[id_colname])
    fp_df.columns = [f'morgan_{i}' for i in fp_df.columns]

    # Promote the ID from index to a regular leading column.
    fp_df = fp_df.reset_index()
    return fp_df

In [None]:
# Morgan fingerprint table for the loaded compounds: the ID column plus one column per bit.
smi2morgan(df)

Unnamed: 0,ID,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,G12D_1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,G12D_5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,G12D_5B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,G12D_6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,G12D_7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
58,G12C_21,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
59,G12C_22,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
60,G12C_23,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [None]:
#| hide
# Run nbdev's export step (cells marked `#| export` target the module named by `#| default_exp`).
import nbdev; nbdev.nbdev_export()