In [10]:
# Scratch check of str.count on a toy sequence: it counts non-overlapping
# occurrences, so "AAA" holds three "A" but only one "AA".
proteinsequence = "AAABCDEFGHIBKLMNPQRSTVWY"
print(*(proteinsequence.count(motif) for motif in ("A", "AA")), sep="\n")

3
1


In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# Folder that holds the project's data files.
dir_DATA = "/mnt/w/Projects/Kinome/data"

# Read the ligand sheet created in dataset_operation.ipynb.
# The first CSV row is the header and the first CSV column becomes the index.
lig_kinetics = pd.read_csv(
    f"{dir_DATA}/ligands_pubchem.csv",
    header=0,
    index_col=0,
)
lig_kinetics

Unnamed: 0,ligand_name,CAS,ligand,commercial_name,SMILES
0,3-Methyladenine,5142-23-4,3-Methyladenine,3-Methyladenine,CN1C=NC(=N)C2=C1N=CN2
1,A66,1166227-08-2,A66,A66,CC1=C(SC(=N1)NC(=O)N2CCC[C@H]2C(=O)N)C3=CSC(=N...
2,A-674563,552325-73-2,A-674563,A-674563,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OC[C@H](CC4=C...
3,A-769662,844499-71-4,A-769662,A-769662,C1=CC=C(C(=C1)C2=CC=C(C=C2)C3=CSC4=C3C(=C(C(=O...
4,AEE788 (NVP-AEE788),497839-62-0,AEE788,NVP-AEE788,CCN1CCN(CC1)CC2=CC=C(C=C2)C3=CC4=C(N3)N=CN=C4N...
...,...,...,...,...,...
265,Y-27632 2HCl,129830-38-2,Y-27632 2HCl,Y-27632 2HCl,C[C@H](C1CCC(CC1)C(=O)NC2=CC=NC=C2)N.Cl.Cl
266,YM201636,371942-69-7,YM201636,YM201636,C1COCCN1C2=NC(=NC3=C2OC4=C3C=CC=N4)C5=CC(=CC=C...
267,ZM 336372,208260-29-1,ZM 336372,ZM 336372,CC1=C(C=C(C=C1)NC(=O)C2=CC(=CC=C2)N(C)C)NC(=O)...
268,ZM-447439,331771-20-1,ZM-447439,ZM-447439,COC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=C(C=C3)NC(=O)...


In [3]:
def cansmi_getmol(row, smi_col='smiles'):
    """Parse one record's SMILES into a canonical SMILES string and a molecule.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Record that holds the SMILES string under ``smi_col``.
    smi_col : str, default 'smiles'
        Key of the SMILES entry in ``row``.

    Returns
    -------
    tuple
        ``(canonical_smiles, mol)`` when parsing succeeds, otherwise
        ``(None, None)`` (entry is not a string, or RDKit cannot parse it).

    Raises
    ------
    Whatever ``row[smi_col]`` raises when the key is absent (``KeyError`` for
    a dict or Series); the lookup is deliberately not guarded.
    """
    ori_smi = row[smi_col]

    # Missing values (None / NaN) and other non-string entries cannot be
    # parsed; report them as a parse failure without calling RDKit.
    if not isinstance(ori_smi, str):
        return (None, None)

    # MolFromSmiles signals an unparsable string by returning None, so test
    # for that explicitly instead of relying on a catch-all `except`, which
    # would also hide unrelated problems such as a missing `Chem` import.
    mol = Chem.MolFromSmiles(ori_smi)
    if mol is None:
        return (None, None)

    return (Chem.MolToSmiles(mol), mol)
    
def remove_parsefail(df, molcol="mol_obj"):
    """Split ``df`` by whether the ``molcol`` entry is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a column named ``molcol``.
    molcol : str, default "mol_obj"
        Column whose missing values (as judged by ``isna``) mark failed rows.

    Returns
    -------
    tuple of DataFrame
        ``(rows with a value in molcol, rows where molcol is missing)``.
    """
    is_missing = df[molcol].isna()
    failed_rows = df[is_missing]
    parsed_rows = df[~is_missing]
    return parsed_rows, failed_rows
    
# Row by row, turn the 'SMILES' column into a canonical SMILES ('smi') and a
# parsed molecule ('mol_obj'), then move rows without a molecule into
# lig_kinetics_parsefail.
lig_kinetics[['smi', 'mol_obj']] = lig_kinetics.apply(
    cansmi_getmol,
    axis='columns',
    result_type='expand',
    args=('SMILES',),
)
lig_kinetics, lig_kinetics_parsefail = remove_parsefail(lig_kinetics)

In [None]:
# Mordred descriptor table: one row per ligand in lig_df
from mordred import Calculator, descriptors

lig_df = lig_kinetics  # plain alias, no copy is made

# ignore_3D=True is passed because the molecules come from SMILES strings
# (presumably no conformers are available; confirm if 3D descriptors are wanted)
mordred_calc = Calculator(descriptors, ignore_3D=True)

feat_df = mordred_calc.pandas(lig_df["mol_obj"].tolist())
feat_df.index = lig_df.index  # give descriptor rows the same labels as the ligand table

# NOTE(review): the extra copy is presumably there to get a fresh,
# consolidated frame; confirm it is still needed.
feat_df = feat_df.copy()

In [5]:
# calculate fingerprints
# Builds fp_dict: for every generator name, a 2-D array with one fingerprint
# row per molecule in lig_df; for the Morgan variants an extra
# "<name>_BitInfo" entry holds the per-molecule bit-info maps.
import numpy as np  # np.vstack is used below; import here so the cell runs on a fresh kernel
from rdkit.Chem import rdFingerprintGenerator

mols = lig_df.mol_obj.to_list()

# NOTE(review): "Morgan_count" uses radius=1 while "Morgan" uses radius=2;
# confirm that the different radius is intentional.
fp_calculators = [("Morgan", rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)),
                  ("Morgan_count", rdFingerprintGenerator.GetMorganGenerator(radius=1, fpSize=2048)),
                  ("AtomPair", rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)),
                  ("TopologicalTorsion", rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048))]

fp_dict = {}
for option, fpgen in fp_calculators:
    if "Morgan" in option:
        # For Morgan fingerprints: record the substructures (bit info) along
        # with the fingerprints.
        fp_list = []
        fps_bitinfo = []
        # NOTE(review): a single AdditionalOutput is shared by all molecules;
        # this assumes every fingerprint call refreshes the map and that
        # GetBitInfoMap() hands back an independent snapshot. Confirm.
        ao = rdFingerprintGenerator.AdditionalOutput()
        ao.AllocateBitInfoMap()
        for m in mols:
            if "count" in option:
                fp_list.append(fpgen.GetCountFingerprintAsNumPy(m, additionalOutput=ao))
            else:
                fp_list.append(fpgen.GetFingerprintAsNumPy(m, additionalOutput=ao))
            # Read the map right after the call that filled it.
            fps_bitinfo.append(ao.GetBitInfoMap())
        fp_dict[option+"_BitInfo"] = fps_bitinfo
    else:
        fp_list = [fpgen.GetFingerprintAsNumPy(mol) for mol in mols]
    # Stack the per-molecule vectors into one (n_molecules x n_bits) array.
    fp_dict[option] = np.vstack(fp_list)

In [16]:
# (optional here, can be done later)
# Coerce every descriptor column to numeric (non-numeric entries become NaN),
# then drop each column that contains at least one NaN.
feat_df = feat_df.apply(pd.to_numeric, errors='coerce').dropna(axis='columns')

Unnamed: 0,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,8.582741,8.124951,0,0,9,10,18,11,0,0,...,6.629363,9.362632,55.477134,149.070145,8.281675,136,15,58.0,69.0,2.416667
1,20.449495,17.975646,0,0,10,10,49,26,0,0,...,7.781139,10.176830,78.066147,393.129317,8.023047,1796,38,140.0,165.0,5.472222
2,21.310663,15.946662,0,1,21,22,49,27,0,0,...,6.803505,10.110136,75.964825,358.179361,7.309783,2136,39,142.0,165.0,5.916667
3,20.510745,16.154131,0,0,21,22,38,26,0,0,...,6.861711,10.340742,75.452185,360.056863,9.475181,1708,46,142.0,173.0,5.722222
4,26.110580,17.966093,0,2,21,22,65,33,0,0,...,6.861711,10.366750,83.177122,440.268845,6.773367,3773,51,176.0,207.0,7.277778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,13.682318,11.284539,0,1,6,6,43,20,0,0,...,0.000000,9.527994,52.474918,319.121818,7.421438,3700000677,24,88.0,99.0,4.055556
266,28.219970,19.979760,0,0,25,27,56,35,0,0,...,7.065613,10.582308,86.196768,467.170588,8.342332,3815,59,194.0,233.0,7.527778
267,22.414889,17.320789,0,0,18,18,52,29,0,0,...,0.000000,10.152806,64.251724,389.173942,7.484114,2526,44,148.0,170.0,6.361111
268,29.646114,19.600446,0,1,22,23,69,38,0,0,...,0.000000,10.441880,74.654093,513.237604,7.438226,5793,58,196.0,227.0,8.527778


In [7]:
# save to file
# Writes [lig_df, feat_df, fp_dict] as one pickled list, in that order.
import os
import pickle

dir_PKL = '/mnt/w/Projects/Kinome/data/features_pkl' # path to pickle files

# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(dir_PKL, exist_ok=True)

# The with-block closes the file on exit; no explicit f.close() is needed.
with open(f"{dir_PKL}/features_kinetics.pkl", "wb") as f:
    pickle.dump([lig_df, feat_df, fp_dict], f)