+ In the referenced paper, 12 types of fingerprints were calculated using the PaDEL software. Here we generate only two of them: the Substructure fingerprint and the PubChem fingerprint.

In [12]:
# Load the input table; the file is tab-delimited even though it is named .csv.
dataset_path = "dataset_std.csv"
df = pd.read_csv(dataset_path, sep="\t")

In [14]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Molecule ChEMBL ID,smiles,label,pIC50
0,CHEMBL94,CNC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)[C@@H]1N2C,0,-7.85
1,CHEMBL11805,COc1ccccc1CN(C)CCCCCC(=O)N(C)CCCCCCCCN(C)C(=O)...,0,-6.77
2,CHEMBL95,Nc1c2c(nc3ccccc13)CCCC2,0,-6.66
3,CHEMBL132377,COc1ccccc1CNCCCCCC(=O)N(C)CCCCCCCCN(C)C(=O)CCC...,0,-6.51
4,CHEMBL134488,COc1ccccc1CNCCCCCC(=O)NCCCCCCCCNC(=O)CCCCCNCc1...,0,-5.73


In [15]:
# Export only the SMILES string and the class label, tab-separated and without
# header or index, as the molecule input file handed to PaDEL further down.
smiles_and_label = df[["smiles", "label"]]
smiles_and_label.to_csv(
    "smiles_std.smi",
    sep="\t",
    index=None,
    header=None,
)

- The **PubChem System** generates a binary substructure fingerprint for chemical structures. These fingerprints are used by PubChem for similarity neighboring and similarity searching. PubChem fingerprints are currently 881 bits in length. The full list of bits is available here: https://web.cse.ohio-state.edu/~zhang.10631/bak/drugreposition/list_fingerprints.pdf

In [8]:
import glob

# Collect the XML files in the working directory, in sorted order, so that
# their positions are stable from run to run.
xml_files = sorted(glob.glob("*.xml"))
xml_files

['PubchemFingerprinter.xml', 'SubstructureFingerprinter.xml']

In [9]:
# Fingerprint labels, listed in the same order as the sorted XML file names so
# the two lists can be paired position by position.
# NOTE(review): the second label says "Count" while the file it is paired with
# is SubstructureFingerprinter.xml — confirm which fingerprint is intended.
FP_list = [
    "PubchemFingerprinter",
    "SubstructureFingerprintCount",
]

In [6]:
# Pair each fingerprint label with the XML file found at the same position.
Padel_Fp = {fp_label: xml_name for fp_label, xml_name in zip(FP_list, xml_files)}

In [7]:
# Display the fingerprint-label -> XML-file mapping built above.
Padel_Fp

{'PubchemFingerprinter': 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount': 'SubstructureFingerprinter.xml'}

In [16]:
from padelpy import padeldescriptor

# Run PaDEL on smiles_std.smi using the descriptor selection in
# PubchemFingerprinter.xml and write the result to ache_pchem.csv.
# The options are gathered first so the call itself stays short.
pubchem_settings = dict(
    mol_dir="smiles_std.smi",
    d_file="ache_pchem.csv",
    descriptortypes="PubchemFingerprinter.xml",
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)
padeldescriptor(**pubchem_settings)

In [18]:
from padelpy import padeldescriptor

# Run PaDEL on smiles_std.smi using the descriptor selection in
# SubstructureFingerprinter.xml and write the result to ache_sub.csv.
# Apart from the XML file and the output path, the options match the
# PubChem run.
substructure_settings = dict(
    mol_dir="smiles_std.smi",
    d_file="ache_sub.csv",
    descriptortypes="SubstructureFingerprinter.xml",
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)
padeldescriptor(**substructure_settings)

In [20]:
# Read the PubChem fingerprint table written by PaDEL and preview the first
# five rows.
pubchem_csv = "ache_pchem.csv"
ache_pc = pd.read_csv(pubchem_csv)
ache_pc.head(n=5)

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# The "Name" column appears to carry the class label that was written as the
# second column of smiles_std.smi, so rename it accordingly (in place).
ache_pc.rename(columns=dict(Name="Label"), inplace=True)

In [22]:
# Check that the first column is now called "Label".
ache_pc.head(n=5)

Unnamed: 0,Label,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Overwrite the PaDEL output file with the renamed header, leaving out the
# DataFrame index.
ache_pc.to_csv(
    "ache_pchem.csv",
    index=None,
    header=True,
)

In [29]:
# Read the Substructure fingerprint table written by PaDEL.
substructure_csv = "ache_sub.csv"
ache_sub = pd.read_csv(substructure_csv)


In [30]:
# The "Name" column appears to carry the class label that was written as the
# second column of smiles_std.smi, so rename it accordingly (in place).
ache_sub.rename(columns=dict(Name="Label"), inplace=True)

In [31]:
# Check that the first column is now called "Label".
ache_sub.head(n=5)

Unnamed: 0,Label,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,0,1,1,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1


In [32]:
# Overwrite the PaDEL output file with the renamed header, leaving out the
# DataFrame index.
ache_sub.to_csv(
    "ache_sub.csv",
    index=None,
    header=True,
)