In [25]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

# Load the downsampled Tox21 table (molecular properties plus the `target`
# label) from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='Downsampled_Tox21_Full')
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,Activity Summary,PUBCHEM_CID,...,InChI,XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,RotatableBondCount,HeavyAtomCount,Complexity,target
0,-1.094361,-1.010855,-0.418947,0.716336,-0.229695,0.171622,0.396022,-0.003879,active antagonist,1807.0,...,InChI=1S/C4H6BrNO4/c5-4(6(7)8)1-9-3-10-2-4/h1-3H2,0.3,210.94802,64.3,0.0,4.0,0.0,10.0,139.0,1
1,-0.967952,0.059745,-0.732712,-0.256905,0.329542,-0.102167,0.539707,-0.125613,active antagonist,2453.0,...,InChI=1S/C9H5Br2NO/c10-6-4-7(11)9(13)8-5(6)2-1...,3.2,302.87174,33.1,1.0,2.0,0.0,13.0,191.0,1
2,1.307136,0.361127,-0.924566,0.668585,0.188939,0.031031,0.035399,0.041305,active antagonist,2662.0,...,InChI=1S/C17H14F3N3O2S/c1-11-2-4-12(5-3-11)15-...,3.4,381.075882,86.4,1.0,7.0,3.0,26.0,577.0,1
3,-1.197937,0.02864,-0.64881,-0.209172,0.471945,0.045566,0.092518,-0.00722,active antagonist,2722.0,...,InChI=1S/C9H5Cl2NO/c10-6-4-7(11)9(13)8-5(6)2-1...,3.5,212.974819,33.1,1.0,2.0,0.0,13.0,191.0,1
4,-0.311095,1.570631,-0.634133,-0.362545,-0.218601,-0.260847,0.100778,0.226042,active antagonist,2812.0,...,InChI=1S/C22H17ClN2/c23-21-14-8-7-13-20(21)22(...,5.0,344.108026,17.8,0.0,1.0,4.0,25.0,396.0,1


# MACCS Keys

In [26]:
# Pull out the isomeric SMILES column (a pandas Series, not a list) and
# display its first entry as a quick sanity check.
iSMILES = data.loc[:, 'IsomericSMILES']
iSMILES.loc[0]

'C1C(COCO1)([N+](=O)[O-])Br'

In [27]:
# Parse every isomeric SMILES string into an RDKit molecule, keeping the same
# order as the rows of `data` so the fingerprints line up with the labels.
# Iterating over the values (instead of looking up iSMILES[i] with a manual
# counter) also works when the Series does not carry a default 0..n-1 index.
ms = [Chem.MolFromSmiles(smiles) for smiles in iSMILES]

# Chem.MolFromSmiles returns None rather than raising when a SMILES string
# cannot be parsed. Stop here and report the offending rows, instead of
# failing with an opaque error inside one of the fingerprint cells below.
unparsed = [position for position, mol in enumerate(ms) if mol is None]
if unparsed:
    raise ValueError('Could not parse SMILES at row position(s): ' + str(unparsed))

In [28]:
# MACCS keys for every molecule in `ms`, stored as '0'/'1' bit strings.
# The [1:] slice drops the first character of each bit string
# (presumably RDKit's unused bit 0 of the MACCS vector -- TODO confirm).
maccs_fps = list(map(
    lambda mol: MACCSkeys.GenMACCSKeys(mol).ToBitString()[1:],
    ms,
))

# Topological Fingerprint (Daylight Analogue)

In [29]:
from rdkit.Chem import rdmolops

# RDKit topological (path-based) fingerprints for every molecule in `ms`:
# 2048 bits, paths of length 1 to 7, converted to '0'/'1' bit strings.
top_fps = list(map(
    lambda mol: rdmolops.RDKFingerprint(mol, fpSize=2048, minPath=1, maxPath=7).ToBitString(),
    ms,
))

# Morgan Fingerprint (ECFP)

In [30]:
from rdkit.Chem import AllChem

# Morgan (ECFP-style) fingerprints for every molecule in `ms`:
# radius 4, folded to 1024 bits, converted to '0'/'1' bit strings.
ecfp_fps = list(map(
    lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024).ToBitString(),
    ms,
))

# Morgan Fingerprint (FCFP)

In [31]:
# Feature-based Morgan (FCFP-style) fingerprints for every molecule in `ms`:
# radius 4, folded to 1024 bits, useFeatures=True, as '0'/'1' bit strings.
fcfp_fps = list(map(
    lambda mol: AllChem.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=1024, useFeatures=True
    ).ToBitString(),
    ms,
))

# PubChem FP Decoding

In [50]:
# The PubChem CIDs are read from the table as floats (e.g. 1807.0); convert
# them to plain ints so they render without a decimal part.
CID_list = [int(cid) for cid in data['PUBCHEM_CID']]

# Two batches: the first 600 CIDs and whatever remains.
list1, list2 = CID_list[:600], CID_list[600:]

# One comma-separated CID string (no spaces, no brackets) per batch.
str1 = ','.join(str(cid) for cid in list1)
str2 = ','.join(str(cid) for cid in list2)


In [53]:
#Getting the Pubchem Fingerprints for each CID

PUG_REST_CID_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'


def _fetch_fingerprint_text(cid_csv):
    """Return the raw Fingerprint2D TXT response for a batch of CIDs.

    Parameters
    ----------
    cid_csv : str
        Comma-separated PubChem CIDs without spaces, e.g. '1807,2453'.

    Returns
    -------
    str
        The decoded response body, or '' when `cid_csv` is empty (an empty
        batch would otherwise produce an invalid request URL).

    Raises
    ------
    urllib.error.URLError
        Propagated from `urlopen` when the request fails.
    """
    if not cid_csv:
        return ''
    url = PUG_REST_CID_URL + cid_csv + '/property/Fingerprint2D/TXT'
    # The TXT endpoint is requested, so the body is decoded directly rather
    # than run through an HTML parser. The context manager closes the HTTP
    # connection once the body has been read.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


#raw response text of each batch
pub_fp1 = _fetch_fingerprint_text(str1)
pub_fp2 = _fetch_fingerprint_text(str2)

#pub_fp strings to one pub_fp list
# Each batch is split on whitespace separately before the lists are joined,
# so the last fingerprint of the first batch can never fuse with the first
# fingerprint of the second batch if a response lacks a trailing newline.
pub_fp = pub_fp1.split() + pub_fp2.split()

In [65]:
#Decoding Fingerprints
from base64 import b64decode

def PCFP_BitString(pcfp_base64):
    """Decode a base64-encoded fingerprint into a string of '0'/'1' characters.

    Every decoded byte is written as eight binary digits (most significant
    bit first) and the digits are concatenated. Only positions 32 up to, but
    not including, 913 of that string are returned, i.e. at most 881 bits.

    NOTE(review): the skipped first 32 bits are presumably the 4-byte length
    prefix of the PubChem fingerprint format and the cut-off at 913 its
    trailing padding -- confirm against the PubChem fingerprint specification.

    Parameters
    ----------
    pcfp_base64 : str or bytes
        Base64 text of one fingerprint.

    Returns
    -------
    str
        The selected bits as a '0'/'1' string.
    """
    raw_bytes = b64decode(pcfp_base64)
    all_bits = "".join(format(byte, "08b") for byte in raw_bytes)
    return all_bits[32:913]

# Decode every base64 fingerprint in `pub_fp` into its bit string,
# keeping the original order.
pub_fp_decoded = [PCFP_BitString(encoded) for encoded in pub_fp]

# Final DataFrame

In [72]:
# Assemble the final table in one step: one row per compound, holding its
# PubChem CID, the five fingerprint bit strings and the `target` label.
# The list-valued columns must have exactly as many entries as `data` has rows.
fps_df = pd.DataFrame({
    'Name': data['PUBCHEM_CID'],
    'MACCS': maccs_fps,
    'Topological': top_fps,
    'Morgan ECFP': ecfp_fps,
    'Morgan FCFP': fcfp_fps,
    'Pubchem FP': pub_fp_decoded,
    'Activity': data['target'],
})
fps_df.head()

Unnamed: 0,Name,MACCS,Topological,Morgan ECFP,Morgan FCFP,Pubchem FP,Activity
0,1807.0,0000000000000000000000010001000000000000000001...,0000010000000000000000000000000010001000000000...,0000000000000000000000000000000100001000000000...,1010000010000000100000000010000000000000000000...,1000000001100010001110000000000000000000000100...,1
1,2453.0,0000000000000000000000000000000000000000000001...,1000010000010000000001000100100000100000000010...,0000000000000000000100000000000000000000000000...,0001101010000000000000000000000000000001000000...,1000000001110010001000000000000000000000000110...,1
2,2662.0,0000000000000000000000000000000110000000010000...,1111010010100000101110100011001101001001100100...,0000100000000000000000000000000001000000000000...,1110101011000000000000000000000000000000000000...,1100000001111011001100011000000001000000000000...,1
3,2722.0,0000000000000000000000000000000000000000000000...,1000010010010000100010000110000000100000010010...,0000000000000000000100000000000000000000000000...,0001101010000000000000000000000000000001000000...,1000000001110010001000000000000000000110000000...,1
4,2812.0,0000000000000000000000000000000000000000000000...,0001011100000101101010001001010110110000000000...,0100000010000000000000100000000000000000000000...,1000101010001100001000000000000000000000000000...,1110000001111011000000000000000000000100000000...,1


In [73]:
# Save the fingerprint table as CSV in the working directory, without the
# DataFrame index as an extra column.
fps_df.to_csv(path_or_buf='Fingerprints Tox21', index=False)
