In [4]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

#Reading in Molecular Properties CSV
#The file name has no extension; it is resolved relative to the notebook's working directory.
data = pd.read_csv('Downsampled_Tox21_Full')
#Preview the first rows to confirm the load
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,Activity Summary,PUBCHEM_CID,...,InChI,XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,RotatableBondCount,HeavyAtomCount,Complexity,target
0,-1.094361,-1.010855,-0.418947,0.716336,-0.229695,0.171622,0.396022,-0.003879,active antagonist,1807.0,...,InChI=1S/C4H6BrNO4/c5-4(6(7)8)1-9-3-10-2-4/h1-3H2,0.3,210.94802,64.3,0.0,4.0,0.0,10.0,139.0,1
1,-0.967952,0.059745,-0.732712,-0.256905,0.329542,-0.102167,0.539707,-0.125613,active antagonist,2453.0,...,InChI=1S/C9H5Br2NO/c10-6-4-7(11)9(13)8-5(6)2-1...,3.2,302.87174,33.1,1.0,2.0,0.0,13.0,191.0,1
2,1.307136,0.361127,-0.924566,0.668585,0.188939,0.031031,0.035399,0.041305,active antagonist,2662.0,...,InChI=1S/C17H14F3N3O2S/c1-11-2-4-12(5-3-11)15-...,3.4,381.075882,86.4,1.0,7.0,3.0,26.0,577.0,1
3,-1.197937,0.02864,-0.64881,-0.209172,0.471945,0.045566,0.092518,-0.00722,active antagonist,2722.0,...,InChI=1S/C9H5Cl2NO/c10-6-4-7(11)9(13)8-5(6)2-1...,3.5,212.974819,33.1,1.0,2.0,0.0,13.0,191.0,1
4,-0.311095,1.570631,-0.634133,-0.362545,-0.218601,-0.260847,0.100778,0.226042,active antagonist,2812.0,...,InChI=1S/C22H17ClN2/c23-21-14-8-7-13-20(21)22(...,5.0,344.108026,17.8,0.0,1.0,4.0,25.0,396.0,1


# MACCS Keys

In [5]:
#Pulling the IsomericSMILES column out of data (a pandas Series, not a plain list)
iSMILES = data['IsomericSMILES']
#Show the entry labelled 0 as a quick sanity check
iSMILES[0]

'C1C(COCO1)([N+](=O)[O-])Br'

In [6]:
#Making list of molecules from iSMILES
#Iterating over the values (instead of looking up iSMILES[i] for i = 0, 1, ...)
#keeps this working when the index of data is not the default 0..n-1 labels.
ms = [Chem.MolFromSmiles(smiles) for smiles in iSMILES]

#MolFromSmiles returns None for a SMILES it cannot parse. Every fingerprint
#cell below would then fail with a much less helpful error, so stop here and
#report which row positions are affected.
unparsed_rows = [pos for pos, mol in enumerate(ms) if mol is None]
if unparsed_rows:
    raise ValueError('RDKit could not parse the SMILES at row position(s): ' + str(unparsed_rows))

In [7]:
#MACCS keys for every molecule in ms, stored as strings of '0'/'1' characters.
#The first character of each bit string is sliced off before storing it.
#NOTE(review): presumably this drops RDKit's unused bit 0 so that character k-1 is MACCS key k - confirm against the RDKit docs.
maccs_fps = [
    MACCSkeys.GenMACCSKeys(mol).ToBitString()[1:]
    for mol in ms
]

# Topological Fingerprint (Daylight Analogue)

In [8]:
from rdkit.Chem import rdmolops

#RDKit topological fingerprints for every molecule in ms, as '0'/'1' strings
#(fpSize=2048, minPath=1, maxPath=7)
top_fps = [
    rdmolops.RDKFingerprint(mol, fpSize=2048, minPath=1, maxPath=7).ToBitString()
    for mol in ms
]

# Morgan Fingerprint (ECFP)

In [9]:
from rdkit.Chem import AllChem

#Morgan (ECFP-style) fingerprints for every molecule in ms, as 1024-character '0'/'1' strings.
#The second positional argument (4) is the Morgan radius.
#NOTE(review): a radius of 4 is conventionally called ECFP8 (diameter 8), not ECFP4 - confirm this is the intended setting.
ecfp_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024).ToBitString()
    for mol in ms
]

# Morgan Fingerprint (FCFP)

In [10]:
#Feature-based Morgan (FCFP-style) fingerprints: same radius (4) and size (1024 bits)
#as the ECFP cell, but with useFeatures=True.
fcfp_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024, useFeatures=True).ToBitString()
    for mol in ms
]

# PubChem FP Decoding

In [11]:
#PubChem compound IDs from data, cast to int so the decimal part is dropped
CID_list = list(map(int, data['PUBCHEM_CID']))

#Two batches: the first 600 CIDs and whatever is left
#NOTE(review): presumably the split keeps each request URL to a manageable length - confirm.
list1, list2 = CID_list[:600], CID_list[600:]

#Comma-separated CID strings (no brackets, no spaces), one per batch
str1 = ','.join(map(str, list1))
str2 = ','.join(map(str, list2))


In [12]:
#Getting the Pubchem Fingerprints for each CID

def _fetch_fingerprint_text(cid_csv):
    """Return the PUG REST Fingerprint2D TXT response for a comma-separated CID string.

    cid_csv : CIDs joined by commas, e.g. '1807,2453'.
    Returns the decoded response body as a str. An empty `cid_csv` returns ''
    without contacting the server, because the URL would name no compound.
    Raises whatever urlopen raises on network or HTTP errors.
    """
    if not cid_csv:
        return ''
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + cid_csv + '/property/Fingerprint2D/TXT')
    #The `with` block closes the HTTP response even if reading fails.
    #The TXT payload is plain text, so it is decoded directly instead of
    #being pushed through an HTML parser.
    with urlopen(url) as response:
        return response.read().decode('utf-8')

#one request per batch of CIDs
pub_fp1 = _fetch_fingerprint_text(str1)
pub_fp2 = _fetch_fingerprint_text(str2)

#pub_fp list: one base64 fingerprint per whitespace-separated token.
#Each response is split on its own so the last fingerprint of batch 1 can never
#fuse with the first fingerprint of batch 2 when a trailing newline is missing.
pub_fp = pub_fp1.split() + pub_fp2.split()

In [13]:
#Decoding Fingerprints
from base64 import b64decode

def PCFP_BitString(pcfp_base64):
    """Decode a base64 PubChem fingerprint into a string of '0'/'1' characters.

    The payload is base64-decoded, every byte is written out as eight binary
    digits (most significant bit first), and characters 32 through 912 of the
    resulting string are returned: the first 32 bits are skipped and at most
    881 bits are kept.

    pcfp_base64 : base64-encoded fingerprint (str or bytes accepted by b64decode).
    Returns the sliced bit string; it is shorter than 881 characters when the
    decoded payload holds fewer than 913 bits.
    """
    raw_bytes = b64decode(pcfp_base64)
    all_bits = "".join(format(byte, "08b") for byte in raw_bytes)
    # NOTE(review): presumably the skipped 32 bits are a length prefix and the
    # dropped tail is padding - confirm against the PubChem fingerprint spec.
    return all_bits[32:913]

#Decode every base64 fingerprint in pub_fp into its '0'/'1' bit string, keeping the order
pub_fp_decoded = [PCFP_BitString(encoded_fp) for encoded_fp in pub_fp]

# FinalDFTox21

In [14]:
#One row per compound: its CID, each fingerprint as a bit string, and the activity label.
#The row index comes from the two data columns; every fingerprint list must have one entry per row.
fps_df = pd.DataFrame({
    'Name': data['PUBCHEM_CID'],
    'MACCS': maccs_fps,
    'Topological': top_fps,
    'Morgan ECFP': ecfp_fps,
    'Morgan FCFP': fcfp_fps,
    'Pubchem FP': pub_fp_decoded,
    'Activity': data['target'],
})
fps_df.head()

Unnamed: 0,Name,MACCS,Topological,Morgan ECFP,Morgan FCFP,Pubchem FP,Activity
0,1807.0,0000000000000000000000010001000000000000000001...,0000010000000000000000000000000010001000000000...,0000000000000000000000000000000100001000000000...,1010000010000000100000000010000000000000000000...,1000000001100010001110000000000000000000000100...,1
1,2453.0,0000000000000000000000000000000000000000000001...,1000010000010000000001000100100000100000000010...,0000000000000000000100000000000000000000000000...,0001101010000000000000000000000000000001000000...,1000000001110010001000000000000000000000000110...,1
2,2662.0,0000000000000000000000000000000110000000010000...,1111010010100000101110100011001101001001100100...,0000100000000000000000000000000001000000000000...,1110101011000000000000000000000000000000000000...,1100000001111011001100011000000001000000000000...,1
3,2722.0,0000000000000000000000000000000000000000000000...,1000010010010000100010000110000000100000010010...,0000000000000000000100000000000000000000000000...,0001101010000000000000000000000000000001000000...,1000000001110010001000000000000000000110000000...,1
4,2812.0,0000000000000000000000000000000000000000000000...,0001011100000101101010001001010110110000000000...,0100000010000000000000100000000000000000000000...,1000101010001100001000000000000000000000000000...,1110000001111011000000000000000000000100000000...,1


In [15]:
#Writing to csv
#Saves fps_df to a file named 'Fingerprints Tox21' (no extension) in the working directory;
#index = False leaves the row index out of the file.
fps_df.to_csv('Fingerprints Tox21', index = False)


# Formatting - MACCS

In [270]:
#Column labels MACCS1, MACCS2, ... - one per character of the first MACCS bit string
maccs_names = ["MACCS" + str(position) for position in range(1, len(maccs_fps[0]) + 1)]

#String form of every row label of fps_df, e.g. 6 becomes "6".
#NOTE(review): these are the index labels of fps_df, not the PubChem CIDs held in fps_df['Name'] - confirm which was intended.
CID_str = [str(label) for label in fps_df.index]

CID_str

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [271]:
#Splitting each bitstring fp into individual bits: one row per molecule, one 0/1 integer column per bit.
#The table is built in a single constructor call. Growing it with DataFrame.append inside a loop
#copies the whole frame on every pass, and append() no longer exists in pandas 2.0+.
#Rows receive the default 0..n-1 index here; the relabelling cell below replaces it.
maccs_df = pd.DataFrame([[int(bit) for bit in fp] for fp in maccs_fps])


In [272]:
#Relabelling df: PubChem CIDs (fps_df['Name']) as row labels and MACCS1, MACCS2, etc as column headers
maccs_df.index = fps_df['Name']
#Assign the flat list of names. Wrapping it in another list ([maccs_names]) makes pandas
#build a one-level MultiIndex, so columns could only be addressed by tuples such as ('MACCS1',).
maccs_df.columns = maccs_names

In [273]:
#Writing to csv
#index = False leaves the row index (set to fps_df['Name'] in the preceding cell) out of the file,
#so only the bit columns and their header are written.
#NOTE(review): if the CIDs are needed downstream, confirm that dropping them here is intended.
maccs_df.to_csv('maccs_fps.csv', index = False)

In [265]:
#Display the MACCS bit table for a visual check of the row and column labels
maccs_df

Unnamed: 0_level_0,MACCS1,MACCS2,MACCS3,MACCS4,MACCS5,MACCS6,MACCS7,MACCS8,MACCS9,MACCS10,...,MACCS157,MACCS158,MACCS159,MACCS160,MACCS161,MACCS162,MACCS163,MACCS164,MACCS165,MACCS166
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1807.0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,1,1,1,0
2453.0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,1,1,0
2662.0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2722.0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,1,1,0
2812.0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,1,0
3080.0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3371.0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
3728.0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,1,1,0
4122.0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4211.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0


# Formatting - Topological Fingerprint 

In [274]:
#Column labels TOP1, TOP2, ... - one per character of the first topological bit string
top_names = ["TOP" + str(position) for position in range(1, len(top_fps[0]) + 1)]

In [275]:
#Splitting each bitstring fp into individual bits: one row per molecule, one 0/1 integer column per bit.
#Every entry of top_fps is used; the earlier loop was bounded by len(maccs_fps), a copy-paste
#slip that only worked because both lists happen to have the same length.
#The table is built in a single constructor call. Growing it with DataFrame.append inside a loop
#copies the whole frame on every pass, and append() no longer exists in pandas 2.0+.
#Rows receive the default 0..n-1 index here; the relabelling cell below replaces it.
top_df = pd.DataFrame([[int(bit) for bit in fp] for fp in top_fps])


In [276]:
#Relabelling df: PubChem CIDs (fps_df['Name']) as row labels and TOP1, TOP2, etc as column headers
top_df.index = fps_df['Name']
#Assign the flat list of names. Wrapping it in another list ([top_names]) makes pandas
#build a one-level MultiIndex, so columns could only be addressed by tuples such as ('TOP1',).
top_df.columns = top_names

In [277]:
#Writing to csv
#index = False leaves the row index (set to fps_df['Name'] in the preceding cell) out of the file,
#so only the bit columns and their header are written.
#NOTE(review): if the CIDs are needed downstream, confirm that dropping them here is intended.
top_df.to_csv('top_fps.csv', index = False)

# Formatting - Morgan (ecfp)

In [278]:
#Column labels ECFP1, ECFP2, ... - one per character of the first ECFP bit string
ecfp_names = ["ECFP" + str(position) for position in range(1, len(ecfp_fps[0]) + 1)]

In [279]:
#Splitting each bitstring fp into individual bits: one row per molecule, one 0/1 integer column per bit.
#The table is built in a single constructor call. Growing it with DataFrame.append inside a loop
#copies the whole frame on every pass, and append() no longer exists in pandas 2.0+.
#Rows receive the default 0..n-1 index here; the relabelling cell below replaces it.
ecfp_df = pd.DataFrame([[int(bit) for bit in fp] for fp in ecfp_fps])


In [280]:
#Relabelling df: PubChem CIDs (fps_df['Name']) as row labels and ECFP1, ECFP2, etc as column headers
ecfp_df.index = fps_df['Name']
#Assign the flat list of names. Wrapping it in another list ([ecfp_names]) makes pandas
#build a one-level MultiIndex, so columns could only be addressed by tuples such as ('ECFP1',).
ecfp_df.columns = ecfp_names

In [281]:
#Writing to csv
#index = False leaves the row index (set to fps_df['Name'] in the preceding cell) out of the file,
#so only the bit columns and their header are written.
#NOTE(review): if the CIDs are needed downstream, confirm that dropping them here is intended.
ecfp_df.to_csv('ecfp_fps.csv', index = False)

# Formatting - Morgan (fcfp)

In [282]:
#Column labels FCFP1, FCFP2, ... - one per character of the first FCFP bit string
fcfp_names = ["FCFP" + str(position) for position in range(1, len(fcfp_fps[0]) + 1)]

In [283]:
#Splitting each bitstring fp into individual bits: one row per molecule, one 0/1 integer column per bit.
#The table is built in a single constructor call. Growing it with DataFrame.append inside a loop
#copies the whole frame on every pass, and append() no longer exists in pandas 2.0+.
#Rows receive the default 0..n-1 index here; the relabelling cell below replaces it.
fcfp_df = pd.DataFrame([[int(bit) for bit in fp] for fp in fcfp_fps])


In [284]:
#Relabelling df: PubChem CIDs (fps_df['Name']) as row labels and FCFP1, FCFP2, etc as column headers
fcfp_df.index = fps_df['Name']
#Assign the flat list of names. Wrapping it in another list ([fcfp_names]) makes pandas
#build a one-level MultiIndex, so columns could only be addressed by tuples such as ('FCFP1',).
fcfp_df.columns = fcfp_names

In [285]:
#Writing to csv
#index = False leaves the row index (set to fps_df['Name'] in the preceding cell) out of the file,
#so only the bit columns and their header are written.
#NOTE(review): if the CIDs are needed downstream, confirm that dropping them here is intended.
fcfp_df.to_csv('fcfp_fps.csv', index = False)

# Formatting - PubchemFP

In [286]:
#Column labels PubFP1, PubFP2, ... - one per character of the first decoded PubChem bit string
pub_fp_names = ["PubFP" + str(position) for position in range(1, len(pub_fp_decoded[0]) + 1)]

In [287]:
#Splitting each bitstring fp into individual bits: one row per molecule, one 0/1 integer column per bit.
#The table is built in a single constructor call. Growing it with DataFrame.append inside a loop
#copies the whole frame on every pass, and append() no longer exists in pandas 2.0+.
#Rows receive the default 0..n-1 index here; the relabelling cell below replaces it.
pub_fp_df = pd.DataFrame([[int(bit) for bit in fp] for fp in pub_fp_decoded])


In [288]:
#Relabelling df: PubChem CIDs (fps_df['Name']) as row labels and PubFP1, PubFP2, etc as column headers
pub_fp_df.index = fps_df['Name']
#Assign the flat list of names. Wrapping it in another list ([pub_fp_names]) makes pandas
#build a one-level MultiIndex, so columns could only be addressed by tuples such as ('PubFP1',).
pub_fp_df.columns = pub_fp_names

In [289]:
#Writing to csv
#index = False leaves the row index (set to fps_df['Name'] in the preceding cell) out of the file,
#so only the bit columns and their header are written.
#NOTE(review): if the CIDs are needed downstream, confirm that dropping them here is intended.
pub_fp_df.to_csv('pub_fps.csv', index = False)