In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#hiding warning messages
import warnings
warnings.filterwarnings("ignore")

# Load the downsampled Tox21 training table. The file name has no extension
# but is parsed as CSV. The preview below shows PCA components (PC1..PC8),
# PubChem descriptors, SMILES/InChI strings and a `target` label per compound.
data = pd.read_csv('Downsampled_Tox21_Full')
#data = data.astype(float, errors = 'ignore')
# Show the first rows as a sanity check on the loaded columns.
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PUBCHEM_CID,Activity Outcome,...,RotatableBondCount,CanonicalSMILES,IsomericSMILES,InChI,XLogP,ExactMass,TPSA,HeavyAtomCount,Complexity,target
0,-1.403337,-0.503341,-0.387764,-0.082259,-0.278501,-0.000986,-0.08914,0.055148,1923.0,active agonist,...,0.0,C1=CC2=C(C(=C1)O)N=CC=C2,C1=CC2=C(C(=C1)O)N=CC=C2,InChI=1S/C9H7NO/c11-8-5-1-3-7-4-2-6-10-9(7)8/h...,2.0,145.052764,33.1,11.0,138.0,1
1,-0.114826,0.268539,-1.038696,0.154652,0.502283,-0.23756,0.066917,0.154261,2118.0,active antagonist,...,1.0,CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4,CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4,InChI=1S/C17H13ClN4/c1-11-20-21-16-10-19-17(12...,2.1,308.082874,43.1,22.0,434.0,1
2,2.891772,1.015351,-0.548019,1.033463,0.017861,0.398095,-0.157217,-0.126047,2176.0,active agonist,...,8.0,CCOC(=O)OC(C)OC1=C(N(S(=O)(=O)C2=CC=CC=C21)C)C...,CCOC(=O)OC(C)OC1=C(N(S(=O)(=O)C2=CC=CC=C21)C)C...,InChI=1S/C20H21N3O7S/c1-4-28-20(25)30-13(2)29-...,4.0,447.110021,133.0,31.0,808.0,1
3,-0.357005,-0.033105,-0.441273,0.566864,-0.057438,0.691294,0.277729,0.042834,2194.0,active antagonist,...,2.0,COC1=CC=C(C=C1)C2=CC(=S)SS2,COC1=CC=C(C=C1)C2=CC(=S)SS2,InChI=1S/C10H8OS3/c1-11-8-4-2-7(3-5-8)9-6-10(1...,2.8,239.973728,91.9,14.0,254.0,1
4,2.381299,0.265859,-0.401451,0.916811,0.416766,0.295284,-0.221053,-0.110032,2225.0,active antagonist,...,7.0,CC1=C(C(C(=C(N1)C)C(=O)OCC(=O)C)C2=CC=CC=C2[N+...,CC1=C(C(C(=C(N1)C)C(=O)OCC(=O)C)C2=CC=CC=C2[N+...,InChI=1S/C19H20N2O7/c1-10(22)9-28-19(24)16-12(...,2.0,388.127051,128.0,28.0,748.0,1


# MACCS Keys

In [2]:
# Select the isomeric SMILES column. Note this is a pandas Series (indexed
# like `data`), not a plain Python list.
iSMILES = data['IsomericSMILES']
# Display the first entry as a quick check.
iSMILES[0]

'C1=CC2=C(C(=C1)O)N=CC=C2'

In [3]:
#Making list of molecules from iSMILES
# Iterate over the SMILES values themselves rather than looking them up by
# label (iSMILES[i]), so the result does not depend on the Series carrying a
# default 0..n-1 index.
ms = [Chem.MolFromSmiles(smiles) for smiles in iSMILES]

# MolFromSmiles returns None for a string it cannot parse. Stop here with the
# offending row positions instead of failing later inside a fingerprint call.
unparsed_positions = [pos for pos, mol in enumerate(ms) if mol is None]
if unparsed_positions:
    raise ValueError(
        'Could not parse IsomericSMILES at row position(s): ' + str(unparsed_positions)
    )

In [4]:
# Compute the MACCS key fingerprint of every molecule and keep it as a
# '0'/'1' character string.
# The [1:] slice drops the first character of each bit string (presumably
# RDKit's unused bit 0, leaving the 166 MACCS keys — TODO confirm).
maccs_fps = [MACCSkeys.GenMACCSKeys(x).ToBitString()[1:] for x in ms ]

# Topological Fingerprint (Daylight Analogue)

In [5]:
from rdkit.Chem import rdmolops

# RDKit topological (path-based) fingerprint of every molecule, requested
# with fpSize=2048 and paths of 1 to 7 bonds, stored as a '0'/'1' string.
top_fps = [rdmolops.RDKFingerprint(x, fpSize=2048, minPath=1, maxPath=7).ToBitString() for x in ms]

# Morgan Fingerprint (ECFP)

In [6]:
from rdkit.Chem import AllChem

# Morgan fingerprint of every molecule as a 1024-character '0'/'1' string.
# NOTE(review): the second positional argument (4) is the Morgan radius;
# radius 4 would correspond to an ECFP8-like diameter rather than the more
# common ECFP4 (radius 2) — confirm this is intended.
ecfp_fps = [AllChem.GetMorganFingerprintAsBitVect(x,4,nBits=1024).ToBitString() for x in ms]

# Morgan Fingerprint (FCFP)

In [7]:
# Same Morgan call as the ECFP cell (radius argument 4, nBits=1024) but with
# useFeatures=True, i.e. the feature-based (FCFP-style) variant.
fcfp_fps = [AllChem.GetMorganFingerprintAsBitVect(x,4,nBits=1024,useFeatures=True).ToBitString() for x in ms]

# PubChem FP Decoding

In [8]:
# PubChem compound IDs as plain integers (the column is read as floats).
CID_list = list(map(int, data['PUBCHEM_CID']))

# Two batches: the first 600 CIDs and whatever remains.
list1, list2 = CID_list[:600], CID_list[600:]

# Comma-separated CID strings without spaces or brackets, ready to be placed
# in a PUG REST URL.
str1 = ','.join(map(str, list1))
str2 = ','.join(map(str, list2))


In [9]:
#Getting the Pubchem Fingerprints for each CID
# One PUG REST request per CID batch. The TXT output is one base64-encoded
# fingerprint per line, so the body is decoded as text and split on
# whitespace; no HTML parser is needed.
pub_fp = []
for cid_batch in (str1, str2):
    if not cid_batch:
        # An empty batch (e.g. 600 or fewer compounds in total) would build
        # an invalid URL, so it is skipped.
        continue
    batch_url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'
                 + cid_batch + '/property/Fingerprint2D/TXT')
    # The context manager closes the HTTP response once it has been read.
    with urlopen(batch_url) as response:
        batch_text = response.read().decode('utf-8')
    # Split each response on its own before merging. Concatenating the raw
    # texts first would fuse the last fingerprint of one batch with the first
    # of the next if a response did not end in a newline.
    pub_fp.extend(batch_text.split())

In [10]:
#Decoding Fingerprints
from base64 import b64decode

def PCFP_BitString(pcfp_base64):
    """Decode a base64-encoded fingerprint into a string of '0'/'1' characters.

    Every decoded byte is expanded to its 8-bit binary form, the pieces are
    concatenated, and characters 32 through 912 of the result are returned.
    That window is 881 characters long when the input is long enough; the
    skipped first 32 bits are presumably the 4-byte length header of the
    PubChem encoding — TODO confirm.

    Parameters
    ----------
    pcfp_base64 : str or bytes
        Base64 text of one fingerprint.

    Returns
    -------
    str
        The selected slice of the bit string (shorter, possibly empty, for
        short inputs).
    """
    decoded_bytes = b64decode(pcfp_base64)
    every_bit = "".join(format(byte, "08b") for byte in decoded_bytes)
    return every_bit[32:913]

# Decode every base64 fingerprint into its bit string, keeping the order of
# `pub_fp` (and therefore of the CIDs that were requested).
pub_fp_decoded = [PCFP_BitString(encoded_fp) for encoded_fp in pub_fp]

# Final DataFrame (Tox21)

In [11]:
# One row per compound: its CID, the five fingerprint bit strings and the
# activity label. The two Series supply the index; the fingerprint lists
# must have the same length as `data`.
fps_df = pd.DataFrame({
    'Name': data['PUBCHEM_CID'],
    'MACCS': maccs_fps,
    'Topological': top_fps,
    'Morgan ECFP': ecfp_fps,
    'Morgan FCFP': fcfp_fps,
    'Pubchem FP': pub_fp_decoded,
    'Activity': data['target'],
})
fps_df.head()

Unnamed: 0,Name,MACCS,Topological,Morgan ECFP,Morgan FCFP,Pubchem FP,Activity
0,1923.0,0000000000000000000000000000000000000000000000...,1000010000010000000000000100000000100000000010...,0000000000000001000100000000000000000000000000...,0001101000000000000000000000000000000001000000...,1000000001110010001000000000000000000000000000...,1
1,2118.0,0000000000000000001000000000000000000100000000...,1010010000110000000110001001110000010110010101...,0000000000000001000000000000010011000000000000...,1010101010000000000000000000000000000000000000...,1100000001111011100000000000000000000100000000...,1
2,2176.0,0000000000000010000000000000000110010100000010...,1111011110111011111011011011111110011110100110...,0100000000000000000000000000001001000000000000...,1110101000100000000000000000001000010000000100...,1110000001111011001110000000000001000000000000...,1
3,2194.0,0000000000000000000000000000000000010000000000...,0110000000000000001000000101100000000000000010...,0000000000000001000000000000001001000000000000...,1010101000000000000000000000000000000000000000...,1100000001110000001000000000000001100000000000...,1
4,2225.0,0000000000000000000000010000000000000000000010...,0011100111110001010100111000110100010110010000...,1001000000000000000000000000000001001000000000...,1010100010000000100100010000000000000000000000...,1110000001111011001110000000000000000000000000...,1


In [12]:
# Save the combined fingerprint table as CSV without the index column.
# The output file name has no extension.
fps_df.to_csv('Fingerprints Tox21', index = False)


# Formatting - MACCS

In [13]:
# Per-bit MACCS table: one row per compound, one integer column per bit.

# Column labels: 'Name' followed by MACCS1..MACCSn, n being the bit-string length.
maccs_names = ['Name'] + ['MACCS' + str(pos) for pos in range(1, len(maccs_fps[0]) + 1)]

# CIDs as stored in fps_df (still floats here; cast to int below).
CID_list = fps_df['Name'].tolist()

# Quoted string form of each CID, e.g. "'1923.0'". No cell visible in this
# notebook reads it; it is kept so the variable stays available.
CID_str = ["'" + str(cid) + "'" for cid in CID_list]

# Every fingerprint needs a matching CID; fail loudly on stale notebook state.
assert len(CID_list) == len(maccs_fps), 'CID_list and maccs_fps are out of sync'

# Assemble all rows first and build the frame once. Growing a DataFrame with
# DataFrame.append inside a loop is quadratic, and that method no longer
# exists in pandas 2.x.
maccs_rows = [[cid] + [int(bit) for bit in fp] for cid, fp in zip(CID_list, maccs_fps)]

# The labels are passed as a flat list: assigning `[maccs_names]` (a list
# inside a list) produced a one-level MultiIndex instead of plain headers.
maccs_df = pd.DataFrame(maccs_rows, columns=maccs_names)

#removing decimals
maccs_df = maccs_df.astype(int)

In [14]:
# Save the per-bit MACCS table (Name + MACCS columns) as CSV, without the index.
maccs_df.to_csv('maccs_fps.csv', index = False)

In [15]:
# Training input for MACCS: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', MACCS1..MACCSn.
maccs_names = ['Name', 'Activity'] + ['MACCS' + str(pos) for pos in range(1, len(maccs_fps[0]) + 1)]

# Identifier and label lists; later formatting cells reuse both.
CID_list = fps_df['Name'].tolist()
act_list = fps_df['Activity'].tolist()

# Rows are matched by position, so all three lists must be the same length.
assert len(CID_list) == len(act_list) == len(maccs_fps), 'CID/activity/MACCS lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
maccs_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, maccs_fps)
]

# Flat column labels (assigning `[maccs_names]` created a one-level MultiIndex).
input_train_maccs = pd.DataFrame(maccs_rows, columns=maccs_names)

#removing decimals
input_train_maccs = input_train_maccs.astype(int)

In [16]:
# Cast every column (Name and Activity included) from int to float, as the
# author expects scikit-learn to want float input.
input_train_maccs = input_train_maccs.astype(float)

In [17]:
# Write the MACCS training table without the index. No `sep` is passed, so
# to_csv uses its default comma separator (the file is not tab-delimited).
input_train_maccs.to_csv('input_train_maccs.csv', index = False)

# Formatting - Topological Fingerprint 

In [18]:
# Per-bit topological table: one row per compound, one integer column per bit.

# Column labels: 'Name' followed by TOP1..TOPn, n being the bit-string length.
top_names = ['Name'] + ['TOP' + str(pos) for pos in range(1, len(top_fps[0]) + 1)]

# CID_list comes from the MACCS formatting cells; rows are matched by position.
assert len(CID_list) == len(top_fps), 'CID_list and top_fps are out of sync'

# Build all rows first and create the frame once; DataFrame.append in a loop
# is quadratic and was removed in pandas 2.x.
top_rows = [[cid] + [int(bit) for bit in fp] for cid, fp in zip(CID_list, top_fps)]

# Flat column labels (assigning `[top_names]` created a one-level MultiIndex).
top_df = pd.DataFrame(top_rows, columns=top_names)

#removing decimals
top_df = top_df.astype(int)


In [19]:
# Save the per-bit topological table (Name + TOP columns) as CSV, without the index.
top_df.to_csv('top_fps.csv', index = False)

In [20]:
# Training input for the topological fingerprint: CID, activity label, bits.

# Column labels: 'Name', 'Activity', TOP1..TOPn.
top_names = ['Name', 'Activity'] + ['TOP' + str(pos) for pos in range(1, len(top_fps[0]) + 1)]

# CID_list / act_list come from the MACCS training cell; rows are matched by position.
assert len(CID_list) == len(act_list) == len(top_fps), 'CID/activity/TOP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
top_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, top_fps)
]

# Flat column labels (assigning `[top_names]` created a one-level MultiIndex).
input_train_top = pd.DataFrame(top_rows, columns=top_names)

#removing decimals
input_train_top = input_train_top.astype(int)

In [21]:
# Cast every column (Name and Activity included) from int to float, as the
# author expects scikit-learn to want float input.
input_train_top = input_train_top.astype(float)

In [22]:
# Write the topological training table without the index. No `sep` is passed,
# so to_csv uses its default comma separator (the file is not tab-delimited).
input_train_top.to_csv('input_train_top.csv', index = False)

# Formatting - Morgan (ecfp)

In [23]:
# Per-bit Morgan ECFP table: one row per compound, one integer column per bit.

# Column labels: 'Name' followed by ECFP1..ECFPn, n being the bit-string length.
ecfp_names = ['Name'] + ['ECFP' + str(pos) for pos in range(1, len(ecfp_fps[0]) + 1)]

# CID_list comes from the MACCS formatting cells; rows are matched by position.
assert len(CID_list) == len(ecfp_fps), 'CID_list and ecfp_fps are out of sync'

# Build all rows first and create the frame once; DataFrame.append in a loop
# is quadratic and was removed in pandas 2.x.
ecfp_rows = [[cid] + [int(bit) for bit in fp] for cid, fp in zip(CID_list, ecfp_fps)]

# Flat column labels (assigning `[ecfp_names]` created a one-level MultiIndex).
ecfp_df = pd.DataFrame(ecfp_rows, columns=ecfp_names)

#removing decimals
ecfp_df = ecfp_df.astype(int)


In [24]:
# Save the per-bit ECFP table (Name + ECFP columns) as CSV, without the index.
ecfp_df.to_csv('ecfp_fps.csv', index = False)

In [25]:
# Training input for Morgan ECFP: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', ECFP1..ECFPn.
ecfp_names = ['Name', 'Activity'] + ['ECFP' + str(pos) for pos in range(1, len(ecfp_fps[0]) + 1)]

# CID_list / act_list come from the MACCS training cell; rows are matched by position.
assert len(CID_list) == len(act_list) == len(ecfp_fps), 'CID/activity/ECFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
ecfp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, ecfp_fps)
]

# Flat column labels (assigning `[ecfp_names]` created a one-level MultiIndex).
input_train_ecfp = pd.DataFrame(ecfp_rows, columns=ecfp_names)

#removing decimals
input_train_ecfp = input_train_ecfp.astype(int)

In [26]:
# Cast every column (Name and Activity included) from int to float, as the
# author expects scikit-learn to want float input.
input_train_ecfp = input_train_ecfp.astype(float)

In [27]:
# Write the ECFP training table without the index. No `sep` is passed, so
# to_csv uses its default comma separator (the file is not tab-delimited).
input_train_ecfp.to_csv('input_train_ecfp.csv', index = False)

# Formatting - Morgan (fcfp)

In [28]:
# Per-bit Morgan FCFP table: one row per compound, one integer column per bit.

# Column labels: 'Name' followed by FCFP1..FCFPn, n being the bit-string length.
fcfp_names = ['Name'] + ['FCFP' + str(pos) for pos in range(1, len(fcfp_fps[0]) + 1)]

# CID_list comes from the MACCS formatting cells; rows are matched by position.
assert len(CID_list) == len(fcfp_fps), 'CID_list and fcfp_fps are out of sync'

# Build all rows first and create the frame once; DataFrame.append in a loop
# is quadratic and was removed in pandas 2.x.
fcfp_rows = [[cid] + [int(bit) for bit in fp] for cid, fp in zip(CID_list, fcfp_fps)]

# Flat column labels (assigning `[fcfp_names]` created a one-level MultiIndex).
fcfp_df = pd.DataFrame(fcfp_rows, columns=fcfp_names)

#removing decimals
fcfp_df = fcfp_df.astype(int)


In [29]:
# Save the per-bit FCFP table (Name + FCFP columns) as CSV, without the index.
fcfp_df.to_csv('fcfp_fps.csv', index = False)

In [30]:
# Training input for Morgan FCFP: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', FCFP1..FCFPn.
fcfp_names = ['Name', 'Activity'] + ['FCFP' + str(pos) for pos in range(1, len(fcfp_fps[0]) + 1)]

# CID_list / act_list come from the MACCS training cell; rows are matched by position.
assert len(CID_list) == len(act_list) == len(fcfp_fps), 'CID/activity/FCFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
fcfp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, fcfp_fps)
]

# Flat column labels (assigning `[fcfp_names]` created a one-level MultiIndex).
input_train_fcfp = pd.DataFrame(fcfp_rows, columns=fcfp_names)

#removing decimals
input_train_fcfp = input_train_fcfp.astype(int)

In [31]:
# Cast every column (Name and Activity included) from int to float, as the
# author expects scikit-learn to want float input.
input_train_fcfp = input_train_fcfp.astype(float)

In [32]:
# Write the FCFP training table as CSV (default comma separator), without the index.
input_train_fcfp.to_csv('input_train_fcfp.csv', index = False)

# Formatting - PubchemFP

In [33]:
# Per-bit PubChem fingerprint table: one row per compound, one integer column per bit.

# Column labels: 'Name' followed by PubFP1..PubFPn, n being the bit-string length.
pub_fp_names = ['Name'] + ['PubFP' + str(pos) for pos in range(1, len(pub_fp_decoded[0]) + 1)]

# CID_list comes from the MACCS formatting cells; rows are matched by position.
assert len(CID_list) == len(pub_fp_decoded), 'CID_list and pub_fp_decoded are out of sync'

# Build all rows first and create the frame once; DataFrame.append in a loop
# is quadratic and was removed in pandas 2.x.
pub_fp_rows = [[cid] + [int(bit) for bit in fp] for cid, fp in zip(CID_list, pub_fp_decoded)]

# Flat column labels (assigning `[pub_fp_names]` created a one-level MultiIndex).
pub_fp_df = pd.DataFrame(pub_fp_rows, columns=pub_fp_names)

#removing decimals
pub_fp_df = pub_fp_df.astype(int)

In [34]:
# Save the per-bit PubChem fingerprint table (Name + PubFP columns) as CSV, without the index.
pub_fp_df.to_csv('pub_fps.csv', index = False)

In [35]:
# Training input for the PubChem fingerprint: CID, activity label, bits.

# Column labels: 'Name', 'Activity', PubFP1..PubFPn.
pub_fp_names = ['Name', 'Activity'] + ['PubFP' + str(pos) for pos in range(1, len(pub_fp_decoded[0]) + 1)]

# CID_list / act_list come from the MACCS training cell; rows are matched by position.
assert len(CID_list) == len(act_list) == len(pub_fp_decoded), 'CID/activity/PubFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
pub_fp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, pub_fp_decoded)
]

# Flat column labels (assigning `[pub_fp_names]` created a one-level MultiIndex).
input_train_pub = pd.DataFrame(pub_fp_rows, columns=pub_fp_names)

#removing decimals
input_train_pub = input_train_pub.astype(int)

In [36]:
# Cast every column (Name and Activity included) from int to float, as the
# author expects scikit-learn to want float input.
input_train_pub = input_train_pub.astype(float)

In [37]:
# Write the PubChem-fingerprint training table as CSV (default comma separator), without the index.
input_train_pub.to_csv('input_train_pub.csv', index = False)

# Repeating Process for Test Set

In [41]:

# Load the test-set table. NOTE: this rebinds `data` (and, below, every
# fingerprint variable) that previously held the training set, so the
# training cells above must not be re-run after this cell without reloading.
data = pd.read_csv('Test_Tox21')
#data = data.astype(float, errors = 'ignore')
# Not the last expression of the cell, so this preview is not displayed.
data.head()

# MACCS Keys

# Select the isomeric SMILES column (a pandas Series, not a list).
iSMILES = data['IsomericSMILES']
# Not displayed either; it only checks that position/label 0 exists.
iSMILES[0]

#Making list of molecules from iSMILES
# Iterate over the SMILES values themselves rather than looking them up by
# label (iSMILES[i]), so the result does not depend on the Series carrying a
# default 0..n-1 index.
ms = [Chem.MolFromSmiles(smiles) for smiles in iSMILES]

# MolFromSmiles returns None for a string it cannot parse. Stop here with the
# offending row positions instead of failing later inside a fingerprint call.
unparsed_positions = [pos for pos, mol in enumerate(ms) if mol is None]
if unparsed_positions:
    raise ValueError(
        'Could not parse IsomericSMILES at row position(s): ' + str(unparsed_positions)
    )

# MACCS key fingerprint of every test molecule as a '0'/'1' string.
# The [1:] slice drops the first character of each bit string (presumably
# RDKit's unused bit 0 — TODO confirm).
maccs_fps = [MACCSkeys.GenMACCSKeys(x).ToBitString()[1:] for x in ms ]

# Topological Fingerprint (Daylight Analogue)

from rdkit.Chem import rdmolops

# RDKit topological (path-based) fingerprint, requested with fpSize=2048 and
# paths of 1 to 7 bonds, stored as a '0'/'1' string.
top_fps = [rdmolops.RDKFingerprint(x, fpSize=2048, minPath=1, maxPath=7).ToBitString() for x in ms]

# Morgan Fingerprint (ECFP)

from rdkit.Chem import AllChem

# Morgan fingerprint as a 1024-character '0'/'1' string.
# NOTE(review): the second positional argument (4) is the Morgan radius —
# confirm that radius 4 (rather than 2, i.e. ECFP4) is intended.
ecfp_fps = [AllChem.GetMorganFingerprintAsBitVect(x,4,nBits=1024).ToBitString() for x in ms]

# Morgan Fingerprint (FCFP)

# Same call as above with useFeatures=True (feature-based, FCFP-style variant).
fcfp_fps = [AllChem.GetMorganFingerprintAsBitVect(x,4,nBits=1024,useFeatures=True).ToBitString() for x in ms]

# PubChem FP Decoding

#Getting CID list
CID_list = data['PUBCHEM_CID']

#Making them integers (removing decimals)
CID_list = [int(i) for i in CID_list]

#Getting the Pubchem Fingerprints for each CID
# Request the fingerprints in batches of 600 CIDs. Taking only CID_list[:600]
# silently dropped every compound past the 600th, which then broke the
# length check when the fingerprints were added to the DataFrame. With 600
# or fewer compounds this still issues exactly one request, as before.
batch_size = 600
pub_fp = []
for start in range(0, len(CID_list), batch_size):
    # Comma-separated CIDs without spaces, as placed in the PUG REST URL.
    cid_batch = ','.join(str(cid) for cid in CID_list[start:start + batch_size])
    batch_url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'
                 + cid_batch + '/property/Fingerprint2D/TXT')
    # The context manager closes the HTTP response once it has been read.
    with urlopen(batch_url) as response:
        batch_text = response.read().decode('utf-8')
    # The TXT output is one base64 fingerprint per line; split each response
    # separately so fingerprints from different batches can never fuse.
    pub_fp.extend(batch_text.split())

#Decoding Fingerprints
from base64 import b64decode

def PCFP_BitString(pcfp_base64):
    """Turn one base64-encoded fingerprint into a '0'/'1' string.

    This repeats the definition from the training section with the same
    behaviour: each decoded byte becomes eight binary digits, the digits are
    joined, and the slice [32:913] of the joined string is returned (881
    characters when the input is long enough).

    Parameters
    ----------
    pcfp_base64 : str or bytes
        Base64 text of one fingerprint.

    Returns
    -------
    str
        The selected part of the bit string.
    """
    byte_chunks = []
    for byte_value in b64decode(pcfp_base64):
        byte_chunks.append("{:08b}".format(byte_value))
    return "".join(byte_chunks)[32:913]

# Decode every base64 fingerprint into its bit string, keeping the order of
# `pub_fp` (and therefore of the CIDs that were requested).
pub_fp_decoded = [PCFP_BitString(encoded_fp) for encoded_fp in pub_fp]

# FinalDFTox21

# One row per test compound: its CID, the five fingerprint bit strings and
# the activity label. The two Series supply the index; the fingerprint lists
# must have the same length as `data`.
fps_df = pd.DataFrame({
    'Name': data['PUBCHEM_CID'],
    'MACCS': maccs_fps,
    'Topological': top_fps,
    'Morgan ECFP': ecfp_fps,
    'Morgan FCFP': fcfp_fps,
    'Pubchem FP': pub_fp_decoded,
    'Activity': data['target'],
})
fps_df.head()

# Save the combined test fingerprint table as CSV, without the index column.
fps_df.to_csv('Fingerprints Tox21 Test', index = False)




# Formatting - MACCS

# Test input for MACCS: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', MACCS1..MACCSn.
maccs_names = ['Name', 'Activity'] + ['MACCS' + str(pos) for pos in range(1, len(maccs_fps[0]) + 1)]

# Identifier and label lists of the test set; the following test cells reuse both.
CID_list = fps_df['Name'].tolist()
act_list = fps_df['Activity'].tolist()

# Rows are matched by position, so all three lists must be the same length.
assert len(CID_list) == len(act_list) == len(maccs_fps), 'CID/activity/MACCS lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
maccs_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, maccs_fps)
]

# Flat column labels (assigning `[maccs_names]` created a one-level MultiIndex).
input_test_maccs = pd.DataFrame(maccs_rows, columns=maccs_names)

#removing decimals
# Cast the TEST frame: the previous code cast `input_train_maccs` here and so
# replaced the freshly built test table with the training data.
input_test_maccs = input_test_maccs.astype(int)

#Changing int to float (because scikit learn wants it like that)
input_test_maccs = input_test_maccs.astype(float)

#Writing test MACCS DATA
# Write the TEST frame: the previous code wrote `input_train_maccs` into
# 'input_test_maccs.csv', so the test file contained training rows.
input_test_maccs.to_csv('input_test_maccs.csv', index = False)








# Formatting - Topological Fingerprint 


# Test input for the topological fingerprint: CID, activity label, bits.

# Column labels: 'Name', 'Activity', TOP1..TOPn.
top_names = ['Name', 'Activity'] + ['TOP' + str(pos) for pos in range(1, len(top_fps[0]) + 1)]

# CID_list / act_list come from the test MACCS block; rows are matched by position.
assert len(CID_list) == len(act_list) == len(top_fps), 'CID/activity/TOP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
top_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, top_fps)
]

# Flat column labels (assigning `[top_names]` created a one-level MultiIndex).
input_test_top = pd.DataFrame(top_rows, columns=top_names)

#removing decimals
input_test_top = input_test_top.astype(int)

#Changing int to float (because scikit learn wants it like that)
input_test_top = input_test_top.astype(float)

#Writing test Topological DATA
input_test_top.to_csv('input_test_top.csv', index = False)







# Formatting - Morgan (ecfp)

# Test input for Morgan ECFP: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', ECFP1..ECFPn.
ecfp_names = ['Name', 'Activity'] + ['ECFP' + str(pos) for pos in range(1, len(ecfp_fps[0]) + 1)]

# CID_list / act_list come from the test MACCS block; rows are matched by position.
assert len(CID_list) == len(act_list) == len(ecfp_fps), 'CID/activity/ECFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
ecfp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, ecfp_fps)
]

# Flat column labels (assigning `[ecfp_names]` created a one-level MultiIndex).
input_test_ecfp = pd.DataFrame(ecfp_rows, columns=ecfp_names)

#removing decimals
input_test_ecfp = input_test_ecfp.astype(int)

#Changing int to float (because scikit learn wants it like that)
input_test_ecfp = input_test_ecfp.astype(float)

#Writing Test DATA
input_test_ecfp.to_csv('input_test_ecfp.csv', index = False)





# Formatting - Morgan (fcfp)

# Test input for Morgan FCFP: CID, activity label, then one column per bit.

# Column labels: 'Name', 'Activity', FCFP1..FCFPn.
fcfp_names = ['Name', 'Activity'] + ['FCFP' + str(pos) for pos in range(1, len(fcfp_fps[0]) + 1)]

# CID_list / act_list come from the test MACCS block; rows are matched by position.
assert len(CID_list) == len(act_list) == len(fcfp_fps), 'CID/activity/FCFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
fcfp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, fcfp_fps)
]

# Flat column labels (assigning `[fcfp_names]` created a one-level MultiIndex).
input_test_fcfp = pd.DataFrame(fcfp_rows, columns=fcfp_names)

#removing decimals
input_test_fcfp = input_test_fcfp.astype(int)

#Changing int to float (because scikit learn wants it like that)
input_test_fcfp = input_test_fcfp.astype(float)

#Writing test DATA to csv
input_test_fcfp.to_csv('input_test_fcfp.csv', index = False)




# Formatting - PubchemFP

# Test input for the PubChem fingerprint: CID, activity label, bits.

# Column labels: 'Name', 'Activity', PubFP1..PubFPn.
pub_fp_names = ['Name', 'Activity'] + ['PubFP' + str(pos) for pos in range(1, len(pub_fp_decoded[0]) + 1)]

# CID_list / act_list come from the test MACCS block; rows are matched by position.
assert len(CID_list) == len(act_list) == len(pub_fp_decoded), 'CID/activity/PubFP lists are out of sync'

# Build every row up front and create the frame once; DataFrame.append in a
# loop is quadratic and was removed in pandas 2.x.
pub_fp_rows = [
    [cid, act] + [int(bit) for bit in fp]
    for cid, act, fp in zip(CID_list, act_list, pub_fp_decoded)
]

# Flat column labels (assigning `[pub_fp_names]` created a one-level MultiIndex).
input_test_pub = pd.DataFrame(pub_fp_rows, columns=pub_fp_names)

#removing decimals
input_test_pub = input_test_pub.astype(int)

#Changing int to float (because scikit learn wants it like that)
input_test_pub = input_test_pub.astype(float)

#Writing test DATA to csv
input_test_pub.to_csv('input_test_pub.csv', index = False)

