# Morgan Fingerprint Feature creation for each Cytochrome P450 molecule 
(that we are using for this model)


### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from rdkit import Chem, DataStructs
from rdkit.Chem import rdchem
from rdkit.Chem import Mol
from rdkit.Chem import AllChem

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [2]:
# Import data and remove unnecessary header rows
cyto_assay = pd.read_csv('data/train_data/cyto_assay_clean.csv', skipinitialspace=True, header=[0,4])
smiles_merged = pd.read_pickle('data/conversion_data/smiles_merged.pkl')

KEY  

* 'p450-cyp2c19' : 0
* 'p450-cyp2c9'  : 1
* 'p450-cyp2d6'  : 2
* 'p450-cyp1a2'  : 3
* 'p450-cyp3a4'  : 4 

In [3]:
smiles_merged.head()

Unnamed: 0,PUBCHEM_SID,SMILES,index,PUBCHEM_RESULT_TAG,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Inhibition Observed,...,Activity at 11.43 uM,Activity at 25.56 uM,Activity at 57.14 uM,Activity at 0.00164 uM,Activity at 0.00366 uM,Activity at 0.00818 uM,Compound QC,Panel ID,Panel Name,Panel Target
0,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br,5,6,644510.0,2,,,,1,...,-15.4511,,-48.2892,,,,QC'd by DPISMR,1,0,NP_000760.1
1,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br,6,7,644510.0,2,,,,1,...,-28.1621,,-58.7064,,,,QC'd by DPISMR,2,2,NP_001020332.1
2,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br,7,8,644510.0,1,,,,1,...,-82.0127,,-97.4718,,-21.1766,,QC'd by DPISMR,3,4,NP_059488.2
3,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br,8,9,644510.0,1,,,,1,...,-59.959,,-84.6738,,,,QC'd by DPISMR,4,3,NP_000752.2
4,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br,9,10,644510.0,2,,,,1,...,-50.9127,,-70.5259,,,,QC'd by DPISMR,5,1,NP_000762.2


## cyp2c19

In [4]:
# Pull out only cyp2c19 data
cyp2c19 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19.shape)

(17142, 46)


In [5]:
# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19['MORGAN_BTSTR'] = cyp2c19.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19['MORGAN_BTSTR']])

btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19 = cyp2c19[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19 = cyp2c19.merge(btstr_col, how="inner", on="index")
cyp2c19.to_pickle("data/cyp_datasets/cyp2c19.pkl")

cyp2c19.head()

Unnamed: 0,index,PUBCHEM_ACTIVITY_OUTCOME,Inhibition Observed,0,1,2,3,4,5,6,...,54,55,56,57,58,59,60,61,62,63
0,5,2,1,1,1,1,0,1,1,0,...,1,1,0,0,0,0,0,1,0,0
1,10,2,1,1,0,0,0,1,1,0,...,0,0,0,1,1,1,0,0,1,0
2,15,1,1,0,0,0,0,0,0,0,...,1,1,1,0,1,0,0,1,0,0
3,20,1,1,0,0,0,0,0,1,0,...,1,0,1,1,1,0,0,1,0,1
4,25,1,1,1,1,1,1,1,0,0,...,0,1,1,0,1,1,0,1,0,1


### 128bit (previous was 64)

In [6]:
# Pull out only cyp2c19 data
cyp2c19_128 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19_128.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19_128['MORGAN_BTSTR'] = cyp2c19_128.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=128).ToBitString(), axis=1)

cyp2c19_128.head()

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19_128['MORGAN_BTSTR']])

btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19_128 = cyp2c19_128[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19_128 = cyp2c19_128.merge(btstr_col, how="inner", on="index")
cyp2c19_128.to_pickle("data/cyp_datasets/cyp2c19_128.pkl")

cyp2c19_128.head()

(17142, 46)


Unnamed: 0,index,PUBCHEM_ACTIVITY_OUTCOME,Inhibition Observed,0,1,2,3,4,5,6,...,118,119,120,121,122,123,124,125,126,127
0,5,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,10,2,1,0,0,0,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
2,15,1,1,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
3,20,1,1,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,1,0,0
4,25,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## 512bit

In [7]:
# Pull out only cyp2c19 data
cyp2c19_512 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19_512.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19_512['MORGAN_BTSTR'] = cyp2c19_512.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=512).ToBitString(), axis=1)

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19_512['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19_512 = cyp2c19_512[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19_512 = cyp2c19_512.merge(btstr_col, how="inner", on="index")
cyp2c19_512.to_pickle("data/cyp_datasets/cyp2c19_128.pkl")

cyp2c19_512.head()

(17142, 46)


Unnamed: 0,index,PUBCHEM_ACTIVITY_OUTCOME,Inhibition Observed,0,1,2,3,4,5,6,...,502,503,504,505,506,507,508,509,510,511
0,5,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,20,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,25,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## cyp2c9

In [8]:
# Pull out only cyp2c9 data
cyp2c9 =  smiles_merged[smiles_merged['Panel Name'] == 1]
print(cyp2c9.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c9['MORGAN_BTSTR'] = cyp2c9.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c9['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c9 = cyp2c9[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c9 = cyp2c9.merge(btstr_col, how="inner", on="index")
cyp2c9.to_pickle("data/cyp_datasets/cyp2c9.pkl")

(17142, 46)


## cyp1a2

In [9]:
# Pull out only cyp1a2 data
cyp1a2 =  smiles_merged[smiles_merged['Panel Name'] == 2]
print(cyp1a2.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp1a2['MORGAN_BTSTR'] = cyp1a2.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp1a2['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp1a2 = cyp1a2[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp1a2 = cyp1a2.merge(btstr_col, how="inner", on="index")
cyp1a2.to_pickle("data/cyp_datasets/cyp1a2.pkl")

(17142, 46)


## cyp2d6

In [10]:
# Pull out only cyp2d6 data
cyp2d6 =  smiles_merged[smiles_merged['Panel Name'] == 3]
print(cyp2d6.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2d6['MORGAN_BTSTR'] = cyp2d6.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2d6['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2d6 = cyp2d6[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2d6 = cyp2d6.merge(btstr_col, how="inner", on="index")
cyp2d6.to_pickle("data/cyp_datasets/cyp2d6.pkl")

(17142, 46)


## cyp3a4

In [12]:
# Pull out only cyp3a4 data
cyp3a4 =  smiles_merged[smiles_merged['Panel Name'] == 4]
print(cyp3a4.shape)

cyp3a4.head()

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp3a4['MORGAN_BTSTR'] = cyp3a4.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp3a4['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp3a4 = cyp3a4[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp3a4 = cyp3a4.merge(btstr_col, how="inner", on="index")
cyp3a4.to_pickle("data/cyp3a4.pkl")

(17142, 46)
