# Morgan Fingerprint Feature creation for each Cytochrome P450 molecule 
(that we are using for this model)


### Imports

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from rdkit import Chem, DataStructs
from rdkit.Chem import rdchem
from rdkit.Chem import Mol
from rdkit.Chem import AllChem

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [7]:
# Import data
cyto_assay = pd.read_csv('../data/train_data/cyto_assay_clean.csv')
smiles_merged = pd.read_pickle('../data/conversion_data/smiles_merged.pkl')

KEY  

* 'p450-cyp2c19' : 0
* 'p450-cyp2c9'  : 1
* 'p450-cyp2d6'  : 2
* 'p450-cyp1a2'  : 3
* 'p450-cyp3a4'  : 4 

In [None]:
smiles_merged.head()

## cyp2c19

In [None]:
# Pull out only cyp2c19 data
cyp2c19 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19.shape)

In [None]:
# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19['MORGAN_BTSTR'] = cyp2c19.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19['MORGAN_BTSTR']])

btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19 = cyp2c19[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19 = cyp2c19.merge(btstr_col, how="inner", on="index")
cyp2c19.to_pickle("data/cyp_datasets/cyp2c19.pkl")

cyp2c19.head()

### 128bit (previous was 64)

In [None]:
# Pull out only cyp2c19 data
cyp2c19_128 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19_128.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19_128['MORGAN_BTSTR'] = cyp2c19_128.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=128).ToBitString(), axis=1)

cyp2c19_128.head()

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19_128['MORGAN_BTSTR']])

btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19_128 = cyp2c19_128[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19_128 = cyp2c19_128.merge(btstr_col, how="inner", on="index")
cyp2c19_128.to_pickle("data/cyp_datasets/cyp2c19_128.pkl")

cyp2c19_128.head()

## 512bit

In [None]:
# Pull out only cyp2c19 data
cyp2c19_512 =  smiles_merged[smiles_merged['Panel Name'] == 0]
print(cyp2c19_512.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c19_512['MORGAN_BTSTR'] = cyp2c19_512.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=512).ToBitString(), axis=1)

# Change bitstring to columns as features - idea adapted from https://stackoverflow.com/questions/56458440/pandas-how-to-read-a-bitstring-into-separate-columns-when-no-delimiter-is-prese
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c19_512['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c19_512 = cyp2c19_512[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c19_512 = cyp2c19_512.merge(btstr_col, how="inner", on="index")
cyp2c19_512.to_pickle("data/cyp_datasets/cyp2c19_128.pkl")

cyp2c19_512.head()

## cyp2c9

In [None]:
# Pull out only cyp2c9 data
cyp2c9 =  smiles_merged[smiles_merged['Panel Name'] == 1]
print(cyp2c9.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2c9['MORGAN_BTSTR'] = cyp2c9.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2c9['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2c9 = cyp2c9[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2c9 = cyp2c9.merge(btstr_col, how="inner", on="index")
cyp2c9.to_pickle("data/cyp_datasets/cyp2c9.pkl")

## cyp1a2

In [None]:
# Pull out only cyp1a2 data
cyp1a2 =  smiles_merged[smiles_merged['Panel Name'] == 2]
print(cyp1a2.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp1a2['MORGAN_BTSTR'] = cyp1a2.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp1a2['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp1a2 = cyp1a2[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp1a2 = cyp1a2.merge(btstr_col, how="inner", on="index")
cyp1a2.to_pickle("data/cyp_datasets/cyp1a2.pkl")

## cyp2d6

In [None]:
# Pull out only cyp2d6 data
cyp2d6 =  smiles_merged[smiles_merged['Panel Name'] == 3]
print(cyp2d6.shape)

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp2d6['MORGAN_BTSTR'] = cyp2d6.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp2d6['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp2d6 = cyp2d6[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp2d6 = cyp2d6.merge(btstr_col, how="inner", on="index")
cyp2d6.to_pickle("data/cyp_datasets/cyp2d6.pkl")

## cyp3a4

In [None]:
# Pull out only cyp3a4 data
cyp3a4 =  smiles_merged[smiles_merged['Panel Name'] == 4]
print(cyp3a4.shape)

cyp3a4.head()

# Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
cyp3a4['MORGAN_BTSTR'] = cyp3a4.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

# Change bitstring to columns as features
btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in cyp3a4['MORGAN_BTSTR']])
btstr_col.reset_index(inplace=True)

# Parse down to only necessart features and save to file
cyp3a4 = cyp3a4[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed"]]
cyp3a4 = cyp3a4.merge(btstr_col, how="inner", on="index")
cyp3a4.to_pickle("data/cyp3a4.pkl")

# Function to do the above on any assay 
### (that have been cleaned and Panel Name set to int)

In [3]:
# data = dataset, num = panel name value relating to cyp
def cyp_morgan_fp(data, num):
    data =  smiles_merged[smiles_merged['Panel Name'] == num]
    print(data.shape)

    # Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
    data['MORGAN_BTSTR'] = data.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

    # Change bitstring to columns as features
    btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in data['MORGAN_BTSTR']])
    btstr_col.reset_index(inplace=True)

    # Parse down to only necessart features and save to file
    cyp_df = data[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed", "Panel Name"]]
    cyp_df_fp = cyp_df.merge(btstr_col, how="inner", on="index")
    cyp_df_fp.to_csv('../data/cyp_datasets/cyp_df_fp_{}.csv'.format(num), sep=',', index=False)
    return cyp_df_fp

In [None]:
cyp_morgan_fp(cyto_assay, 4)

# Same function
### but doesn't split panel name(CYP)

###  _ _ We can probably get rid of morgan_fp_create.ipynb _ _

In [12]:
# data = dataset, num = panel name value relating to cyp
def morgan_fp(data):
    
    data =  smiles_merged
    
    # Convert MolFromSmiles to GetMorganFingerprintAsBitVect to ToBitString to get bitstrings for SMILES data
    data['MORGAN_BTSTR'] = data.apply(lambda row: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row['SMILES']), 2, nBits=64).ToBitString(), axis=1)

    # Change bitstring to columns as features
    btstr_col = pd.DataFrame([np.fromstring(s,'u1') - ord('0') for s in data['MORGAN_BTSTR']])
    btstr_col.reset_index(inplace=True)

    # Parse down to only necessart features and save to file
    df_fp = data[["index", "PUBCHEM_ACTIVITY_OUTCOME", "Inhibition Observed", "Panel Name"]]
    fingerprint_df = df_fp.merge(btstr_col, how="inner", on="index")
    fingerprint_df.to_csv('../data/cyp_datasets/fingerprint_df.csv', index=False)
    return fingerprint_df

In [13]:
morgan_fp(cyto_assay)

Unnamed: 0,index,PUBCHEM_ACTIVITY_OUTCOME,Inhibition Observed,Panel Name,0,1,2,3,4,5,...,54,55,56,57,58,59,60,61,62,63
0,5,2,1,0,1,1,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
1,6,2,1,2,1,1,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
2,7,1,1,4,1,1,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
3,8,1,1,3,1,1,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
4,9,2,1,1,1,1,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85700,85705,0,0,0,1,1,0,0,1,1,...,1,1,0,0,0,1,1,1,1,1
85701,85706,0,0,2,1,1,0,0,1,1,...,1,1,0,0,0,1,1,1,1,1
85702,85707,0,0,4,1,1,0,0,1,1,...,1,1,0,0,0,1,1,1,1,1
85703,85708,0,0,3,1,1,0,0,1,1,...,1,1,0,0,0,1,1,1,1,1
