# Morgan Fingerprint Feature Creation

### Convert the raw data into a form usable for fingerprint creation

The data imported from Metrabase/ChEMBL (identical in both databases)
has a unique PUBCHEM_SID for each entry.

See README for details on conversions.

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from rdkit import Chem, DataStructs
from rdkit.Chem import rdchem
from rdkit.Chem import Mol
from rdkit.Chem import AllChem

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [2]:
# Load the cleaned cyto assay table from CSV.
# NOTE(review): no rows are removed here; any header-row cleanup presumably
# happened upstream when cyto_assay_clean.csv was produced -- confirm.
cyto_assay = pd.read_csv('../data/train_data/cyto_assay_clean.csv')

### Separate PUBCHEM_SID to use in PubChem's Data Download Tool to get SMILES data

In [3]:
# Pull the PUBCHEM_SID column out of the assay table; these IDs are submitted
# to PubChem's Data Download Tool to obtain the matching SMILES strings.
pubchemsid = cyto_assay.loc[:, 'PUBCHEM_SID']

# Persist the SIDs to disk.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# alongside the SIDs -- confirm the download tool accepts that layout.
pubchemsid.to_csv(path_or_buf='../data/conversion_data/pubchemSID.csv')

#### Read in SIDtoSMILES data to prep for use with rdkit

In [4]:
# Load the whitespace-separated SID -> SMILES mapping so it can be fed to rdkit.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
# NOTE(review): read_csv treats the first line of the file as a header here; if
# SIDtoSMILES.txt has no header row, the first record is being consumed as
# column names -- confirm against the file.
SID_smiles = pd.read_csv('../data/conversion_data/SIDtoSMILES.txt', sep=r'\s+')

# Give the two columns stable names used by the rest of the notebook
SID_smiles.columns = ['PUBCHEM_SID', 'SMILES']
SID_smiles.head()

Unnamed: 0,PUBCHEM_SID,SMILES
0,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br
1,842319,C1=CC=NC(=C1)C=NNC(=O)C2=CC=C(C=C2)N3C=CC=C3
2,842408,CC1=C(C=C(C=C1)NC(=O)CCCC2=CC=CC=C2)[N+](=O)[O-]
3,842584,CCC[C@@H](C)C(=O)NC1=CC=CC=C1F
4,842618,C1OC2=C(O1)C=C(C=C2)C=CC(=O)NC3=NC=CC=N3


### Remove SID from SMILES data so SMILES data is useable in rdkit

In [5]:
# Keep only the SMILES strings: rdkit consumes the SMILES column by itself,
# so the PUBCHEM_SID column is dropped.
remove_SID = SID_smiles.drop('PUBCHEM_SID', axis=1)

# Write the SMILES to CSV without the row index (the column header is kept)
remove_SID.to_csv('../data/conversion_data/smiles_only.csv', index=False)

remove_SID.shape

(17142, 1)

### Iterate through smiles data to create mol_list

In [6]:
# Read the SMILES-only file back in and parse every SMILES string into an
# rdkit Mol with MolFromSmiles.
smiles_list = pd.read_csv('../data/conversion_data/smiles_only.csv')

# Iterate the first column directly: this avoids the slow row-by-row iterrows()
# and the deprecated integer-key positional lookup (data[0]) on a
# label-indexed row.
# NOTE(review): MolFromSmiles returns None for SMILES it cannot parse; such
# entries are kept so that mol_list stays aligned with the input rows.
mol_list = [Chem.MolFromSmiles(smiles) for smiles in smiles_list.iloc[:, 0]]
type(mol_list)

list

In [7]:
# Sanity check: show the container type of the SMILES table read from CSV
type(smiles_list)

pandas.core.frame.DataFrame

### Hashed Morgan Fingerprint and Fingerprint as Vector

#### Create Morgan Fingerprint data

In [8]:
# Compute a radius-2 Morgan fingerprint for every molecule with RDKit's
# GetMorganFingerprint, then expose the row number as an explicit 'index' column.
fp1 = pd.DataFrame([AllChem.GetMorganFingerprint(mol, 2) for mol in mol_list]).reset_index()
fp1.columns = ['index', 'FINGERPRINT']

# Persist the fingerprint table for later steps
fp1.to_pickle('../data/conversion_data/fingerprints.pkl')

#### Convert hashed Morgan fingerprint to Bit Vector

In [9]:
# Compute radius-2 Morgan fingerprints folded to 128 bits, one bit vector per molecule
fp1_morgan_hash = pd.DataFrame(
    [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=128) for mol in mol_list]
)

# Expose the row number as an explicit 'index' column and name the columns
fp1_morgan_hash = fp1_morgan_hash.reset_index()
fp1_morgan_hash.columns = ['index', 'MORGAN_HASH']

# Discard rows with missing values before persisting
fp1_morgan_hash = fp1_morgan_hash.dropna()

fp1_morgan_hash.to_pickle('../data/conversion_data/morgan_hash.pkl')

# Positional lookup: label 0 would be missing if dropna() removed the first row
type(fp1_morgan_hash['MORGAN_HASH'].iloc[0])

rdkit.DataStructs.cDataStructs.ExplicitBitVect

#### Convert Bit Vector to Bit String
For easier manipulation.

In [10]:
# Convert every bit vector to its bit-string form via ToBitString().
# The 'index' column is carried over from fp1_morgan_hash so both tables share
# a common key. Regenerating a fresh 0..n-1 index here would stop matching
# fp1_morgan_hash['index'] whenever dropna() removed rows from that table.
fp1_morgan_bitstr = pd.DataFrame({
    'index': fp1_morgan_hash['index'].to_numpy(),
    'MORGAN_BTSR': [item.ToBitString() for item in fp1_morgan_hash['MORGAN_HASH']],
})

fp1_morgan_bitstr.to_pickle('../data/conversion_data/morgan_bitstr.pkl')

### Merge Smiles data into cyto_assay

In [11]:
# Inner-join the SMILES table onto the assay table using PUBCHEM_SID as the key
smiles_merged = pd.merge(SID_smiles, cyto_assay, how="inner", on="PUBCHEM_SID")

# Persist the merged table for later steps
smiles_merged.to_pickle('../data/conversion_data/smiles_merged.pkl')

type(smiles_merged)

pandas.core.frame.DataFrame