Cleaning

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from rdkit import Chem, DataStructs
from rdkit.Chem import rdchem
from rdkit.Chem import Mol
from rdkit.Chem import AllChem

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw assay export. Rows 0 and 4 of the file become a two-level column
# header; the description rows between them are skipped by pandas.
cyto_assay = pd.read_csv(
    'data/AID_1851_datatable_all.csv',
    header=[0, 4],
    skipinitialspace=True,
)

In [3]:
# Drop the bottom level of the multi-index
cyto_assay.columns = cyto_assay.columns.droplevel(1)

In [4]:
# Encode the boolean 'Inhibition Observed' flag as 1/0 integers.
inhibition_flags = cyto_assay['Inhibition Observed'].map({True: 1, False: 0})
cyto_assay['Inhibition Observed'] = inhibition_flags.astype(int)

In [5]:
cyto_assay.shape

(85715, 44)

In [6]:
cyto_assay.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Inhibition Observed,Approved Drug,Collection,...,Activity at 11.43 uM,Activity at 25.56 uM,Activity at 57.14 uM,Activity at 0.00164 uM,Activity at 0.00366 uM,Activity at 0.00818 uM,Compound QC,Panel ID,Panel Name,Panel Target
0,1,842238,6602638.0,Inactive,,,,1,Biodiverse,,...,8.1486,,-15.6281,,,,QC'd by DPISMR,1,p450-cyp2c19,NP_000760.1
1,2,842238,6602638.0,Inconclusive,,,,1,Biodiverse,,...,-8.5062,,-37.6308,,,,QC'd by DPISMR,2,p450-cyp2d6,NP_001020332.1
2,3,842238,6602638.0,Inactive,,,,1,Biodiverse,,...,0.4027,,-2.2935,,-1.3363,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
3,4,842238,6602638.0,Inactive,,,,1,Biodiverse,,...,-7.392,,-11.9048,,,,QC'd by DPISMR,4,p450-cyp1a2,NP_000752.2
4,5,842238,6602638.0,Inconclusive,,,,1,Biodiverse,,...,-41.7035,,-10.7694,,,,QC'd by DPISMR,5,p450-cyp2c9,NP_000762.2


In [7]:
cyto_assay['Inhibition Observed'].unique()

array([1, 0])

In [8]:
cyto_assay['PUBCHEM_SID']

0          842238
1          842238
2          842238
3          842238
4          842238
           ...   
85710    26751441
85711    26751441
85712    26751441
85713    26751441
85714    26751441
Name: PUBCHEM_SID, Length: 85715, dtype: int64

In [9]:
# Separate out SID to use in PubChem's Data Download Tool to get SMILES data
pubchemsid = cyto_assay['PUBCHEM_SID']

In [10]:
# Save SID files
pubchemsid.to_csv('data/pubchemSID.csv', index=False)

## Combine SMILES with cyto_assay

In [11]:
# From https://github.com/comp-pharm/ml-psychs/blob/main/src/data/make_dataset.py and heavily modified

def make_data(
    datatable_path="data/AID_1851_datatable_all.csv",
    smiles_path="data/SIDtoSMILES.txt",
    output_path="data/SMILES_to_Activity.csv",
):
    """Join assay activity outcomes with SMILES strings and save them as CSV.

    Parameters
    ----------
    datatable_path : str
        CSV export of the assay data table. Lines 1-4 (the rows directly under
        the header line) are skipped when reading.
    smiles_path : str
        Whitespace-separated text file without a header line; each line holds
        a PUBCHEM_SID followed by its SMILES string.
    output_path : str
        Destination CSV. It contains the row index plus the columns
        ``SMILES`` and ``PUBCHEM_ACTIVITY_OUTCOME``.

    Returns
    -------
    None
        The result is written to ``output_path``.
    """
    # Load the SID to Activity data
    datatable = pd.read_csv(datatable_path, skipinitialspace=True, skiprows=[1, 2, 3, 4])
    sid_to_outcome = datatable[["PUBCHEM_SID", "PUBCHEM_ACTIVITY_OUTCOME"]]
    # Encode the outcome labels as integers; any other value is left untouched.
    sid_to_outcome = sid_to_outcome.replace(
        {"PUBCHEM_ACTIVITY_OUTCOME": {"Inactive": 0, "Active": 1, "Inconclusive": 2}}
    )

    # Load the SID to SMILES data.
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    smiles = pd.read_csv(smiles_path, names=["PUBCHEM_SID", "SMILES"], sep=r"\s+")

    # Create the SMILES to Activity data
    data = pd.merge(sid_to_outcome, smiles, how="inner", on="PUBCHEM_SID")
    data = data.dropna()  # One SID has no SMILES string, drop it
    # Keep only the two output columns (this also drops PUBCHEM_SID) in the final order.
    data = data[["SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]]

    # The index is written on purpose: later cells read it back as 'Unnamed: 0'.
    data.to_csv(output_path)

In [12]:
make_data()

## Feature Creation - Will have to add Fingerprint info/change this section later

In this section I will be using rdkit to create the features used in SwissADME's models   

This will be moved to the data dictionary later

Source: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep42717/MediaObjects/41598_2017_BFsrep42717_MOESM91_ESM.pdf  


|#|Variable|Description|
|---|---|---|
|1|nF |number of fluorine atoms|
|2|sbonds |number of single bonds|
|3|dbonds|number of double bonds|
|4|tbonds|number of triple bonds|
|5|abonds|number of aromatic bonds|
|6|GetMolWt|molecular weight|
|7|NumAtoms|number of atoms|
|8|NumHvyAtoms|number of heavy atoms|
|9|AP|aromatic portion|
|10|NumAcceptor|number of H-bond acceptors|
|11|NumDonor|number of H-bond donors|
|12|Num Carbon|number of carbon atoms|
|13|NumHetero|number of heteroatoms|
|14|NumAromatic|number of aromatic atoms|
|15|NumRotors|number of rotatable bonds|
|16|NumRing|number of rings|
|17|TPSA|topological polar surface area|
|18|MR|molecular refractivity|
|19|MlogpCX|weighted sum of carbon and halogen atoms|
|20|MlogpNO|total number of nitrogen and oxygen atoms|
|21|MLogpUB|number of unsaturated bonds|
|22|MLogpRNG|presence of ring structures|
|23|MLogpNO2|number of nitro groups|
|24|MLogpNCS|presence of thiocyanate or isothiocyanate|
|25|MLogpQN|presence of quaternary nitrogen or N-oxide|
|26|MLogpALK|presence of alkane, alkene, cycloalkane or cycloalkene|
|27|MLogpHB|presence of intramolecular H-bond|
|28|MLogpPOL|number of aromatic substituents|
|29|MLogpBLM|presence of beta-lactam|
|30|MLogpAMP|presence of amphoteric property|
|31|MLogpPRX|proximity effect of nitrogen and oxygen atoms|
|32|mlogp|MLOGP|
|33|mhlogp|NC+NHET log <em>P</em>|
|34|alogp|WLOGP|
|35|logP|OpenBabel log <em>P</em>|
|36|NumSpiro|number of spiro groups|
|37|NumBridge|number of ringbridging atoms|
|38|NumStero|number of stereocenters|
|39|NumMacrocycle|number of macrocycles|
|40|sizePenalty|size penalty|
|41|macrocyclePenalty|macrocycle penalty|
|42|stereoComplexity|stereo complexity|
|43|ringComplexity|ring complexity|
|44|complexityPenalty|complexity penalty|
|45|ilogp|iLOGP|
|46|xlogp3|XLOGP3|
|47|silicos_logP|Filter-IT log <em>P</em>|
|48|logSwE|ESOL log <em>S</em>|
|49|logSwA|Ali log <em>S</em>|
|50|silicos_logS|Filter-IT log <em>S</em>|


In [13]:
# Read in SIDtoSMILES data to prep for use with rdkit.
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
# NOTE(review): no header=/names= argument is passed, so the first SID/SMILES line
# of the file is used as the column labels instead of being kept as a data row.
remove_SID = pd.read_csv('data/SIDtoSMILES.txt', sep=r'\s+')
remove_SID.head()

Unnamed: 0,842238,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([C@H]2C(=O)O)C=C3
0,842250,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br
1,842319,C1=CC=NC(=C1)C=NNC(=O)C2=CC=C(C=C2)N3C=CC=C3
2,842408,CC1=C(C=C(C=C1)NC(=O)CCCC2=CC=CC=C2)[N+](=O)[O-]
3,842584,CCC[C@@H](C)C(=O)NC1=CC=CC=C1F
4,842618,C1OC2=C(O1)C=C(C=C2)C=CC(=O)NC3=NC=CC=N3


In [14]:
# Remove SID from SMILES data so SMILES data is useable in rdkit.
# Keep only the last column (the SMILES strings) by position. This avoids
# hard-coding a label that is really a data value ('842238'), and re-running
# the cell is a no-op rather than a KeyError.
remove_SID = remove_SID.iloc[:, [-1]]
remove_SID.head()

Unnamed: 0,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([C@H]2C(=O)O)C=C3
0,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br
1,C1=CC=NC(=C1)C=NNC(=O)C2=CC=C(C=C2)N3C=CC=C3
2,CC1=C(C=C(C=C1)NC(=O)CCCC2=CC=CC=C2)[N+](=O)[O-]
3,CCC[C@@H](C)C(=O)NC1=CC=CC=C1F
4,C1OC2=C(O1)C=C(C=C2)C=CC(=O)NC3=NC=CC=N3


In [15]:
# Save the SMILES column as CSV: column label as the header line, no index.
# The file is overwritten on every run. Opening it with mode="a" appended the
# whole column again each time the cell was re-executed, inflating the file.
remove_SID.to_csv('data/smiles_only.csv', header=True, index=False)

In [16]:
# Read in SMILES-only data
smiles_list = pd.read_csv('data/smiles_only.csv')
# Parse every SMILES string into an RDKit molecule. Iterating the single column
# directly avoids iterrows() and the deprecated positional lookup row[0] on a
# label-indexed Series.
mol_list = [Chem.MolFromSmiles(smiles) for smiles in smiles_list.iloc[:, 0]]
type(mol_list)

list

In [17]:
type(smiles_list)

pandas.core.frame.DataFrame

In [18]:
smiles_list.shape

(188569, 1)

In [19]:
smiles_list.head()

Unnamed: 0,CC1=CC(=NO1)C(=O)NN=CC2=CC=CC=C2Br
0,C1=CC=NC(=C1)C=NNC(=O)C2=CC=C(C=C2)N3C=CC=C3
1,CC1=C(C=C(C=C1)NC(=O)CCCC2=CC=CC=C2)[N+](=O)[O-]
2,CCC[C@@H](C)C(=O)NC1=CC=CC=C1F
3,C1OC2=C(O1)C=C(C=C2)C=CC(=O)NC3=NC=CC=N3
4,CC[C@H](C)OC(=O)C1=C(SC2=C1CCCC2)N


In [20]:
len(mol_list)

188569

In [21]:
mol_list[2].GetNumAtoms()

15

In [22]:
# Compute a radius-2 Morgan fingerprint for every parsed molecule and collect
# the results in a one-column DataFrame.
# NOTE(review): the column holds RDKit fingerprint objects; the CSV round-trip
# further down keeps only their repr text, not the fingerprint contents.
morgan_fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mol_list]
fp1 = pd.DataFrame(morgan_fps)

In [23]:
print(fp1[0])

0         <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1         <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2         <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3         <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4         <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
                                ...                        
188564    <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
188565    <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
188566    <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
188567    <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
188568    <rdkit.DataStructs.cDataStructs.UIntSparseIntV...
Name: 0, Length: 188569, dtype: object


In [24]:
fp1.columns = ['FINGERPRINT']
fp1.head()

Unnamed: 0,FINGERPRINT
0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [25]:
fp1.to_csv('data/fingerprints')

In [26]:
fingerprints = pd.read_csv('data/fingerprints')
fingerprints.head()

Unnamed: 0.1,Unnamed: 0,FINGERPRINT
0,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1,1,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2,2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3,3,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4,4,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [27]:
# Reload the SMILES/activity table and overwrite the saved index column
# ('Unnamed: 0') with the current 0..n-1 row numbers.
data = (
    pd.read_csv('data/SMILES_to_Activity.csv')
    .assign(**{'Unnamed: 0': lambda frame: frame.index})
)
data.head()

Unnamed: 0.1,Unnamed: 0,SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,0,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0
1,1,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2
2,2,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0
3,3,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0
4,4,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2


In [28]:
# Join each activity row to a fingerprint row on the shared row-number column.
# The key is named explicitly so the join cannot silently pick up any other
# column the two frames might come to share.
# NOTE(review): this pairs rows purely by position. The activity table repeats
# each SMILES once per panel while the fingerprint table has one row per SMILES
# line, so row i of one does not describe the molecule in row i of the other.
# Joining on an identifier (e.g. PUBCHEM_SID) is needed - TODO confirm and fix.
fp_data = data.merge(fingerprints, how='inner', on='Unnamed: 0')

In [29]:
fp_data.head()

Unnamed: 0.1,Unnamed: 0,SMILES,PUBCHEM_ACTIVITY_OUTCOME,FINGERPRINT
0,0,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1,1,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2,2,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3,3,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4,4,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [30]:
fp_data['PUBCHEM_ACTIVITY_OUTCOME'].unique()

array([0, 2, 1])

In [31]:
fp_data.to_csv('data/fingerprint_merged')

In [32]:
# Check to make sure it saved correctly
data = pd.read_csv("data/fingerprint_merged")

In [33]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SMILES,PUBCHEM_ACTIVITY_OUTCOME,FINGERPRINT
0,0,0,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1,1,1,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2,2,2,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3,3,3,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,0,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4,4,4,CC1=C(SC=C1)C(=O)NNC(=O)[C@@H]2[C@@H]3C[C@H]([...,2,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [34]:
data.shape

(85715, 5)

In [35]:
# Setup train/test/val datasets, train_test_split and save to csv

def split_data(random_state=42):
    """Split the merged data 80/10/10 into train/validation/test CSV files.

    Reads ``data/fingerprint_merged`` and performs two stratified splits on
    ``PUBCHEM_ACTIVITY_OUTCOME``: first 80/20 into train and hold-out, then the
    hold-out 50/50 into validation and test. Only the ``SMILES`` and
    ``PUBCHEM_ACTIVITY_OUTCOME`` columns are written.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed passed to both ``train_test_split`` calls so the three files are
        reproducible across runs. Pass ``None`` for an unseeded split.

    Returns
    -------
    None
        Writes ``data/train_SMILES_to_Activity.csv``,
        ``data/val_SMILES_to_Activity.csv`` and
        ``data/test_SMILES_to_Activity.csv``.
    """
    # Load the prepared data
    data = pd.read_csv("data/fingerprint_merged")
    smiles = data["SMILES"]
    outcome = data["PUBCHEM_ACTIVITY_OUTCOME"]

    # NOTE(review): the same SMILES appears on several rows (one per panel), so
    # a row-level split can place one molecule in more than one subset -
    # consider a group-aware split.
    X_train, X_testval, y_train, y_testval = train_test_split(
        smiles, outcome, stratify=outcome, test_size=0.2, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_testval, y_testval, stratify=y_testval, test_size=0.5, random_state=random_state
    )

    # Reassemble each subset as a two-column frame and write it without the index.
    subsets = {
        "data/train_SMILES_to_Activity.csv": (X_train, y_train),
        "data/val_SMILES_to_Activity.csv": (X_val, y_val),
        "data/test_SMILES_to_Activity.csv": (X_test, y_test),
    }
    for path, (features, labels) in subsets.items():
        frame = pd.DataFrame({"SMILES": features, "PUBCHEM_ACTIVITY_OUTCOME": labels})
        frame.to_csv(path, index=False)


# In a notebook kernel __name__ is "__main__", so this guard passes and both
# steps run as soon as this cell executes (split_data() is also called again in
# the next cell).
if __name__ == "__main__":
    make_data()
    split_data()

In [36]:
# Run split_data to write the train/val/test CSV files
split_data()

## Split into individual CYP450 enzymes

#### To be implemented after testing modeling for MVP

In [37]:
# cyp2c19 =  cyto_assay[cyto_assay['Panel Name'] == 'p450-cyp2c19']
# print(cyp2c19.shape)
# # cyp2c19.to_csv()

In [38]:
# cyp2c9 =  cyto_assay[cyto_assay['Panel Name'] == 'p450-cyp2c9']
# print(cyp2c9.shape)

In [39]:
# cyp1a2 =  cyto_assay[cyto_assay['Panel Name'] == 'p450-cyp1a2']
# print(cyp1a2.shape)

In [40]:
# cyp2d6 =  cyto_assay[cyto_assay['Panel Name'] == 'p450-cyp2d6']
# print(cyp2d6.shape)

In [41]:
# cyp3a4 =  cyto_assay[cyto_assay['Panel Name'] == 'p450-cyp3a4']
# print(cyp3a4.shape)