Processing of TMPRSS2 data. Computes RDKit Morgan fingerprints, converts the acvalue units from nanomolar to micromolar, creates a binary Active/Inactive column, and matches the format of the processed PubChem datasets output by Datasets_1_Prepare_PubChem_Datasets.ipynb.

In [1]:
import numpy as np
import pandas as pd

from rdkit.Chem import rdMolDescriptors, MolFromSmiles
from tqdm import tqdm
from functools import partial

In [2]:
# Read the raw TMPRSS2 table from CSV and preview its first rows.
tmprss2_csv = '../data/TMPRSS2.csv'
TMPRSS2 = pd.read_csv(tmprss2_csv)
TMPRSS2.head()

Unnamed: 0,ID,SMILES,Activity,CHEMBL,CID,SID
0,24,C1C[C@H](N(C1)C(=O)[C@@H](CCCN=C(N)N)NS(=O)(=O...,19.0,1229259,46899577,134460981
1,25,CC(C)(C)OC(=O)C[C@H](C(=O)N1CCC[C@H]1C(=O)NCC2...,19.0,1809215,56677005,134456506
2,26,C1C[C@H](N(C1)C(=O)[C@@H](CCCCN)NS(=O)(=O)CC2=...,20.0,1809216,49835415,134441143
3,27,C1C[C@H](N(C1)C(=O)[C@@H](CCCCN=C(N)N)NS(=O)(=...,21.0,1809213,49835557,134441142
4,28,N=C(C1=CC=C(C=C1)CNC([C@@H]2CCCN2C([C@H](CCC3=...,44.0,0,0,0


### Calculate Morgan Fingerprints

In [3]:
def row_to_fingerprint(row):
    """Compute the Morgan fingerprint of one DataFrame row as a 0/1 array.

    Parameters
    ----------
    row : mapping
        Anything indexable with ``row['SMILES']`` that yields a SMILES string
        (a DataFrame row when used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    numpy.ndarray
        One-dimensional ``uint8`` array holding one 0/1 entry per fingerprint
        bit. The length is RDKit's default bit-vector size, since ``nBits``
        is not overridden here.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    smiles = row['SMILES']
    mol = MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending SMILES instead of an opaque error further down.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')
    # radius=2 with chirality-aware atom invariants, exported as a bit vector.
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, useChirality=True)
    bit_string = fp.ToBitString()
    # Each character of the bit string is '0' or '1'; convert one per element.
    return np.fromiter(map(int, bit_string), dtype=np.uint8, count=len(bit_string))

In [4]:
# Build one fingerprint array per row (axis=1 hands each row to the helper),
# then store the result as a new column.
fingerprints = TMPRSS2.apply(row_to_fingerprint, axis=1)
TMPRSS2['morgan_fp'] = fingerprints

In [5]:
# Preview the first rows, now including the fingerprint column.
TMPRSS2.head(n=5)

Unnamed: 0,ID,SMILES,Activity,CHEMBL,CID,SID,morgan_fp
0,24,C1C[C@H](N(C1)C(=O)[C@@H](CCCN=C(N)N)NS(=O)(=O...,19.0,1229259,46899577,134460981,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,25,CC(C)(C)OC(=O)C[C@H](C(=O)N1CCC[C@H]1C(=O)NCC2...,19.0,1809215,56677005,134456506,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,26,C1C[C@H](N(C1)C(=O)[C@@H](CCCCN)NS(=O)(=O)CC2=...,20.0,1809216,49835415,134441143,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,27,C1C[C@H](N(C1)C(=O)[C@@H](CCCCN=C(N)N)NS(=O)(=...,21.0,1809213,49835557,134441142,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,28,N=C(C1=CC=C(C=C1)CNC([C@@H]2CCCN2C([C@H](CCC3=...,44.0,0,0,0,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Match Standard Formatting

In [6]:
# Reference frame: an already-processed dataset whose column layout is the
# standard format this notebook has to match.
example_path = '../processed_data/ST14_processed.pkl'
processed_example = pd.read_pickle(example_path)
processed_example.head()

Unnamed: 0,cid,acvalue,activity,morgan_fp
0,11974,0.535,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,432298,0.204,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,411406,0.191,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,135435498,4.5,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4735,1.16,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [7]:
# Align with the standard layout: rename the value/identifier columns, then
# discard the columns that are not part of that layout.
column_renames = {'Activity': 'acvalue', 'CID': 'cid'}
unused_columns = ['ID', 'CHEMBL', 'SID', 'SMILES']
TMPRSS2 = TMPRSS2.rename(columns=column_renames).drop(columns=unused_columns)
TMPRSS2.head()

Unnamed: 0,acvalue,cid,morgan_fp
0,19.0,46899577,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,19.0,56677005,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,20.0,49835415,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,21.0,49835557,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,44.0,0,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# The TMPRSS2 acvalues are measured in nanomolar while the PubChem datasets
# use micromolar, so divide by 1000 to convert nM -> uM.
nm_per_um = 1000
TMPRSS2['acvalue'] = TMPRSS2['acvalue'].div(nm_per_um)

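We need to add a column specifying the binary Active/Inactive label: a compound is Active when its acvalue (now in micromolar) is below the activity threshold, and Inactive otherwise.  The following method is pulled directly from Datasets_1_Prepare_PubChem_Datasets.ipynb.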

In [9]:
# Label each compound: acvalue strictly below the threshold -> 'Active',
# everything else -> 'Inactive'.
activity_threshold = 50
activity_labels = {True: 'Active', False: 'Inactive'}
is_active = TMPRSS2['acvalue'].lt(activity_threshold)
TMPRSS2['activity'] = is_active.map(activity_labels)

In [10]:
# Put the columns in the same order as the reference frame, then preview the
# result for comparison.
reference_columns = processed_example.columns
TMPRSS2 = TMPRSS2.loc[:, reference_columns]
print('TMPRSS2')
TMPRSS2.head()

TMPRSS2


Unnamed: 0,cid,acvalue,activity,morgan_fp
0,46899577,0.019,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,56677005,0.019,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,49835415,0.02,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,49835557,0.021,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,0.044,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Show the reference frame again so its layout can be compared with TMPRSS2.
print('processed pubchem example')
processed_example.head(n=5)

processed pubchem example


Unnamed: 0,cid,acvalue,activity,morgan_fp
0,11974,0.535,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,432298,0.204,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,411406,0.191,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,135435498,4.5,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4735,1.16,Active,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


### Export

In [12]:
# Export the formatted frame as a pickle in the processed-data directory.
# The path has no placeholders, so it is a plain string literal rather than
# an f-string.
TMPRSS2.to_pickle('../processed_data/TMPRSS2_processed.pkl')

In [13]:
# Final check: display the frame that was just exported.
TMPRSS2

Unnamed: 0,cid,acvalue,activity,morgan_fp
0,46899577,0.019,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,56677005,0.019,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,49835415,0.020,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,49835557,0.021,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,0.044,Active,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
87,90666144,0.013,Active,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
88,90666145,0.011,Active,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
89,90666146,0.003,Active,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
90,90666147,0.005,Active,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
