Processing of TMPRSS2 data. Computes RDKit Morgan fingerprints, converts the acvalue units from nanomolar to micromolar, creates a binary Active/Inactive column, and matches the format of the processed PubChem datasets produced by Datasets_1_Prepare_PubChem_Datasets.ipynb.

In [None]:
import numpy as np
import pandas as pd

from rdkit.Chem import rdMolDescriptors, MolFromSmiles
from tqdm import tqdm
from functools import partial

In [None]:
# Read the raw TMPRSS2 table from disk and preview its first rows.
raw_csv_path = '../data/TMPRSS2.csv'
TMPRSS2 = pd.read_csv(raw_csv_path)
TMPRSS2.head(n=5)

### Calculate Morgan Fingerprints

In [None]:
def row_to_fingerprint(row):
    """Compute the Morgan fingerprint of one compound as a 0/1 array.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record with a ``'SMILES'`` entry holding the compound's SMILES string.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array with one 0/1 entry per fingerprint bit. The
        fingerprint uses radius 2 with chirality enabled; its length is
        whatever RDKit uses by default, since ``nBits`` is not overridden.

    Raises
    ------
    ValueError
        If the SMILES entry is not a string or RDKit cannot parse it.
    """
    smiles = row['SMILES']
    if not isinstance(smiles, str):
        # Missing values (e.g. NaN read from the CSV) would otherwise fail
        # deep inside RDKit with an unhelpful message.
        raise ValueError(f'SMILES must be a string, got {smiles!r}')

    # MolFromSmiles signals a parse failure by returning None instead of
    # raising; check here so the error names the offending SMILES.
    mol = MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, useChirality=True)

    # The bit string is ASCII '0'/'1'; reinterpret its bytes and subtract
    # ord('0') to obtain numeric 0/1 without a per-character Python loop.
    bit_bytes = fp.ToBitString().encode('ascii')
    return np.frombuffer(bit_bytes, dtype=np.uint8) - np.uint8(ord('0'))

In [None]:
# Fingerprint every compound row by row and store the arrays in a new column.
TMPRSS2 = TMPRSS2.assign(
    morgan_fingerprint=TMPRSS2.apply(row_to_fingerprint, axis=1),
)

In [None]:
# Preview the frame now that the fingerprint column has been added.
TMPRSS2.head(n=5)

### Match Standard Formatting

In [None]:
# Load an already-processed dataset to serve as the formatting template.
example_pickle_path = '../processed_data/ST14_processed.pkl'
processed_example = pd.read_pickle(example_pickle_path)
processed_example.head(n=5)

In [None]:
# Rename the activity/compound-id columns to the standard names, then drop
# the ID, CHEMBL, SID and SMILES columns.
# NOTE: this cell is not re-runnable as-is; on a second run the drop raises
# KeyError because those columns are already gone.
column_renames = {'Activity': 'acvalue', 'CID': 'cid'}
unused_columns = ['ID', 'CHEMBL', 'SID', 'SMILES']
TMPRSS2 = TMPRSS2.rename(columns=column_renames).drop(columns=unused_columns)
TMPRSS2.head(n=5)

In [None]:
# Unit conversion: the TMPRSS2 acvalue is measured in nanomolar, while the
# pubchem data uses micromolar, so divide by 1000.
# NOTE: run this cell exactly once per load; it overwrites acvalue in place,
# so every re-run divides by 1000 again.
nm_per_um = 1000
TMPRSS2['acvalue'] = TMPRSS2['acvalue'].div(nm_per_um)

We need to add a column specifying a binary Active/Inactive label. The labelling rule below is taken directly from Datasets_1_Prepare_PubChem_Datasets.ipynb.

In [None]:
# Rows whose acvalue is strictly below the threshold are labelled 'Active';
# every other row (including any where the comparison is False because the
# value is missing) is labelled 'Inactive'.
# NOTE(review): the threshold is presumably in micromolar, i.e. meant to be
# applied after the unit conversion; confirm against the PubChem notebook.
activity_threshold = 50
activity_labels = {True: 'Active', False: 'Inactive'}
TMPRSS2['activity'] = TMPRSS2['acvalue'].lt(activity_threshold).map(activity_labels)

In [None]:
# Select the template's columns, in the template's order, then preview the
# result for comparison.
template_columns = list(processed_example.columns)
TMPRSS2 = TMPRSS2[template_columns]
print('TMPRSS2')
TMPRSS2.head(n=5)

In [None]:
# Show the template frame so its layout can be compared with TMPRSS2.
print('processed pubchem example')
processed_example.head(n=5)

### Export

In [None]:
# Export the standardized frame as a pickle in the processed-data directory.
# (Plain string literal: the path has no placeholders, so no f-prefix is needed.)
TMPRSS2.to_pickle('../processed_data/TMPRSS2_processed.pkl')

In [None]:
# Display the exported frame for a final visual check.
TMPRSS2