Straightforward processing of the TMPRSS2 data: this notebook reformats it to match the processed PubChem datasets output by Datasets_1_Prepare_PubChem_Datasets.ipynb, then computes and saves Morgan fingerprints for the compounds.

In [None]:
import pandas as pd

In [None]:
# Load the raw TMPRSS2 table from disk and preview its first rows.
tmprss2_raw_path = '../data/TMPRSS2.csv'
TMPRSS2 = pd.read_csv(tmprss2_raw_path)
TMPRSS2.head()

In [None]:
# Reference frame: an already-processed dataset whose column layout the
# TMPRSS2 table is made to match further down.
processed_example_path = '../dumps/ST14_processed.csv'
processed_example = pd.read_csv(processed_example_path)
processed_example.head()

In [None]:
# Bring the column names in line with the reference format and discard the
# identifier columns that are not carried over.
# NOTE: this cell rebinds TMPRSS2, so running it a second time raises a
# KeyError from the drop (the columns are already gone) -- re-run from the load cell.
TMPRSS2 = (
    TMPRSS2
    .rename(columns={'SMILES': 'smiles', 'Activity': 'acvalue', 'CID': 'cid'})
    .drop(columns=['ID', 'CHEMBL', 'SID'])
)
TMPRSS2.head()

In [None]:
# Unit conversion: the TMPRSS2 acvalue is reported in nanomolar, whereas the
# PubChem-derived datasets use micromolar, so scale down by a factor of 1000.
# NOTE: this overwrites the column in place; re-running the cell would divide
# again, so re-run from the load cell instead.
nanomolar_per_micromolar = 1000
TMPRSS2['acvalue'] = TMPRSS2['acvalue'].div(nanomolar_per_micromolar)

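We need to add a column giving a binary Active/Inactive label: a compound is labelled Active when its acvalue is below the activity threshold, and Inactive otherwise. The following method is pulled directly from Datasets_1_Prepare_PubChem_Datasets.ipynb.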

In [None]:
# Label each compound: strictly below the threshold -> 'Active', else 'Inactive'.
activity_threshold = 50
activity_labels = {True: 'Active', False: 'Inactive'}
is_below_threshold = TMPRSS2['acvalue'].lt(activity_threshold)
TMPRSS2['activity'] = is_below_threshold.map(activity_labels)

In [None]:
# Put the columns in the same order as the reference frame (this also keeps
# only the columns the reference has), then preview for a visual comparison.
reference_column_order = list(processed_example.columns)
TMPRSS2 = TMPRSS2[reference_column_order]
print('TMPRSS2')
TMPRSS2.head()

In [None]:
# Show the reference frame again so it can be compared side by side with the
# reordered TMPRSS2 preview from the previous cell.
print('processed pubchem example')
processed_example.head()

In [None]:
# Write the reformatted table to disk, leaving out the DataFrame index.
processed_csv_path = '../dumps/TMPRSS2_processed.csv'
TMPRSS2.to_csv(processed_csv_path, index=False)

In [None]:
# Display the final processed frame as the cell output.
TMPRSS2

In [None]:
# Generic aliases used by the fingerprinting cells below: `df` is the frame to
# fingerprint and `ds_name` is the prefix of the output file name.
df, ds_name = TMPRSS2, 'TMPRSS2'

In [None]:
from rdkit.Chem import rdMolDescriptors, MolFromSmiles
from tqdm import tqdm
from functools import partial
import numpy as np

In [None]:
# Parse every SMILES string into an RDKit molecule, keyed by compound id.
# Rows whose cid is 0 receive unique negative placeholder ids (-1, -2, ...)
# so they do not overwrite each other in the dictionary.
# NOTE(review): the placeholders live only in this dictionary -- `df` itself is
# not modified, so those rows keep cid 0 there; confirm consumers can match them.
cid_to_rdkit = {}
dummy_cid = -1
unparsable = []  # (row index, smiles) pairs RDKit could not parse
for i, row in tqdm(df.iterrows(), total=len(df)):
    mol = MolFromSmiles(row.smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; storing it
        # would only surface later as an opaque error in the fingerprint call.
        unparsable.append((i, row.smiles))
        continue
    if row.cid == 0:
        cid = dummy_cid
        dummy_cid -= 1
    else:
        cid = row.cid
    cid_to_rdkit[cid] = mol

# Fail loudly here, with the offending rows named, rather than silently
# dropping compounds or crashing further down.
if unparsable:
    raise ValueError(
        f'RDKit could not parse {len(unparsable)} SMILES string(s); '
        f'first offenders (row index, smiles): {unparsable[:5]}'
    )

In [None]:
# Fingerprint configuration: a Morgan bit-vector fingerprint with radius 2 and
# chirality enabled. The (misspelled) name `fingperint_function` is kept as-is
# because a later cell refers to it by that name.
morgan_kwargs = dict(radius=2, useChirality=True)
fingperint_function = partial(
    rdMolDescriptors.GetMorganFingerprintAsBitVect,
    **morgan_kwargs,
)
fp_name = 'morgan'

In [None]:
def fingerprint_to_np(fp):
    """Convert a bit-vector fingerprint into a 1-D ``np.uint8`` array of 0/1.

    Parameters
    ----------
    fp : object exposing ``ToBitString()``
        The fingerprint; ``ToBitString()`` must return a string made up of the
        characters ``'0'`` and ``'1'`` only.

    Returns
    -------
    numpy.ndarray
        A new, writable array of dtype ``uint8`` with one element per character
        of the bit string, in the same order.
    """
    bit_string = fp.ToBitString()
    # View the ASCII bytes directly and shift '0'/'1' (codes 48/49) down to
    # 0/1. This avoids a Python-level int() call per bit; the subtraction
    # allocates a fresh array, so the result does not alias the bytes buffer.
    ascii_codes = np.frombuffer(bit_string.encode('ascii'), dtype=np.uint8)
    return ascii_codes - np.uint8(ord('0'))

In [None]:
# Compute one fingerprint array per molecule. Keys are converted to strings
# because the dictionary is later unpacked as keyword arguments when saving.
np_fingerprints = {}
for cid, mol in cid_to_rdkit.items():
    fingerprint = fingperint_function(mol)
    np_fingerprints[str(cid)] = fingerprint_to_np(fingerprint)

In [None]:
# Save all fingerprint arrays into a single .npz archive, one entry per
# compound id; the file name combines the dataset and fingerprint names.
fingerprints_path = f'../dumps/{ds_name}_{fp_name}_fingerprints.npz'
np.savez(fingerprints_path, **np_fingerprints)