In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle
from tqdm import tqdm
tqdm.pandas()
import statistics
from pubchempy import Compound
from rdkit import Chem, DataStructs
from rdkit.Chem import SaltRemover, QED, rdMolDescriptors
from molvs import Standardizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Datasets Task 1: Prepare PubChem Datasets

Author: Kaan Donbekci (donbekci@stanford.edu)

## Contents
* [1.1 Assemble Dataset](#1.1-Assemble-Dataset)
* [1.2 Sanitize Molecules](#1.2-Sanitize-Molecules)
* [1.3 Remove non-druglike molecules](#1.3-Remove-non-druglike-molecules)
* [1.4 Resolve errors](#1.4-Resolve-errors)
    * [1.4.1 Calculate pairwise similarities using fingerprints](#1.4.1-Calculate-pairwise-similarities-using-fingerprints)
    * [1.4.2 Find and remove duplicates](#1.4.2-Find-and-remove-duplicates) # TODO
    * [1.4.3 Find and remove activity cliffs](#1.4.3-Find-and-remove-activity-cliffs) # TODO
* [Exports & Imports](#Exports-&-Imports)

In [2]:
# Master switch for the diagnostic figures: every `if plots:` cell is skipped while this is False.
plots = False

In [3]:
# Dataset names this notebook accepts; `ds_name` is checked against this list.
AVAILABLE_DATASETS = ['ST14', 'KLKB1', 'TMPRSS11D', 'TMPRSS6']

In [4]:
# Dataset processed in this run; it is interpolated into every input and output file name.
ds_name = 'TMPRSS6'

In [5]:
# Fail fast on a misspelled or unsupported dataset name.
assert ds_name in AVAILABLE_DATASETS

In [6]:
def load_pickle(filename):
    """Unpickle and return the object stored at ``../dumps/<filename>.pkl``.

    `filename` is given without directory or extension. Errors from `open`
    (e.g. FileNotFoundError) and from unpickling propagate to the caller.
    """
    path = f'../dumps/{filename}.pkl'
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [7]:
# Reuse the PubChem compounds cached by a previous run, if there are any.
# On failure `cid_to_pubchem` is deliberately left undefined: the download
# cell further down handles the missing name and queries PubChem instead.
try:
    cid_to_pubchem = load_pickle(f'{ds_name}_cid_to_pubchem')
except Exception as exc:  # unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate
    print(f'could not load cached compounds ({type(exc).__name__}: {exc})')
    print('will send requests to pubchempy, might take a while')

## 1.1 Assemble Dataset

The first step is to read the dataset as exported from 
[PubChem](https://pubchem.ncbi.nlm.nih.gov/gene/6768#section=Tested-Compounds&fullscreen=true) (the *Tested Compounds* table of the target's gene page).


In [8]:
# Load the raw CSV export for the selected dataset.
df = pd.read_csv(f'../data/{ds_name}.csv')

Let's drop the rows where the Activity value is not measured in Ki. 

In [9]:
# Tally the rows per activity type (`acname`) before filtering.
Counter(df.acname)

Counter({'Ki': 111, nan: 29, 'IC50': 34})

In [10]:
# Keep the IC50 rows aside; their assay ids are written to disk in the next cell.
df_IC50 = df.loc[df.acname == 'IC50']

In [11]:
# Write the distinct assay ids of the IC50 rows to a text file, one id per line.
IC50_assays = list(map(str, df_IC50.aid.unique()))
with open(f'../dumps/{ds_name}_IC50_assays.txt', 'w') as handle:
    handle.write('\n'.join(IC50_assays))

In [12]:
# Keep only the rows measured as Ki and renumber them from 0.
is_ki = df.acname == 'Ki'
df = df.loc[is_ki].reset_index(drop=True)

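Replace the categorical activity labels (including `Unspecified`) with labels derived from the activity threshold: values below the threshold become `Active`, values at or above it become `Inactive`.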

In [13]:
# Tally the activity labels among the remaining Ki rows.
Counter(df.activity)

Counter({'Active': 85, 'Unspecified': 26})

In [14]:
# Cut-off applied to `acvalue`: rows below it are labelled Active, rows at or above it Inactive.
# NOTE(review): units are those of the CSV's `acvalue` column — presumably µM; confirm.
activity_threshold = 50

In [15]:
# No row under the threshold may already be marked Inactive; once that holds, label them all Active.
below_threshold = df.acvalue < activity_threshold
assert not (below_threshold & (df.activity == 'Inactive')).any()
df.loc[below_threshold, 'activity'] = 'Active'

In [16]:
# No row at or above the threshold may still be marked Active; once that holds, label them all Inactive.
at_or_above_threshold = df.acvalue >= activity_threshold
assert not (at_or_above_threshold & (df.activity == 'Active')).any()
df.loc[at_or_above_threshold, 'activity'] = 'Inactive'

Remove rows that have neither an activity value (NaN) nor a specified activity label.

In [17]:
# Discard rows that carry no information: no activity value and an 'Unspecified' label.
# (Boolean filtering matches the label-based drop because the index was reset to unique values above.)
no_value_no_label = df.acvalue.isna() & (df.activity == 'Unspecified')
df = df[~no_value_no_label]

Some compounds have multiple rows in the dataset. Reduce each of them to a single row that carries the median activity value; compounds whose rows disagree on the activity label are set aside.

In [18]:
# Group the rows by compound id: cid -> list of that compound's rows, in frame order.
cid_to_rows = {}
for _, measurement in df.iterrows():
    cid_to_rows.setdefault(measurement.cid, []).append(measurement)

In [19]:
# Sanity check: list the activity labels still present.
df.activity.unique()

array(['Active', 'Inactive'], dtype=object)

In [20]:
# Labels a multi-row compound must carry to be kept when its rows are merged.
valid_activities = {'Active', 'Inactive'}

In [21]:
# Collapse every compound to one record. A compound with a single measurement
# is kept as-is. One with several is kept only when all of its measurements
# agree on a valid activity label; its value is the NaN-ignoring median.
cleaned_rows = []
problem_rows = []  # groups whose measurements carry conflicting labels
for cid, rows in tqdm(cid_to_rows.items()):
    if len(rows) == 1:
        cleaned_rows.append(rows[0][['cid', 'acvalue', 'activity']])
        continue

    labels = {measurement.activity for measurement in rows}
    if len(labels) != 1:
        problem_rows.append(rows)
        continue

    (label,) = labels
    if label not in valid_activities:
        continue

    median_value = np.nanmedian([measurement.acvalue for measurement in rows])
    cleaned_rows.append(pd.Series({'cid': cid, 'acvalue': median_value, 'activity': label}))

100%|██████████| 106/106 [00:00<00:00, 1877.41it/s]


In [22]:
# Rebuild the frame from the collapsed records (one row per compound) with a fresh 0..n-1 index.
df = pd.DataFrame(cleaned_rows).reset_index(drop=True)

In [23]:
# Number of compounds left after collapsing repeated measurements.
len(df)

106

Query pubchempy to get SMILES codes and keep a dictionary of the compounds.

In [24]:
# Start from the cache loaded at the top of the notebook when it exists;
# the name is unbound if that load failed.
try:
    cid_to_pubchem
except NameError:
    cid_to_pubchem = {}

# Download only the compounds the cache lacks, so a partial or stale cache is
# completed instead of causing a KeyError when the SMILES are looked up later.
for i, row in tqdm(df.iterrows(), total=len(df)):
    if row.cid in cid_to_pubchem:
        continue
    cid_to_pubchem[row.cid] = Compound.from_cid(row.cid)

In [25]:
# Pre-create the column that `set_smiles` fills in row by row.
df['smiles'] = None

In [26]:
def set_smiles(row):
    """Fill `row.smiles` with the isomeric SMILES of the row's compound.

    The compound is looked up by `row.cid` in the module-level
    `cid_to_pubchem` mapping. The row is modified and returned.
    """
    row.smiles = cid_to_pubchem[row.cid].isomeric_smiles
    return row

In [27]:
# Row-wise apply with a tqdm progress bar (`progress_apply` is registered by `tqdm.pandas()` in the import cell).
df = df.progress_apply(set_smiles, axis=1)

100%|██████████| 106/106 [00:00<00:00, 3807.45it/s]


In [28]:
# Preview the first rows, now including their SMILES.
df.head()

Unnamed: 0,cid,acvalue,activity,smiles
0,72429,3.25,Active,CC(C)C[C@@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](...
1,46899735,0.375,Active,C1CCC(CC1)C[C@H](C(=O)N2CCC[C@H]2C(=O)NCC3=CC=...
2,49864062,0.18,Active,C1C[C@H](N(C1)C(=O)[C@@H](CCCN=C(N)N)NS(=O)(=O...
3,46899577,0.17,Active,C1C[C@H](N(C1)C(=O)[C@@H](CCCN=C(N)N)NS(=O)(=O...
4,70689167,0.0033,Active,C[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)C1=NC2=CC...


## 1.2 Sanitize Molecules

Sanitization has two steps: first standardize the molecule, then remove its salts. We use MolVS (`Standardizer`) for standardization and RDKit (`SaltRemover`) for salt removal.

In [29]:
# EXPORT
# cid -> sanitized RDKit molecule; filled by the sanitization loop below.
cid_to_rdkit = {}

In [30]:
# `s` standardizes molecules (MolVS); `remover` strips salts (RDKit SaltRemover with its default salt definitions).
s = Standardizer()
remover = SaltRemover.SaltRemover()
# Show how many salt patterns the remover loaded.
print(f'len(remover.salts) = {len(remover.salts)}')

len(remover.salts) = 15


In [31]:
# Parse each SMILES, standardize the molecule, strip its salts and store the result under the compound's cid.
for i, row in tqdm(df.iterrows(), total=len(df)):
    mol = Chem.MolFromSmiles(row.smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; stop here with the
        # offending compound named instead of failing obscurely inside the standardizer.
        raise ValueError(f'RDKit could not parse the SMILES of cid {row.cid}: {row.smiles!r}')
    mol = s.standardize(mol)
    mol = remover.StripMol(mol)
    cid_to_rdkit[row.cid] = mol

100%|██████████| 106/106 [00:00<00:00, 220.75it/s]


## 1.3 Remove non-druglike molecules

In [32]:
# Human-readable property name -> attribute name read from the `QED.properties(mol)` result.
property_keys = {'molecular weight': 'MW', 'polar surface area': 'PSA', 'LogP': 'ALOGP', 
                 'rotateable bonds': 'ROTB', 'h-bond donors': 'HBD', 'h-bond acceptors': 'HBA'}

In [33]:
# EXPORT
# One {cid: value} dict per property name; populated in the following cell.
qed_properties = {key: {} for key in property_keys}

In [34]:
# Compute the QED descriptors of every molecule and file each selected one under its readable name.
for cid, mol in tqdm(cid_to_rdkit.items()):
    descriptors = QED.properties(mol)
    for label, attribute in property_keys.items():
        qed_properties[label][cid] = getattr(descriptors, attribute)
# Columns are the property names, rows are the cids.
qed_properties_df = pd.DataFrame(qed_properties)

100%|██████████| 106/106 [00:00<00:00, 417.70it/s]


In [35]:
# Name the index (it holds the cids) and preview the table.
qed_properties_df.index.name = 'cid'
qed_properties_df.head()

Unnamed: 0_level_0,molecular weight,polar surface area,LogP,rotateable bonds,h-bond donors,h-bond acceptors
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
72429,426.562,168.77,-0.1946,14,5,6
46899735,553.729,145.45,3.03647,11,4,5
49864062,556.693,209.85,0.11967,13,6,7
46899577,556.693,209.85,0.11967,13,6,7
70689167,646.779,315.17,-2.3467,19,9,11


In [36]:
if plots:
    # One figure per property: distribution on the left, box plot on the right.
    for key, prop in qed_properties.items():
        values = list(prop.values())
        fig, (dist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
        fig.suptitle(key)
        sns.boxplot(values, ax=box_ax)
        sns.distplot(values, ax=dist_ax)

In [37]:
# Per-property quartiles and interquartile range; `threshold` is the IQR multiplier used for the outlier fences.
threshold = 1.5
Q1, Q3 = qed_properties_df.quantile(.25), qed_properties_df.quantile(.75)
IQR = Q3 - Q1

In [38]:
# A compound is an outlier when any of its properties falls outside [Q1 - t*IQR, Q3 + t*IQR].
lower_fence = Q1 - threshold*IQR
upper_fence = Q3 + threshold*IQR
is_outlier = ((qed_properties_df < lower_fence) | (qed_properties_df > upper_fence)).any(axis=1)
qed_properties_outliers_removed_df = qed_properties_df[~is_outlier]
print(f'{len(qed_properties_df) - len(qed_properties_outliers_removed_df)} outliers removed.')
qed_properties_df = qed_properties_outliers_removed_df

15 outliers removed.


In [39]:
# Rebuild the per-property dicts from the filtered table and record which cids survived the outlier filter.
qed_properties = qed_properties_df.to_dict()
cids_to_keep = list(qed_properties_df.index)

In [40]:
# Restrict the molecule lookup and the main frame to the surviving cids
# (the dict is rebuilt in `cids_to_keep` order).
cid_to_rdkit = {cid: cid_to_rdkit[cid] for cid in cids_to_keep}
df = df.query('cid in @cids_to_keep').reset_index(drop=True)

In [41]:
if plots:
    # Same figures as before the filter, now for the retained compounds only.
    for key, prop in qed_properties.items():
        values = list(prop.values())
        fig, (dist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
        fig.suptitle(f'{key} (w/o outliers)')
        sns.boxplot(values, ax=box_ax)
        sns.distplot(values, ax=dist_ax)

## 1.4 Resolve errors

### 1.4.1 Calculate pairwise similarities using fingerprints

In [42]:
# Number of compounds still in play; also the side length of the similarity matrix below.
N = len(cids_to_keep)

In [43]:
# The cid list, the molecule lookup and the frame must describe the same number of compounds.
assert N == len(cid_to_rdkit) == len(df)

In [44]:
from functools import partial

In [45]:
# Fingerprint generator applied to every molecule: Morgan bit vector with radius 2, chirality-aware.
# (The misspelled name is kept because another cell refers to it.)
fingperint_function = partial(rdMolDescriptors.GetMorganFingerprintAsBitVect, 
                              radius=2, useChirality=True)
# NOTE(review): `fp_name` is not referenced anywhere else in the visible notebook.
fp_name = 'morgan'

In [46]:
# EXPORT
# cid -> fingerprint, in the same order as `cid_to_rdkit`.
cid_to_fingerprint = {cid: fingperint_function(mol) for cid, mol in cid_to_rdkit.items()}

# Full N x N matrix of pairwise fingerprint similarities; row/column k belongs to the k-th cid above.
fingerprint_similarity_matrix = np.empty((N, N))
all_fps = list(cid_to_fingerprint.values())
for i, fp_row in tqdm(enumerate(all_fps), total=len(all_fps)):
    for j, fp_col in enumerate(all_fps):
        fingerprint_similarity_matrix[i, j] = DataStructs.FingerprintSimilarity(fp_row, fp_col)

100%|██████████| 91/91 [00:00<00:00, 3300.20it/s]


In [47]:
if plots:
    # Heat map of the full pairwise similarity matrix; both axes are labelled with the cids in `cids_to_keep`.
    fig, ax = plt.subplots(figsize=(25,25))
    cax = ax.matshow(fingerprint_similarity_matrix, interpolation='nearest')
    ax.grid(False)
    plt.title('RDKIT fingerprint similarity matrix')
    plt.xticks(range(N), cids_to_keep, rotation=90);
    plt.yticks(range(N), cids_to_keep);
    ax.tick_params(axis='both', which='major', labelsize=4)
    _=fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, .75,.8,.85,.90,.95,1])
    # Saved next to the other dumps, named after the dataset.
    plt.savefig(f'../dumps/{ds_name}_fingerprint_similarity_matrix.png', dpi=400)

In [48]:
if plots:
    # Same matrix, with rows and columns reordered by seaborn's hierarchical clustering; no colour bar.
    cg = sns.clustermap(fingerprint_similarity_matrix, cbar_pos=None, figsize=(15, 15))
    plt.xticks(rotation=90)
    fig = cg.fig
    _ = fig.suptitle('RDKIT fingerprint similarity matrix (clustered)')
    # No file extension is given here, so matplotlib chooses the output format itself.
    plt.savefig(f'../dumps/{ds_name}_fingerprint_similarity_matrix_clustered', dpi=400)

In [49]:
def fingerprint_to_np(fp):
    """Convert a bit-vector fingerprint into a 1-D ``uint8`` array of 0/1 values.

    `fp` must provide ``ToBitString()`` returning a string of '0'/'1'
    characters; the array has one element per character.
    """
    return np.fromiter((int(bit) for bit in fp.ToBitString()), dtype=np.uint8)

In [50]:
def add_fingerprint(row):
    """Store the row's fingerprint, as a numpy array, under `rdkit_fingerprint`.

    The fingerprint is looked up by `row.cid` in the module-level
    `cid_to_fingerprint` mapping and converted with `fingerprint_to_np`.
    The row is modified and returned.
    """
    # Item assignment, not `row.rdkit_fingerprint = ...`: pandas turns attribute
    # assignment to a label that does not exist yet into a plain Python attribute,
    # so the fingerprint would never reach the row's data.
    row['rdkit_fingerprint'] = fingerprint_to_np(cid_to_fingerprint[row.cid])
    return row

### 1.4.2 Find and remove duplicates

In [51]:
# Keep each unordered pair once: entries strictly above the diagonal survive, everything else becomes 0.
upper_triangle = np.triu(fingerprint_similarity_matrix, k=1)

In [52]:
# Pairs whose fingerprint similarity exceeds this value are treated as duplicates of each other.
similarity_threshold = .90

In [53]:
# Greedy de-duplication. Each round counts, for every compound not yet removed,
# how many too-similar pairs it takes part in, then removes the compound with the
# highest count (ties go to the larger row index). Stops when no such pair is left.
# `duplicates` holds matrix row indices here, not cids.
duplicates = set()
ix_to_cid = dict(enumerate(cid_to_fingerprint))
while True:
    candidates = {}
    for i in range(len(cid_to_fingerprint)):
        if i in duplicates:
            continue
        for j in np.where(upper_triangle[i] > similarity_threshold)[0]:
            if j in duplicates:
                continue
            candidates[j] = candidates.get(j, 0) + 1
            candidates[i] = candidates.get(i, 0) + 1
    if not candidates:
        break
    _, most_connected = max((count, ix) for ix, count in candidates.items())
    duplicates.add(most_connected)
print('Will remove {} ({:.1%}) compounds.'.format(len(duplicates), len(duplicates)/N))

Will remove 4 (4.4%) compounds.


In [54]:
# Confirm that `ix_to_cid` follows the iteration order of `cid_to_fingerprint`,
# so the row indices collected in `duplicates` translate to the right cids.
for i, (cid1, fps1) in enumerate(cid_to_fingerprint.items()):
    assert cid1 == ix_to_cid[i]

In [55]:
# Translate matrix row indices into cids. This rebinds `duplicates` from indices to cids,
# so the cell is meant to run exactly once after the search above.
duplicates = {ix_to_cid[i] for i in duplicates}

In [56]:
# Drop the duplicate cids, preserving the order of the remaining ones.
cids_to_keep = [cid for cid in cids_to_keep if cid not in duplicates]

In [57]:
# Keep only the rows of the retained cids (the index is not reset here).
df = df.query('cid in @cids_to_keep')

In [58]:
# Rebuild both lookups in `cids_to_keep` order so they stay aligned with each other.
cid_to_rdkit = {cid: cid_to_rdkit[cid] for cid in cids_to_keep}
cid_to_fingerprint = {cid: cid_to_fingerprint[cid] for cid in cids_to_keep}

In [59]:
# The cid list, the molecule lookup and the frame must still agree in size.
assert len(cids_to_keep) == len(cid_to_rdkit) == len(df)

### 1.4.3 Find and remove activity cliffs

In [60]:
# Compound count after duplicate removal; sizes the recomputed similarity matrix.
N = len(df)

In [61]:
# EXPORT
# Recompute the pairwise similarities for the compounds that survived duplicate removal.
fingerprint_similarity_matrix = np.empty((N, N))
remaining_fps = list(cid_to_fingerprint.values())
for i, fp_row in tqdm(enumerate(remaining_fps), total=len(remaining_fps)):
    for j, fp_col in enumerate(remaining_fps):
        fingerprint_similarity_matrix[i, j] = DataStructs.FingerprintSimilarity(fp_row, fp_col)
# Strictly-upper triangle: each unordered pair once, diagonal and lower part set to 0.
upper_triangle = np.triu(fingerprint_similarity_matrix, k=1)

100%|██████████| 87/87 [00:00<00:00, 3401.77it/s]


In [62]:
# Activity-cliff criteria: a pair counts when its similarity exceeds `similarity_threshold`
# and the two activity values differ by more than `activitiy_ratio_threshold`-fold.
# (The misspelled name is kept because another cell refers to it.)
similarity_threshold = .85
activitiy_ratio_threshold = 100 #this is in folds as in 100-times fold.

In [63]:
# cid -> activity value lookup for the retained compounds.
cid_to_acvalue = dict(zip(df.cid, df.acvalue))

In [64]:
# Collect the cids of every pair that is highly similar yet differs in activity
# value by more than `activitiy_ratio_threshold`-fold (an "activity cliff").
irregularities = set()
ix_to_cid = {i: key for i, key in enumerate(cid_to_fingerprint.keys())}
for i, (cid1, fps1) in enumerate(cid_to_fingerprint.items()):
    acvalue1 = cid_to_acvalue[cid1]
    # Only the strictly-upper triangle is non-zero, so each pair is visited once.
    for j in np.where(upper_triangle[i] > similarity_threshold)[0]:
        cid2 = ix_to_cid[j]
        acvalue2 = cid_to_acvalue[cid2]
        # Cross-multiplied form of `acvalue2 / acvalue1 > t or acvalue1 / acvalue2 > t`.
        # It gives the same verdict for positive values but never divides, so an
        # activity value of 0 cannot raise ZeroDivisionError (or produce inf/nan).
        if (acvalue2 > activitiy_ratio_threshold * acvalue1
                or acvalue1 > activitiy_ratio_threshold * acvalue2):
            irregularities.add(cid1)
            irregularities.add(cid2)

print('Will remove {} ({:.1%}) compounds.'.format(len(irregularities), len(irregularities)/N))

Will remove 0 (0.0%) compounds.


In [65]:
# Drop the compounds involved in an activity cliff, preserving the order of the rest.
cids_to_keep = [cid for cid in cids_to_keep if cid not in irregularities]

In [66]:
# Keep only the rows of the retained cids.
df = df.query('cid in @cids_to_keep')

In [67]:
# Rebuild both lookups in `cids_to_keep` order so they stay aligned with each other.
cid_to_rdkit = {cid: cid_to_rdkit[cid] for cid in cids_to_keep}
cid_to_fingerprint = {cid: cid_to_fingerprint[cid] for cid in cids_to_keep}

In [68]:
# Final consistency check before exporting.
assert len(cids_to_keep) == len(cid_to_rdkit) == len(df)

## Exports & Imports

In [69]:
# Build the exported table: drop `smiles` (it no longer reflects the processed molecule),
# attach each compound's fingerprint as a numpy array, then pickle the frame.
df_out = df.drop(columns='smiles')
df_out['morgan_fp'] = df_out.apply(lambda rec: fingerprint_to_np(cid_to_fingerprint[rec.cid]), axis=1)
df_out.to_pickle(f'../processed_data/{ds_name}_processed.pkl')

In [70]:
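# dump cid_to_pubchem for caching purposes (it is reloaded at the top of the notebook)
# dump cid_to_rdkit (the sanitized RDKit molecules) for reuse -- TODO: document the intended consumer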
def save_pickle(obj, filename):
    """Pickle `obj` to ``../dumps/<filename>.pkl``, overwriting any existing file.

    `filename` is given without directory or extension.
    """
    destination = f'../dumps/{filename}.pkl'
    with open(destination, 'wb') as handle:
        pickle.dump(obj, handle)
# Persist the PubChem lookups (the cache that the top of the notebook tries to load)
# and the RDKit molecules of the compounds that passed every filter.
save_pickle(cid_to_pubchem, f'{ds_name}_cid_to_pubchem')
save_pickle(cid_to_rdkit, f'{ds_name}_cid_to_rdkit')