Note: this notebook is meant to be run in the environment defined by `env.yml`, whose name is `polaris_datasets`.

In [1]:
import pandas as pd
import datamol as dm

import os
import sys
import pathlib

# utils
# `__file__` is not defined inside a notebook, so the literal string "__file__" is used as a
# placeholder file name: `.absolute()` anchors it to the current working directory, and
# `parents[2]` is then the directory two levels above the working directory.
# Because that lookup depends on the working directory, and `os.chdir` changes it right
# below, the setup is only performed the first time the cell runs in a kernel. Re-running
# it unguarded would move `root` two levels further up and add another `sys.path` entry.
if "root" not in globals():
    root = pathlib.Path("__file__").absolute().parents[2]
    os.chdir(root)  # relative paths now resolve from `root`
    sys.path.insert(0, str(root))  # modules located under `root` become importable

In [2]:
# Identity of the dataset being curated.
org = "novartis"
data_name = "CYP"
# Path of this dataset's folder under `root`.
dirname = dm.fs.join(root, f"org-{org}", data_name)
# Root of this dataset's files in the `polaris-public` bucket.
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"


# Location of the raw training set. The file is read into `data_train` by the loading cell
# further down; it is deliberately not read here as well, since that would fetch the same
# remote file twice into a `data` variable that is overwritten before it is ever used.
source_data_path = f"{gcp_root}/data/raw/surrogate_training_set_public.parquet"

We can get the dataset directly from ChEMBL (https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/ gives the overall experimental data information, and subsequent numbers give data for a specific endpoint). The following map gives information on each:

In [4]:
# load data
# Read the raw training set from `source_data_path` into `data_train`.
data_train = pd.read_parquet(source_data_path)
# Bare last expression: the notebook renders the frame as the cell output.
data_train

Unnamed: 0,Structure,log_kobs,pIC50_CYP3A4,pIC50_CYP2C9,pIC50_CYP2D6,log_kobs_model_0,log_kobs_model_1,log_kobs_model_2,log_kobs_model_3,log_kobs_model_4,...,pIC50_CYP2D6_model_1,pIC50_CYP2D6_model_2,pIC50_CYP2D6_model_3,pIC50_CYP2D6_model_4,pIC50_CYP2D6_model_5,pIC50_CYP2D6_model_6,pIC50_CYP2D6_model_7,pIC50_CYP2D6_model_8,pIC50_CYP2D6_model_9,Source
0,CCCCc1nc2cc(/C=C/C(=O)NO)ccc2n1CCN(CC)CC.Cl.Cl,-2.131,4.957,4.733,4.823,-2.145,-1.988,-1.912,-2.347,-2.049,...,4.728,4.970,4.806,4.834,4.769,4.993,4.855,4.770,4.741,ChEMBL
1,CCCCc1ccc(N/C=N/O)cc1,-2.300,4.597,4.857,4.716,-2.345,-2.287,-2.399,-2.354,-2.209,...,4.737,4.684,4.614,4.852,4.603,4.737,4.732,4.694,4.901,ChEMBL
2,C[C@H](Nc1cc(-c2sc(C3CCN(C)CC3)nc2-c2ccc(F)cc2...,-1.940,4.740,4.686,4.883,-1.935,-1.995,-1.863,-2.123,-1.896,...,4.807,4.914,4.870,4.873,4.865,4.911,4.918,4.809,5.019,ChEMBL
3,CC(C)c1nc(-c2ccc(F)cc2)c(-c2ccc3nc(N)n(S(=O)(=...,-2.151,4.453,4.543,4.525,-2.220,-2.235,-2.149,-2.219,-2.112,...,4.497,4.470,4.533,4.547,4.578,4.443,4.625,4.477,4.492,ChEMBL
4,CC(C)S(=O)(=O)n1c(N)nc2ccc(-c3[nH]c(C(C)(C)C)n...,-2.283,4.372,4.432,4.463,-2.379,-2.294,-2.291,-2.323,-2.256,...,4.414,4.323,4.524,4.527,4.545,4.367,4.571,4.422,4.411,ChEMBL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16368,c1cc2c3c(c1)ccc1cccc(c13)C2,-1.773,5.450,4.925,5.054,-1.820,-1.939,-1.548,-1.693,-1.858,...,4.841,5.648,4.886,4.872,5.146,4.817,4.800,4.986,5.085,PubChem AID 884
16369,CC(=O)OC1CC2C(C)(C)C(O)C=C[C@@]2(C)C2CC[C@@]3(...,-1.622,5.840,4.694,4.692,-1.842,-1.648,-1.376,-2.188,-1.422,...,4.695,4.786,4.726,4.694,4.622,4.638,4.781,4.760,4.640,PubChem AID 884
16370,COc1c(O)cc2c(-c3ccccc3)cc(=O)oc2c1OC,-1.989,4.727,5.082,4.697,-1.894,-1.986,-2.002,-2.020,-2.018,...,4.701,4.695,4.733,4.659,4.654,4.682,4.736,4.748,4.661,PubChem AID 884
16371,CCN(CC)CCOC(=O)C(Cc1cccc2ccccc12)CC1CCCO1.O=C(...,-2.248,4.704,4.719,4.730,-2.169,-2.237,-2.160,-2.305,-2.272,...,4.723,4.746,4.764,4.726,4.719,4.695,4.735,4.676,4.787,PubChem AID 884


If we look at the columns of this raw dataframe, we see that many of them are not needed here (for example the per-model `*_model_N` columns). Below we keep only the structure column and the four endpoint columns.

In [5]:
# Name of the column that holds the molecule SMILES strings.
mol_col = "Structure"

# Structure column plus the four endpoint columns; everything else is dropped.
columns_to_keep = [
    mol_col,
    "log_kobs",
    "pIC50_CYP3A4",
    "pIC50_CYP2C9",
    "pIC50_CYP2D6",
]

# Select those columns into a new frame and tag every row as part of the training split.
data_train = data_train.loc[:, columns_to_keep].assign(split="train")

In [6]:
# # Map the thresholds
# data_train = _class_conversion(data_train, ["log_kobs"], {"log_kobs": {"thresholds": [np.log10(0.01), np.log10(0.025)], "label_order": "ascending"}} , prefix="CLASS")
# data_train = data_train.rename(columns={"CLASSlog_kobs": "CLASS_CYP3A4"})

In [7]:
# Names of the endpoint (target) columns, one per line for easier diffing.
data_cols = [
    "log_kobs",
    "pIC50_CYP3A4",
    "pIC50_CYP2C9",
    "pIC50_CYP2D6",
]

In [8]:
import numpy as np

# Load the test set and add the two columns the training frame already has.
test_set_path = f"{gcp_root}/data/raw/test_set_Zimmerlin2011.parquet"
data_test = pd.read_parquet(test_set_path).assign(
    # mark every row as belonging to the test split
    split="test",
    # log10 of `kobs`, so the test set carries the same `log_kobs` column as the training set
    log_kobs=lambda frame: frame["kobs"].apply(np.log10),
)

In [9]:
# # Map the thresholds
# data_test = _class_conversion(data_test, ["kobs"], {"kobs": {"thresholds": [0.01, 0.025], "label_order": "ascending"}} , prefix="CLASS")
# data_test = data_test.rename(columns={"CLASSkobs": "CLASS_CYP3A4"})

In [10]:
# Print the frame shape before and after discarding rows whose SMILES entry is missing.
print(data_test.shape)
test_has_smiles = data_test[mol_col].notna()
data_test = data_test[test_has_smiles]
print(data_test.shape)

(368, 5)
(368, 5)


In [11]:
# Stack the training rows on top of the test rows and renumber the index from zero.
# Columns present in only one of the two frames are filled with NaN for the other.
splits = [data_train, data_test]
data = pd.concat(splits, ignore_index=True)

In [12]:
# Display the merged train + test frame as the cell output.
data

Unnamed: 0,Structure,log_kobs,pIC50_CYP3A4,pIC50_CYP2C9,pIC50_CYP2D6,split,kobs,Non-proprietary name
0,CCCCc1nc2cc(/C=C/C(=O)NO)ccc2n1CCN(CC)CC.Cl.Cl,-2.131000,4.957,4.733,4.823,train,,
1,CCCCc1ccc(N/C=N/O)cc1,-2.300000,4.597,4.857,4.716,train,,
2,C[C@H](Nc1cc(-c2sc(C3CCN(C)CC3)nc2-c2ccc(F)cc2...,-1.940000,4.740,4.686,4.883,train,,
3,CC(C)c1nc(-c2ccc(F)cc2)c(-c2ccc3nc(N)n(S(=O)(=...,-2.151000,4.453,4.543,4.525,train,,
4,CC(C)S(=O)(=O)n1c(N)nc2ccc(-c3[nH]c(C(C)(C)C)n...,-2.283000,4.372,4.432,4.463,train,,
...,...,...,...,...,...,...,...,...
16736,NC2C=CN([C@H]1CC[C@@H](CO)O1)C(=O)N=2,-2.221849,,,,test,0.006,Zalcitabine
16737,COc1cc(ccc1OC(F)F)C2C=CC(=O)NN=2,-2.096910,,,,test,0.008,Zardaverine
16738,CC2=CN([C@H]1C[C@H](N=[N+]=[N-])[C@@H](CO)O1)C...,-2.096910,,,,test,0.008,Zidovudine
16739,CC(C2=Cc1ccccc1S2)N(O)C(N)=O,-2.301030,,,,test,0.005,Zileuton


In [13]:
# Keep only rows that have a structure, then renumber the index from zero.
rows_with_structure = data[mol_col].notna()
data = data[rows_with_structure].reset_index(drop=True)

In [14]:
# Destination of the combined train/test table, under `data/raw` of `gcp_root`.
fout = f"{gcp_root}/data/raw/train_test.parquet"
# Write the frame as parquet without storing the DataFrame index.
data.to_parquet(fout, index=False)

In [15]:
# Show the output path.
# NOTE(review): the saved output of this cell reads 'org-novatis', which does not match
# `org = "novartis"` set earlier in the notebook; the output looks stale. Confirm which
# spelling the bucket actually uses and re-run.
fout

'gs://polaris-public/polaris-recipes/org-novatis/CYP/data/raw/train_test.parquet'