In [1]:
import pandas as pd
import numpy as np
import dill
import rdkit.Chem as Chem
from utils import standardize, set_seeds

import rdkit.Chem as Chem
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.Descriptors import CalcMolDescriptors
import ray

set_seeds(42)

In [2]:
# ChEMBL-model scores for the Enamine liquid-stock compounds. The id/score
# columns are renamed so they line up with the vendor file, and the SMILES
# columns are dropped from this frame before merging.
chembl_scored_enamine_df = (
    pd.read_csv('./datasets/enamine_liquid_stock_chembl_scored.csv')
    .rename(columns={'id': 'CODE', 'model_score': 'chembl_model_score'})
    .drop(columns=['standard_smiles', 'smiles'])
)

# Tab-separated vendor export.
original_enamine_df = pd.read_csv(
    './datasets/Enamine_Liquid-Stock-Collection_879561cmpds_20240925_filter_wash.txt',
    sep='\t',
)

# Default (inner) merge on the shared compound code.
combined_df = pd.merge(chembl_scored_enamine_df, original_enamine_df, on='CODE')

# Sanity check: every merged row must carry the same identifier in both columns.
codes_match = combined_df['CODE'] == combined_df['Molecule Name']
assert codes_match.all()

In [3]:
def _load_dill_model(path):
    """Open ``path`` in binary mode and return the object dill deserialises from it."""
    # NOTE(review): dill/pickle deserialisation executes code stored in the
    # file; only load model files that come from a trusted source.
    with open(path, 'rb') as model_file:
        return dill.load(model_file)


liver_model = _load_dill_model('./saved_models/liver.pkl')
parasite_model = _load_dill_model('./saved_models/parasite.pkl')

In [4]:
def add_features(row):
    """Return a copy of ``row`` extended with the output of ``CalcMolDescriptors``.

    Args:
        row: Mapping for one record; must contain an ``"inchi"`` entry that is
            passed to ``Chem.MolFromInchi``.

    Returns:
        A new dict holding every key of ``row`` followed by every key of the
        descriptor mapping. On a key collision the descriptor value wins.
        ``row`` itself is not modified.
    """
    # RDKit logging is blocked while parsing and computing descriptors.
    with BlockLogs():
        molecule = Chem.MolFromInchi(row["inchi"])
        descriptors = CalcMolDescriptors(molecule)

    enriched = dict(row)
    enriched.update(descriptors)
    return enriched
    

def _score_batch(batch, model, prefix):
    """Score one pandas batch with ``model`` and store results under ``prefix``.

    Shared implementation behind ``parasite_model_score`` and
    ``liver_model_score``, which previously were copies differing only in the
    column prefix.

    Args:
        batch: pandas DataFrame with an ``"inchi"`` column plus every column
            named in ``model.feature_names_in_``. It is modified in place.
        model: Fitted estimator exposing ``feature_names_in_``,
            ``predict_proba``, ``predict`` and ``best_threshold_``.
        prefix: Column-name prefix, e.g. ``"parasite"`` or ``"liver"``.

    Returns:
        The same ``batch`` object with a ``"mol"`` column and three new columns:
        ``<prefix>_prediction_probs`` (column 1 of ``predict_proba``, presumably
        the positive class), ``<prefix>_prediction`` and ``<prefix>_threshold``.
        If scoring raises ``ValueError`` the three columns are filled with
        ``None`` for the whole batch and a warning is emitted.
    """
    import warnings

    probs_col = f"{prefix}_prediction_probs"
    pred_col = f"{prefix}_prediction"
    threshold_col = f"{prefix}_threshold"

    with BlockLogs():
        # Parsed molecule per row; the column stays on the batch so it is
        # available if the model's feature list refers to it.
        batch["mol"] = batch["inchi"].map(Chem.MolFromInchi)

    try:
        # Select the feature frame once and reuse it for both model calls.
        features = batch[model.feature_names_in_]
        prediction_probs = model.predict_proba(features)
        predictions = model.predict(features)
        batch[probs_col] = prediction_probs[:, 1]
        batch[pred_col] = predictions
        batch[threshold_col] = model.best_threshold_
        return batch
    except ValueError as err:
        # Best effort: a batch that cannot be scored must not abort the whole
        # run, but the reason is reported instead of being silently dropped.
        warnings.warn(
            f"{prefix} model scoring failed for a batch of {len(batch)} rows; "
            f"predictions set to None: {err}",
            RuntimeWarning,
        )
        batch[probs_col] = None
        batch[pred_col] = None
        batch[threshold_col] = None
        return batch


def parasite_model_score(batch, parasite_model):
    """Add ``parasite_*`` prediction columns to ``batch`` (see ``_score_batch``)."""
    return _score_batch(batch, parasite_model, "parasite")


def liver_model_score(batch, liver_model):
    """Add ``liver_*`` prediction columns to ``batch`` (see ``_score_batch``)."""
    return _score_batch(batch, liver_model, "liver")

In [5]:
# NOTE(review): Dataset.repartition() takes the target *number of blocks*, so
# this yields about len(combined_df) // 16 blocks of ~16 rows each. The name
# `bsize` suggests a block size may have been intended -- confirm. The max()
# guard keeps the count valid when the frame has fewer than 16 rows.
bsize = max(1, len(combined_df) // 16)
ds = ray.data.from_pandas(combined_df).repartition(bsize)

# Row-wise stages: copy 'Smiles' to 'smiles', derive 'inchi' via standardize(),
# then append the columns produced by add_features.
ds = (
    ds
    .map(lambda row: {**row, 'smiles': row['Smiles']})
    .map(lambda row: {**row, 'inchi': standardize(row['smiles'])})
    .map(add_features)
)

# Batch-wise stages: each scorer receives a pandas batch plus its model.
ds = ds.map_batches(
    parasite_model_score,
    fn_args=[parasite_model],
    batch_format="pandas",
)

ds = ds.map_batches(
    liver_model_score,
    fn_args=[liver_model],
    batch_format="pandas",
)

# Output schema: every merged input column except the ChEMBL score, followed by
# the six columns written by the two scorers.
relevant_cols = combined_df.drop(['chembl_model_score'], axis=1).columns.tolist() + [
    'parasite_prediction_probs',
    'parasite_prediction',
    'parasite_threshold',
    'liver_prediction_probs',
    'liver_prediction',
    'liver_threshold'
]

# Pure column selection; the batch is not mutated.
ds = ds.map_batches(
    lambda batch, cols: batch[cols],
    fn_args=[relevant_cols],
    batch_format="pandas",
    zero_copy_batch=True
)

2025-01-29 19:43:44,306	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265


In [6]:
# Collect the Ray dataset into a single in-memory pandas DataFrame.
df = ds.to_pandas()

# Save the scored table; pandas' default settings are used, so the index is
# written as well.
df.to_csv(path_or_buf='./enamine_scores.csv')

2025-01-29 19:43:46,990	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-01-29_19-43-43_040694_47318/logs/ray-data
2025-01-29 19:43:47,020	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[Map(<lambda>)->Map(<lambda>)->Map(add_features)->MapBatches(parasite_model_score)->MapBatches(liver_model_score)->MapBatches(<lambda>)]


Running 0: 0.00 row [00:00, ? row/s]

- Repartition 1: 0.00 row [00:00, ? row/s]

Split Repartition 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]



- Map(<lambda>)->Map(<lambda>)->Map(add_features)->...->MapBatches(<lambda>) 3: 0.00 row [00:00, ? row/s]



In [10]:
# Count rows predicted True by the parasite model and False by the liver model.
#
# `series is True` is an identity test on the Series object itself and is
# always the plain bool False, so it cannot build a row mask. Compare
# element-wise instead. Rows whose batch failed scoring hold None in these
# columns and match neither comparison.
parasite_is_true = df['parasite_prediction'].eq(True)
liver_is_false = df['liver_prediction'].eq(False)
len(df[parasite_is_true & liver_is_false])

13230