In [1]:
import polaris as po
import numpy as np
from rdkit import Chem
import pandas as pd

import datamol as dm
from molfeat.trans.pretrained import PretrainedDGLTransformer
from molfeat.trans import MoleculeTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import random
import pickle

### Load data and benchmark

In [3]:
benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")
train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)

[32m2024-06-21 12:58:20.042[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m
[32m2024-06-21 12:58:20.054[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m


In [4]:
### Get molecule info, instead of Fingerprint

train_mol, test_mol = benchmark.get_train_test_split(featurization_fn=dm.to_mol)

### Convert to smiles strings

In [5]:
smiles = []
test_smiles = []

for mol in train_mol.X:
    smiles.append(Chem.MolToSmiles(mol))
    
for mol in test_mol.X:
    test_smiles.append(Chem.MolToSmiles(mol))
    
smiles      = pd.Series(smiles)
test_smiles = pd.Series(test_smiles)

In [6]:
test_smiles

0                     O=C1NC(=O)C(c2ccccc2)=C1Nc1ccccc1
1      NS(=O)(=O)c1ccc(Nc2cc(-c3ccc(O)c(F)c3)n[nH]2)cc1
2     CC1(C)CC(n2cnc(-c3ccc(F)cc3)c2-c2ccnc(N)n2)CC(...
3     Nc1nccc(C#Cc2cncnc2Nc2ccc(OCc3cccc(F)c3)c(Cl)c...
4                     COc1ccc(Nc2ncc(-c3ccccc3)o2)cc1OC
                            ...                        
82    O=C1NC(=O)C(c2cccc(Cl)c2)=C1Nc1ccc(Cl)c(C(=O)O)c1
83            O=C(Nc1n[nH]c2nc(-c3ccco3)c(Br)cc12)C1CC1
84    CCn1cc(-c2ccnc3[nH]ccc23)c(-c2cccc(NC(=O)Nc3cc...
85    CCOc1ccc2c(-c3ccnc(Nc4cc(OC)cc(C(F)(F)F)c4)n3)...
86                        c1ccc(Nc2ncc(-c3ccccc3)o2)cc1
Length: 87, dtype: object

### Featurize

In [7]:
transformer = MoleculeTransformer(featurizer='mordred', dtype=float)

X_features     = transformer(smiles)
Xtest_features = transformer(test_smiles)

In [8]:
## There might be nan values, RF will handle these

In [9]:
X_features_imputed     = X_features
Xtest_features_imputed = Xtest_features

In [10]:
from sklearn.ensemble import RandomForestClassifier

models = {target: RandomForestClassifier(max_depth=None) for target in benchmark.target_cols}
X = train.X

for target, model in models.items():
  y = train.y[target]
  mask = ~np.isnan(y)
  model.fit(X_features_imputed[mask], y[mask])

y_prob = {target: model.predict_proba(Xtest_features_imputed)[:, 1] for target, model in models.items()}
y_pred = {target: model.predict(Xtest_features_imputed) for target, model in models.items()}

results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)

In [11]:
results

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6206896552
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.0
test,CLASS_KIT_(T6701_mutant),roc_auc,0.5807240705
test,CLASS_KIT_(V560G_mutant),roc_auc,0.9005555556
test,CLASS_KIT,roc_auc,0.8821548822
test,CLASS_KIT_(T6701_mutant),pr_auc,0.3409586009

0,1
slug,polaris
external_id,org_2gtoaJIVrgRqiIR8Qm5BnpFCbxu
type,organization

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.8390804598
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6206896552
test,CLASS_KIT_(T6701_mutant),f1,0.0
test,CLASS_KIT_(V560G_mutant),f1,0.0
test,CLASS_KIT,f1,0.0
test,CLASS_KIT_(T6701_mutant),roc_auc,0.5807240705
test,CLASS_KIT_(V560G_mutant),roc_auc,0.9005555556
test,CLASS_KIT,roc_auc,0.8821548822
test,CLASS_KIT_(T6701_mutant),pr_auc,0.3409586009


In [12]:
str_names = [str(x) for x  in results.results['Metric']]

idxs = [i for i, x in enumerate(str_names) if 'pr_auc' in x]
sub_df = results.results.iloc[idxs]
test_names = list(sub_df['Target label'])
test_names = [x + f' ({tt})' for tt, x in enumerate(test_names)]
test_values = list(sub_df['Score'])
test_values

[0.3409586008876442, 0.7571248196248197, 0.8178912856378528]

In [16]:
with open(f"T25_KIT_WT_V560G_pred.pickle", 'wb') as handle:
    pickle.dump(y_pred, handle)
    
with open(f"T25_KIT_WT_V560G_prob.pickle", 'wb') as handle:
    pickle.dump(y_prob, handle)