In [1]:
import polaris as po
import numpy as np
from rdkit import Chem
import pandas as pd
import random
import pickle
import datamol as dm
from molfeat.trans.pretrained import PretrainedDGLTransformer
from molfeat.trans import MoleculeTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Load data and benchmark

In [2]:
# Fetch the benchmark definition by its Polaris identifier.
benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")

# Split into train/test, featurizing every molecule with datamol's
# fingerprint function so that `train.X` / `test.X` hold model-ready features.
train, test = benchmark.get_train_test_split(
    featurization_fn=dm.to_fp,
)

[32m2024-06-21 13:32:15.944[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m
[32m2024-06-21 13:32:15.950[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m


In [3]:
### Get RDKit molecule objects instead of fingerprints

#train_mol, test_mol = benchmark.get_train_test_split(featurization_fn=dm.to_mol)

### Convert to SMILES strings

In [4]:
# smiles = []
# test_smiles = []

# for mol in train_mol.X:
#     smiles.append(Chem.MolToSmiles(mol))
    
# for mol in test_mol.X:
#     test_smiles.append(Chem.MolToSmiles(mol))
    
# smiles      = pd.Series(smiles)
# test_smiles = pd.Series(test_smiles)

In [5]:
# test_smiles

### Featurize

In [6]:
# transformer = MoleculeTransformer(featurizer='mordred', dtype=float)

# X_features     = transformer(smiles)
# Xtest_features = transformer(test_smiles)

In [7]:
## There might be NaN values in the features; the HistGradientBoostingClassifier used below handles missing values natively

In [8]:
# No imputation is actually performed here: despite their names, these two
# variables simply alias the featurized arrays returned by the benchmark split
# (the custom-descriptor path that would have produced X_features /
# Xtest_features is commented out earlier in the notebook).
X_features_imputed, Xtest_features_imputed = train.X, test.X

In [9]:
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Fixed seed passed to every estimator so that any internal randomness is
# pinned and a Restart & Run All reproduces the same models.
SEED = 42

# One independent classifier per benchmark target column.
models = {
    target: HistGradientBoostingClassifier(random_state=SEED)
    for target in benchmark.target_cols
}
# NOTE(review): `X` is not read anywhere in the visible cells; it is kept only
# because a later cell might rely on the name. Remove it if nothing does.
X = train.X

for target, model in models.items():
    y = train.y[target]
    # Rows whose label for this target is NaN are excluded from fitting, so
    # each model may be trained on a different subset of the training set.
    mask = ~np.isnan(y)
    model.fit(X_features_imputed[mask], y[mask])

# predict_proba returns one column per class; column 1 is taken as the score
# of the positive class (presumably label 1 -- confirm against model.classes_).
y_prob = {
    target: model.predict_proba(Xtest_features_imputed)[:, 1]
    for target, model in models.items()
}
# Hard class predictions for the same test features.
y_pred = {
    target: model.predict(Xtest_features_imputed)
    for target, model in models.items()
}

# Score both the hard predictions and the probabilities with the benchmark's
# own evaluation routine.
results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)

In [10]:
results

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.9310344828
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6896551724
test,CLASS_KIT_(T6701_mutant),f1,0.7272727273
test,CLASS_KIT_(V560G_mutant),f1,0.1428571429
test,CLASS_KIT,f1,0.4905660377
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7818003914
test,CLASS_KIT_(V560G_mutant),roc_auc,0.6933333333
test,CLASS_KIT,roc_auc,0.8187429854
test,CLASS_KIT_(T6701_mutant),pr_auc,0.7160258127

0,1
slug,polaris
external_id,org_2gtoaJIVrgRqiIR8Qm5BnpFCbxu
type,organization

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.9310344828
test,CLASS_KIT_(V560G_mutant),accuracy,0.8620689655
test,CLASS_KIT,accuracy,0.6896551724
test,CLASS_KIT_(T6701_mutant),f1,0.7272727273
test,CLASS_KIT_(V560G_mutant),f1,0.1428571429
test,CLASS_KIT,f1,0.4905660377
test,CLASS_KIT_(T6701_mutant),roc_auc,0.7818003914
test,CLASS_KIT_(V560G_mutant),roc_auc,0.6933333333
test,CLASS_KIT,roc_auc,0.8187429854
test,CLASS_KIT_(T6701_mutant),pr_auc,0.7160258127


In [11]:
# Persist the per-target predictions and probabilities to the current working
# directory. The file names are plain literals (the original used f-strings
# with no placeholders), and a single loop replaces the duplicated `with` blocks.
for filename, payload in (
    ("T25_KIT_WT_T6701_pred.pickle", y_pred),
    ("T25_KIT_WT_T6701_prob.pickle", y_prob),
):
    with open(filename, "wb") as handle:
        pickle.dump(payload, handle)

In [12]:
y_prob

{'CLASS_KIT_(T6701_mutant)': array([6.96396067e-05, 2.08252784e-04, 2.33169443e-06, 6.68561408e-04,
        1.05360809e-03, 3.20995032e-06, 2.49468393e-04, 3.02728730e-03,
        2.05598002e-04, 1.88392418e-06, 1.23246734e-03, 3.28950701e-06,
        1.00831730e-06, 1.25245750e-03, 1.67796448e-05, 1.69976669e-01,
        1.77409142e-04, 8.62322610e-03, 6.96576760e-04, 9.03549202e-06,
        1.04587985e-04, 3.78654715e-04, 2.04230634e-06, 5.37012973e-01,
        3.72483645e-06, 7.82994371e-05, 2.25479765e-03, 8.67387462e-05,
        2.82153570e-06, 3.44877816e-05, 2.17191103e-02, 1.76800301e-06,
        9.72919897e-01, 1.34195758e-04, 1.10722132e-03, 4.68791108e-04,
        8.45449951e-05, 4.48476480e-05, 5.55316727e-04, 7.52486324e-05,
        1.80977720e-05, 9.29763156e-01, 3.17391264e-05, 1.34195758e-04,
        1.38665266e-01, 5.07643012e-06, 2.85607536e-05, 1.12695048e-04,
        1.54400840e-06, 6.25081202e-06, 7.98468389e-06, 6.29474791e-01,
        8.41895550e-06, 3.20284818e-

In [13]:
y_pred

{'CLASS_KIT_(T6701_mutant)': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 'CLASS_KIT_(V560G_mutant)': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 'CLASS_KIT': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,