In [1]:
from tdc.benchmark_group import admet_group
from rdkit import Chem 
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

from tdc.single_pred import ADME
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors
from rdkit import RDLogger
# Turn off RDKit log output for every logger matching 'rdApp.*' so that
# per-molecule messages do not flood the cell outputs below.
RDLogger.DisableLog('rdApp.*')

In [2]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import GradientBoostingRegressor

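### Submission method, adapted from MapLight

The seed loop and the `evaluate_many` call in the benchmark cell below follow the structure of the MapLight-TDC submission notebook:

https://github.com/maplightrx/MapLight-TDC/blob/main/submission.ipynb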

In [3]:
# Benchmarks to evaluate, keyed by the name that is handed to `group.get(...)`.
# NOTE(review): the value tuple is not read anywhere in the visible cells; it
# looks like (task type, flag) carried over from the MapLight config — confirm
# the meaning of the boolean before relying on it.
benchmark_config = dict(
    ppbr_az=('regression', False),
)

In [4]:
# Featurisation / evaluation settings used by this cell.
MORGAN_RADIUS = 1          # Morgan fingerprint radius
MORGAN_NBITS = 2048        # length of the folded Morgan bit vector
SEEDS = [1, 2, 3, 4, 5]    # one model (and one prediction set) per seed


def _parse_smiles(smiles):
    """Convert a Series of SMILES strings into a Series of RDKit molecules.

    Raises:
        ValueError: if any SMILES cannot be parsed (RDKit returns None), so
            the failure is reported here rather than as an opaque argument
            error inside a later fingerprint/descriptor call.
    """
    mols = smiles.apply(lambda smi: Chem.MolFromSmiles(smi))
    n_failed = int(mols.isna().sum())
    if n_failed:
        raise ValueError('{} SMILES could not be parsed by RDKit'.format(n_failed))
    return mols


def _morgan_fingerprints(mols):
    """Return one Morgan bit-vector fingerprint per molecule."""
    return [
        AllChem.GetMorganFingerprintAsBitVect(mol, MORGAN_RADIUS, nBits=MORGAN_NBITS)
        for mol in mols
    ]


def _tanimoto_to_reference(query_fps, reference_fps):
    """Return the (len(query_fps), len(reference_fps)) Tanimoto similarity matrix."""
    # BulkTanimotoSimilarity compares one fingerprint against the whole
    # reference list in a single call, replacing the inner Python loop.
    return np.array(
        [DataStructs.BulkTanimotoSimilarity(fp, reference_fps) for fp in query_fps],
        dtype=float,
    )


def _rdkit_descriptors(mols):
    """Return the RDKit descriptor table (one row per molecule) as an array."""
    return np.array(pd.DataFrame([Descriptors.CalcMolDescriptors(mol) for mol in mols]))


group = admet_group(path = 'data/')

# Only the first configured benchmark is run.
for admet_benchmark in [list(benchmark_config.keys())[0]]:
    # ------------------------------------------------------------------ #
    # Seed-independent work: load the data and build the feature matrices
    # once, instead of repeating it for every seed.
    # ------------------------------------------------------------------ #
    benchmark = group.get(admet_benchmark)
    name = benchmark['name']
    training, test = benchmark['train_val'], benchmark['test']
    training['mol'] = _parse_smiles(training['Drug'])
    test['mol'] = _parse_smiles(test['Drug'])

    fingerprints = _morgan_fingerprints(training['mol'])
    test_fingerprints = _morgan_fingerprints(test['mol'])

    # Every molecule is described by its similarity to each training molecule
    # (train x train and test x train) ...
    similarity_matrix = _tanimoto_to_reference(fingerprints, fingerprints)
    test_similarity_matrix = _tanimoto_to_reference(test_fingerprints, fingerprints)

    # ... concatenated column-wise with its RDKit descriptors.
    X_train_features = np.concatenate(
        (similarity_matrix, _rdkit_descriptors(training['mol'])), axis=1)
    X_test_features = np.concatenate(
        (test_similarity_matrix, _rdkit_descriptors(test['mol'])), axis=1)

    # ------------------------------------------------------------------ #
    # Seed-dependent work: transformers and the regressor.
    # ------------------------------------------------------------------ #
    predictions_list = []
    for seed in tqdm(SEEDS):
        # Map the target to a normal distribution; fitted on training labels
        # only. ravel() gives the 1-D target that the regressor expects.
        qt = QuantileTransformer(
            random_state=seed,
            output_distribution='normal',
        )
        y_train = qt.fit_transform(training['Y'].to_frame()).ravel()

        # Same transform for the features, fitted on training rows only.
        scaler = QuantileTransformer(
            random_state=seed,
            output_distribution='normal',
        )
        X_train = scaler.fit_transform(X_train_features)
        X_test = scaler.transform(X_test_features)

        regr = GradientBoostingRegressor(
            random_state=seed,
        )
        regr.fit(X_train, y_train)

        # Predictions are in the transformed target space; map them back to
        # the original scale before evaluation.
        y_pred = regr.predict(X_test).reshape(-1, 1)
        y_pred_test = qt.inverse_transform(y_pred).reshape(-1)

        predictions_list.append({name: y_pred_test})

    results = group.evaluate_many(predictions_list)
    print('\n\n{}'.format(results))

Found local copy...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [10:39<00:00, 127.91s/it]



{'ppbr_az': [7.441, 0.024]}



