In [2]:
import pandas as pd
import deepchem as dc
import numpy as np

from rdkit import Chem
from rdkit.Chem import Descriptors


from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, r2_score

from matplotlib import pyplot as plt

import joblib
import os

from data_transformers import SMILESTransformer

2024-03-12 12:20:53.553401: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [3]:
#select properties of interest
loaders = {'solubility_delaney':dc.molnet.load_delaney, 'lipophilicity_lipo': dc.molnet.load_lipo}

In [6]:
#model type to use for prediction
model = 'rf'

#loop over properties
for l, loader in loaders.items():
    
    print('dataset:',l)
    
    prop = l.split('_')[0]
    dataset = l.split('_')[1]
    
    print('Loading...')
    tasks, datasets, transformers = loader(reload=False)
    (train, val, test) = datasets

    #scikit learn model
    if model == 'rf':
        mdl = RandomForestRegressor()    
    
    
    #pipeline to compute descriptors from SMILES and fit a regression model
    pipe = Pipeline(steps=[
                       ('descriptors', SMILESTransformer()),    
                       ('reg', mdl)
                ])


    #fit model
    print('Training...')
    pipe.fit(train.ids, train.y)

    #predict properties
    print('Predicting...')
    ytrain_pred = pipe.predict(train.ids)
    yval_pred = pipe.predict(val.ids)
    ytest_pred = pipe.predict(test.ids)

    #create dataframes of SMILES, true values, and predictions
    train_df = pd.DataFrame({'SMILES':train.ids,
                            'label':train.y.flatten(),
                            'pred':ytrain_pred})

    val_df = pd.DataFrame({'SMILES':val.ids,
                            'label':val.y.flatten(),
                            'pred':yval_pred})
    test_df = pd.DataFrame({'SMILES':test.ids,
                            'label':test.y.flatten(),
                            'pred':ytest_pred})

    #print metrics
    print(f"RMSE: {np.sqrt(mean_squared_error(test.y, ytest_pred))}, R2: {r2_score(test.y, ytest_pred)}\n")
    
    #create file structure for property data
    path = 'property_data'
    for dir in ['',prop, dataset, model, 'model']:
        path += f'/{dir}'
        if not os.path.exists(path):
            os.mkdir(path)
    
    outpath = f'property_data/{prop}/{dataset}/{model}/'
        
    print('Saving...')
    #save data
    train_df.to_csv(f'{outpath}/train.csv',index=False)
    val_df.to_csv(f'{outpath}/val.csv',index=False)
    test_df.to_csv(f'{outpath}/test.csv',index=False)
    
    #save trained model
    joblib.dump(pipe, f'{outpath}/model/{model}_{prop}_model.joblib')
    

dataset: solubility_delaney
Loading...
Training...


  return fit_method(estimator, *args, **kwargs)


Predicting...
RMSE: 0.43725919024799625, R2: 0.8182304607806024

Saving...
dataset: lipophilicity_lipo
Loading...
Training...


  return fit_method(estimator, *args, **kwargs)


Predicting...
RMSE: 0.656741908331252, R2: 0.4809978242470512

Saving...
