## Import and Configure Everything We Need

In [22]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import torch
import deepchem as dc

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation

# In many cases NaN
not_used_desc = ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge']

# Create a descriptor calculator for all RDKit descriptors except the ones above
desc_calc = MolecularDescriptorCalculator([x for x in [x[0] for x in Descriptors.descList] if x not in not_used_desc])

## Defining Cross-Validation Class

In [23]:
class CVRegressor:
    """
    Regressor that predicts based on predictions of k models from k-fold CV.
    Accepts any Scikit-learn-like regressor as base regressor. It trains k models
    by doing k-fold CV and stores the individual models. Predictions
    on new samples are done by calculating mean predictions from all models.
    
    Parameters
    ----------
    est : Any
        Scikit-learn (-like) regressor object. Must contain .fit() and .predict() methods.
    params : Dict[str, Any]
        Regressor parameters
    n_folds : int
        Number of folds for k-fold
    shuffle : bool
        Shuffling of data for CV
    """
    __slots__ = ('est', 'params', 'models', 'n_folds', 'shuffle', 'cv_scores')

    def __init__(self, est: Any, params: Dict[str, Any], n_folds: int = 5, shuffle: bool = True, num_epochs: int = 10):
        self.est = est
        self.params = params
        self.models = []
        self.n_folds = n_folds
        self.shuffle = shuffle
        self.cv_scores = ddict(list)
        self.num_epochs = num_epochs
        
    def train_func(self, model, x_data: torch.Tensor, y_data: torch.Tensor):
        dataset = torch.utils.data.TensorDataset(x_data, y_data)
        trainloader = torch.utils.data.DataLoader(dataset, batch_size=5)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

        for epoch in range(0, self.num_epochs):
            for i, data in enumerate(trainloader, 0):
                inputs, targets = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_function(outputs, targets)
                loss.backward()
                optimizer.step()
        return model
            
    def fit(self, x_data: torch.Tensor, y_data: torch.Tensor, scoring_funcs: List=(), random_state: int=None) -> None:
        """
        Build a regressor consisting of k-models.
        
        Parameters
        ----------
        x_data : torch.tensor
            Training data
        y_data : torch.tensor
            Target values
        scoring_funcs : list
            List of scoring functions to use for evaluating cross-validation results
        random_state : int
            Integer to use for seeding the k-fold split
        """

        kf = KFold(n_splits=self.n_folds, shuffle=self.shuffle, random_state=random_state)
        kf = kf.split(X=x_data, y=y_data)

        # Fit k models and store them
        for train_index, test_index in kf:
            est_trained = train_func(self.est(**self.params), x_data[train_index], y_data[train_index])
            if scoring_funcs:
                test_pred = est_trained(x_data[test_index])
                for sf in scoring_funcs:
                    self.cv_scores[str(sf).split(' ')[1]].append(sf(y_data[test_index], test_pred))
            self.models.append(est_trained)

    def predict(self, x_data: torch.Tensor) -> np.ndarray:
        """
        Predict using prediction mean from k models.
        
        Parameters
        ----------
        x_data : torch.Tensor
            Samples to predict
        
        Returns
        -------
        numpy.ndarray
            Predicted values
        """

        return np.mean([m(x_data) for m in self.models], axis=0)

## Defining Helpful Functions

In [41]:
def rmse(y_true, y_pred):
    """Helper function"""
    return mean_squared_error(y_true, y_pred, squared=False)

def calc_stats_str(pka1, pka2):
    """Calculates R², MAE and RMSE for two iterables of floats or integers"""
    assert len(pka1) == len(pka2), "Both iterables must have the same length"
    return f'R²: {r2_score(pka1, pka2):.3f}\n' \
           f'MAE: {mean_absolute_error(pka1, pka2):.3f}\n' \
           f'RMSE: {rmse(pka1, pka2):.3f}'

def train_cv_model(est_cls, x_data, y_data, params, random_state,
                   cv=5, shuffle=True, scaled=False, scoring_funcs=(mean_absolute_error, rmse, r2_score)):
    """Scales the training data if wanted and trains a cross-validated model"""
    scaler = None
    if scaled:
        scaler = StandardScaler()
        x_data = scaler.fit_transform(x_data)
    cvr = CVRegressor(est=est_cls, params=params, n_folds=cv, shuffle=shuffle)
    cvr.fit(x_data, y_data, scoring_funcs=scoring_funcs, random_state=random_state)
    return cvr, scaler

def calc_x_data(solute,solvent):
    fmorgan3 = [] 
    featurizer = dc.feat.CircularFingerprint(size=4096, radius=3)
    X = featurizer.featurize(solute)
    Y = featurizer.featurize(solvent)
    fmorgan3 = torch.Tensor(np.concatenate((X,Y),axis=1))
    return fmorgan3

In [None]:
#MOLECULAR REPS
def mol_rep_data(sol_data,solv_data,rep):
    if rep == 'ECFP':
        featurizer = dc.feat.CircularFingerprint(size=1024, radius=3)
        sol_rep = featurizer.featurize(solute)
        solv_rep = featurizer.featurize(solvent)
    if rep == 'desc':
        featurizer = dc.feat.RDKitDescriptors()
        sol_rep = featurizer.featurize(solute)
        solv_rep = featurizer.featurize(solvent)
    if rep == 'mol2vec':
        #
    return sol_rep,solv_rep

---
## Loading Precombined Dataset

In [42]:
all_df = pd.read_csv('data/ETMdata.csv')
all_df['solute2'] = all_df.apply(lambda x: Chem.MolFromSmiles(x['solute']), axis=1)
all_df['solvent2'] = all_df.apply(lambda x: Chem.MolFromSmiles(x['solvent']), axis=1)
print(all_df['solute'][0])

CC1=C(C=C(C=C1)C(=O)O)C


## Calculating Descriptors and Fingerprints
- 196/200 RDKit descriptors
- Morgan FP with radius=3 and useFeatures=True (FMorgan3)

In [43]:
fmorgan3 = calc_x_data(all_df['solute'],all_df['solvent'])

In [44]:
print(fmorgan3.shape)

torch.Size([75, 8192])


---
## Training Random Forest, Support Vector Machine (two configurations) and Multilayer Perceptron (three configurations)
#### Using the following training sets with 5-fold cross-validation (shuffled)
1. RDKit descriptor set
2. FMorgan3
3. RDKit descriptor set + FMorgan3
4. RDKit descriptor set (standard scaled)
5. FMorgan3 (standard scaled)
6. RDKit descriptor set + FMorgan3 (standard scaled)

### Prepare for Training

In [45]:
seed = 24

est_jobs = 12
verbose = False

y_train = all_df['pka']
desc_sets = [fmorgan3]
'''
desc_sets = list(zip([descs, fmorgan3, descs_fmorgan3] * 2, 
                     [False] * 3 + [True] * 3, 
                     ['Desc', 'FMorgan3', 'Desc_FMorgan3', 'Desc_scaled', 'FMorgan3_scaled', 'Desc_FMorgan3_scaled']))
                     '''

models = ddict(odict)  # estimator => training set => [model, scaler]

def train_all_sets(est_cls, params, name):
    for x_data, scaled, set_name in desc_sets:
        models[name][set_name] = train_cv_model(est_cls, x_data, y_train, params, seed, scaled=scaled)

In [46]:
def generate_score_board(name):
    print(f'{name} CV Scores:')
    for ts, (m, s) in models[name].items():
        print(f'\t{ts}')
        for k, v in m.cv_scores.items():
            print(f'\t\t- {k}: {np.mean(v):.3f} ± {np.std(v):.3f}')


### RandomForest (n_estimators=1000)

In [9]:
est_cls = RandomForestRegressor
rf_params = dict(n_estimators=1000, n_jobs=est_jobs, verbose=verbose, random_state=seed)
name = 'RandomForest (n_estimators=1000)'

train_all_sets(est_cls, rf_params, name)

#### CV Scores

In [10]:
generate_score_board(name)

RandomForest (n_estimators=1000) CV Scores:
	Desc
		- mean_absolute_error: 0.718 ± 0.022
		- rmse: 1.077 ± 0.021
		- r2_score: 0.804 ± 0.010
	FMorgan3
		- mean_absolute_error: 0.708 ± 0.021
		- rmse: 1.094 ± 0.029
		- r2_score: 0.797 ± 0.008
	Desc_FMorgan3
		- mean_absolute_error: 0.683 ± 0.017
		- rmse: 1.032 ± 0.013
		- r2_score: 0.820 ± 0.005
	Desc_scaled
		- mean_absolute_error: 0.717 ± 0.022
		- rmse: 1.076 ± 0.022
		- r2_score: 0.804 ± 0.011
	FMorgan3_scaled
		- mean_absolute_error: 0.708 ± 0.021
		- rmse: 1.094 ± 0.029
		- r2_score: 0.797 ± 0.008
	Desc_FMorgan3_scaled
		- mean_absolute_error: 0.682 ± 0.017
		- rmse: 1.032 ± 0.013
		- r2_score: 0.820 ± 0.005


---
### SupportVectorMachine (gamma='scale')

In [11]:
est_cls = SVR
svr_params = dict(cache_size=4096, verbose=verbose)
name = 'SupportVectorMachine (gamma="scale")'

train_all_sets(est_cls, svr_params, name)

#### CV Scores

In [12]:
generate_score_board(name)

SupportVectorMachine (gamma="scale") CV Scores:
	Desc
		- mean_absolute_error: 2.100 ± 0.037
		- rmse: 2.436 ± 0.035
		- r2_score: -0.004 ± 0.004
	FMorgan3
		- mean_absolute_error: 0.851 ± 0.025
		- rmse: 1.240 ± 0.035
		- r2_score: 0.740 ± 0.012
	Desc_FMorgan3
		- mean_absolute_error: 2.100 ± 0.037
		- rmse: 2.436 ± 0.035
		- r2_score: -0.004 ± 0.004
	Desc_scaled
		- mean_absolute_error: 0.876 ± 0.033
		- rmse: 1.282 ± 0.047
		- r2_score: 0.722 ± 0.015
	FMorgan3_scaled
		- mean_absolute_error: 1.090 ± 0.034
		- rmse: 1.466 ± 0.041
		- r2_score: 0.637 ± 0.014
	Desc_FMorgan3_scaled
		- mean_absolute_error: 1.020 ± 0.037
		- rmse: 1.400 ± 0.047
		- r2_score: 0.668 ± 0.016


---
### SupportVectorMachine (gamma='auto')

In [13]:
est_cls = SVR
svr_params = dict(cache_size=4096, verbose=verbose, gamma='auto')
name = 'SupportVectorMachine (gamma="auto")'

train_all_sets(est_cls, svr_params, name)

#### CV Scores

In [14]:
generate_score_board(name)

SupportVectorMachine (gamma="auto") CV Scores:
	Desc
		- mean_absolute_error: 2.016 ± 0.042
		- rmse: 2.362 ± 0.039
		- r2_score: 0.056 ± 0.009
	FMorgan3
		- mean_absolute_error: 1.612 ± 0.031
		- rmse: 1.926 ± 0.033
		- r2_score: 0.373 ± 0.007
	Desc_FMorgan3
		- mean_absolute_error: 1.642 ± 0.061
		- rmse: 2.052 ± 0.060
		- r2_score: 0.288 ± 0.027
	Desc_scaled
		- mean_absolute_error: 0.882 ± 0.035
		- rmse: 1.288 ± 0.048
		- r2_score: 0.719 ± 0.016
	FMorgan3_scaled
		- mean_absolute_error: 1.090 ± 0.034
		- rmse: 1.465 ± 0.041
		- r2_score: 0.637 ± 0.014
	Desc_FMorgan3_scaled
		- mean_absolute_error: 1.019 ± 0.037
		- rmse: 1.400 ± 0.047
		- r2_score: 0.669 ± 0.016


---
### Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500))

In [15]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(500, 500), verbose=verbose, random_state=seed)
name = 'Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500))'

train_all_sets(est_cls, mlp_params, name)

#### CV Scores

In [16]:
generate_score_board(name)

Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500)) CV Scores:
	Desc
		- mean_absolute_error: 1012633.635 ± 1901209.033
		- rmse: 21348705.163 ± 39788481.265
		- r2_score: -343815646750142.125 ± 686407858301744.875
	FMorgan3
		- mean_absolute_error: 0.866 ± 0.025
		- rmse: 1.270 ± 0.047
		- r2_score: 0.727 ± 0.019
	Desc_FMorgan3
		- mean_absolute_error: 338523.902 ± 425838.671
		- rmse: 7667645.861 ± 9828414.798
		- r2_score: -26414204010101.684 ± 48045329912146.383
	Desc_scaled
		- mean_absolute_error: 0.726 ± 0.018
		- rmse: 1.102 ± 0.050
		- r2_score: 0.794 ± 0.022
	FMorgan3_scaled
		- mean_absolute_error: 1.037 ± 0.045
		- rmse: 1.457 ± 0.057
		- r2_score: 0.640 ± 0.024
	Desc_FMorgan3_scaled
		- mean_absolute_error: 0.968 ± 0.032
		- rmse: 1.383 ± 0.040
		- r2_score: 0.677 ± 0.014


---
### Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(500, 500))

In [17]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(500, 500), verbose=verbose, random_state=seed, early_stopping=True)
name = 'Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(500, 500))'

train_all_sets(est_cls, mlp_params, name)

#### CV Scores

In [18]:
generate_score_board(name)

Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(500, 500)) CV Scores:
	Desc
		- mean_absolute_error: 1608.938 ± 553.820
		- rmse: 35009.764 ± 10750.094
		- r2_score: -227620357.862 ± 122941121.087
	FMorgan3
		- mean_absolute_error: 0.894 ± 0.024
		- rmse: 1.297 ± 0.040
		- r2_score: 0.715 ± 0.016
	Desc_FMorgan3
		- mean_absolute_error: 121502.590 ± 217072.292
		- rmse: 2884203.108 ± 5226274.774
		- r2_score: -5867135604712.889 ± 11651594167381.072
	Desc_scaled
		- mean_absolute_error: 0.768 ± 0.034
		- rmse: 1.161 ± 0.090
		- r2_score: 0.770 ± 0.038
	FMorgan3_scaled
		- mean_absolute_error: 1.031 ± 0.037
		- rmse: 1.447 ± 0.057
		- r2_score: 0.645 ± 0.026
	Desc_FMorgan3_scaled
		- mean_absolute_error: 0.984 ± 0.029
		- rmse: 1.404 ± 0.035
		- r2_score: 0.666 ± 0.017


---
### Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250))

In [19]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(250, 250, 250), verbose=verbose, random_state=seed, early_stopping=True)
name = 'Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250))'

train_all_sets(est_cls, mlp_params, name)

#### CV Scores

In [20]:
generate_score_board(name)

Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250)) CV Scores:
	Desc
		- mean_absolute_error: 342.263 ± 353.360
		- rmse: 7272.756 ± 7016.493
		- r2_score: -18027644.153 ± 27054677.449
	FMorgan3
		- mean_absolute_error: 0.869 ± 0.023
		- rmse: 1.265 ± 0.039
		- r2_score: 0.729 ± 0.016
	Desc_FMorgan3
		- mean_absolute_error: 710.157 ± 585.744
		- rmse: 16128.376 ± 13756.380
		- r2_score: -74358553.117 ± 84585812.796
	Desc_scaled
		- mean_absolute_error: 0.775 ± 0.008
		- rmse: 1.158 ± 0.033
		- r2_score: 0.773 ± 0.013
	FMorgan3_scaled
		- mean_absolute_error: 1.026 ± 0.038
		- rmse: 1.455 ± 0.053
		- r2_score: 0.642 ± 0.022
	Desc_FMorgan3_scaled
		- mean_absolute_error: 0.973 ± 0.035
		- rmse: 1.388 ± 0.053
		- r2_score: 0.674 ± 0.023


---
### XGradientBoost

In [21]:
est_cls = xgb.XGBRegressor
xgb_params = dict(verbosity=2 if verbose else 0, random_state=seed, n_jobs=est_jobs)
name = 'XGradientBoost'

train_all_sets(est_cls, xgb_params, name)

#### CV Scores

In [22]:
generate_score_board(name)

XGradientBoost CV Scores:
	Desc
		- mean_absolute_error: 1.020 ± 0.014
		- rmse: 1.353 ± 0.021
		- r2_score: 0.691 ± 0.007
	FMorgan3
		- mean_absolute_error: 1.094 ± 0.027
		- rmse: 1.423 ± 0.036
		- r2_score: 0.657 ± 0.011
	Desc_FMorgan3
		- mean_absolute_error: 1.018 ± 0.010
		- rmse: 1.346 ± 0.022
		- r2_score: 0.694 ± 0.005
	Desc_scaled
		- mean_absolute_error: 1.020 ± 0.014
		- rmse: 1.353 ± 0.021
		- r2_score: 0.691 ± 0.007
	FMorgan3_scaled
		- mean_absolute_error: 1.094 ± 0.027
		- rmse: 1.423 ± 0.036
		- r2_score: 0.657 ± 0.011
	Desc_FMorgan3_scaled
		- mean_absolute_error: 1.018 ± 0.010
		- rmse: 1.346 ± 0.022
		- r2_score: 0.694 ± 0.005
