In [14]:
import numpy
import random

import getml

# Select the getML engine project this notebook works in; the recorded output
# below shows an already existing project of that name being loaded.
getml.engine.set_project('biodegradability')

Loading existing project 'biodegradability'


Establish a connection to the remote MariaDB database.

In [8]:
# Connect the getML engine to the remote MySQL-compatible server that hosts
# the "Biodegradability" schema.
# NOTE(review): host, user and password are hard-coded. They look like a
# public guest login for the dataset host — confirm, and do not reuse this
# pattern for private credentials (read those from the environment instead).
getml.database.connect_mysql(
    host="relational.fit.cvut.cz",
    port=3306,
    dbname="Biodegradability",
    user="guest",
    password="relational",
    time_formats=['%Y/%m/%d']  # year/month/day pattern for date strings
)


Load all required tables and upload them into the getML engine.

In [25]:
# Read every source table from the connected database into a getML DataFrame.
# Each frame is given the same name as the table it was loaded from.
atom, bond, gmember, group, molecule = (
    getml.data.DataFrame.from_db(table_name, table_name)
    for table_name in ('atom', 'bond', 'gmember', 'group', 'molecule')
)

Annotate the loaded data.

In [26]:
# Role assignments as (data frame, column name, role) triples. They are applied
# in exactly this order, one set_role() call per entry.
getml_roles = getml.data.roles
role_assignments = [
    # molecule: the population table; 'logp' is the value to predict.
    (molecule, 'molecule_id', getml_roles.join_key),
    (molecule, 'mweight', getml_roles.numerical),
    (molecule, 'activity', getml_roles.numerical),
    (molecule, 'logp', getml_roles.target),
    # atom: keyed by its own id and by the molecule it is joined to.
    (atom, 'atom_id', getml_roles.join_key),
    (atom, 'molecule_id', getml_roles.join_key),
    (atom, 'type', getml_roles.categorical),
    # bond: references two atoms (atom_id and atom_id2).
    (bond, 'atom_id', getml_roles.join_key),
    (bond, 'atom_id2', getml_roles.join_key),
    (bond, 'type', getml_roles.categorical),
    # gmember: link table between atoms and groups.
    (gmember, 'atom_id', getml_roles.join_key),
    (gmember, 'group_id', getml_roles.join_key),
    # group
    (group, 'group_id', getml_roles.join_key),
    (group, 'type', getml_roles.categorical),
]

for data_frame, column_name, column_role in role_assignments:
    data_frame.set_role(column_name, column_role)

Split the molecule data table into training, validation, and test sets.

In [29]:
percentage_training = 0.5
percentage_validation = 0.25
# Fixed seed so that the random split (and everything trained on it) is
# reproducible across kernel restarts.
split_seed = 42

num_rows = molecule.shape[0]
# Split boundaries expressed in units of the shuffled row number.
training_end = num_rows * percentage_training
validation_end = num_rows * (percentage_training + percentage_validation)

# Add a column containing a shuffled version of the row numbers
# (a permutation of 0 .. num_rows-1) used to split the data at random.
shuffled_row_numbers = random.Random(split_seed).sample(range(num_rows), k=num_rows)
molecule.add(numpy.array(shuffled_row_numbers), 'index')

# The three splits are half-open ranges that tile [0, num_rows):
#   training   : index <  training_end
#   validation : training_end   <= index < validation_end
#   testing    : index >= validation_end
# The lower bounds are inclusive on purpose: with strict '>' a row whose index
# equals a boundary exactly (e.g. num_rows * 0.5 for an even num_rows) would
# match none of the three conditions and silently drop out of all splits.
molecule_training = molecule.where(
    'molecule_training',
    molecule['index'] < training_end
)
molecule_validation = molecule.where(
    'molecule_validation',
    (molecule['index'] >= training_end) & (molecule['index'] < validation_end)
)
molecule_testing = molecule.where(
    'molecule_testing',
    molecule['index'] >= validation_end
)

Create the data model.

In [30]:
# One placeholder per data frame; the placeholders describe the relational
# structure that the model is built on.
ph_molecule, ph_atom, ph_bond, ph_gmember, ph_group = (
    data_frame.to_placeholder()
    for data_frame in (molecule, atom, bond, gmember, group)
)

# molecule -> atom, matched on the molecule id.
ph_molecule.join(ph_atom, join_key='molecule_id')

# atom -> bond, joined twice: once against the bond's first atom column ...
ph_atom.join(ph_bond, join_key='atom_id')
# ... and once with the atom's id matched against the bond's 'atom_id2' column.
ph_atom.join(ph_bond, join_key='atom_id', other_join_key='atom_id2')

# atom -> gmember -> group, via the atom id and then the group id.
ph_atom.join(ph_gmember, join_key='atom_id')
ph_gmember.join(ph_group, join_key='group_id')

Construct the feature engineerer, feature selector, and predictor.

In [35]:
# The feature selector and the predictor use the same XGBoost configuration,
# so the settings are written down once and unpacked into two separate
# regressor instances.
xgboost_settings = dict(
    booster='gblinear',
    n_estimators=60,
    n_jobs=6,
    max_depth=7,
    reg_lambda=500,
)

feature_selector = getml.predictors.XGBoostRegressor(**xgboost_settings)
predictor = getml.predictors.XGBoostRegressor(**xgboost_settings)

## -------------------------------------------------------------------

## Construct the base model.
# The placeholders define the data model; .send() is called on the freshly
# constructed model and its return value is what gets bound to base_model.
base_model_settings = dict(
    name='base',
    population=ph_molecule,
    peripheral=[ph_atom, ph_gmember, ph_group, ph_bond],
    loss_function=getml.models.loss_functions.SquareLoss(),
    num_features=50,
    num_subfeatures=10,
    feature_selector=feature_selector,
    predictor=predictor,
    num_threads=3,
)
base_model = getml.models.RelboostModel(**base_model_settings).send()


Perform the initial training.

In [39]:
# Train the base model on the training split. The peripheral tables are passed
# in the same order as the peripheral placeholders given to the model
# (atom, gmember, group, bond).
base_model.fit(
    population_table=molecule_training,
    peripheral_tables=[atom, gmember, group, bond],
)

Loaded data. Features are now being trained...
Trained model.
Time taken: 0h:0m:23.09367



RelboostModel:
  type: RelboostModel
  allow_null_weights: False
  delta_t: 0
  feature_selector: XGBoostRegressor:
    type: XGBoostRegressor
    booster: gblinear
    colsample_bylevel: 1
    colsample_bytree: 1
    learning_rate: 0.1
    gamma: 0
    max_delta_step: 0
    max_depth: 7
    min_child_weights: 1
    n_estimators: 60
    normalize_type: tree
    num_parallel_tree: 1
    n_jobs: 6
    objective: reg:squarederror
    one_drop: False
    rate_drop: 0
    reg_alpha: 0
    reg_lambda: 500
    sample_type: uniform
    silent: True
    skip_drop: 0
    subsample: 1
  gamma: 0
  include_categorical: False
  loss_function: SquareLoss
  max_depth: 3
  min_num_samples: 1
  name: base
  num_features: 50
  num_subfeatures: 10
  num_threads: 3
  peripheral (list):
    Placeholder:
      name: atom
      categorical: ['type']
      numerical: []
      join_keys: ['atom_id', 'molecule_id']
      targets: []
      time_stamps: []
      join_keys_used: []
      other_join_keys_used: []
 

In [40]:
# Score the fitted base model. Note that the population table is the training
# split, so the reported metrics are in-sample.
base_model.score(
    population_table=molecule_training,
    peripheral_tables=[atom, gmember, group, bond],
)

{'mae': [1.2180083494651608],
 'rmse': [1.586045027671965],
 'rsquared': [0.5255422957520174]}

Perform a Gaussian hyperparameter optimization to find the best possible set of hyperparameters.

In [43]:
## Build a parameter space to search in
param_space = dict()

param_space["max_depth"] = [1, 10]
param_space["min_num_samples"] = [10, 100]
param_space["num_features"] = [10, 100]
param_space["reg_lambda"] = [0.01, 0.3]
param_space["share_selected_features"] = [0.3, 1.0]
param_space["shrinkage"] = [0.01, 0.4]

# Any hyperparameters that relate to the predictor
# are preceded by "predictor_".
param_space["predictor_n_estimators"] = [40, 140]
param_space["predictor_max_depth"] = [3, 15]
param_space["predictor_reg_lambda"] = [0.0, 1000.0]

## -------------------------------------------------------------------

## Start the hyperparameter optimization.
gauss_search = getml.hyperopt.GaussianHyperparameterSearch(
    model = base_model,
    param_space = param_space,
    n_iter = 120,
    ratio_iter = 0.8
)

gauss_search.fit(
  population_table_training = molecule_training,
  population_table_validation = molecule_validation,
  peripheral_tables = [atom, gmember, group, bond]
)


Launched hyperparameter optimization...


In [47]:
# Fetch the scores of the models evaluated by the hyperparameter search; the
# recorded output is a dict keyed by model name with one list per metric.
res_scores = gauss_search.get_scores()

# Bare last expression so the notebook renders the scores.
res_scores

{'2020-03-09T17-53-00-hyperopt-gaussian-relboost-001': {'accuracy': [],
  'auc': [],
  'cross_entropy': [],
  'mae': [1.1655486180458532],
  'rmse': [1.5158329396449357],
  'rsquared': [0.4722639088313854]},
 '2020-03-09T17-53-00-hyperopt-gaussian-relboost-002': {'accuracy': [],
  'auc': [],
  'cross_entropy': [],
  'mae': [1.269978395332524],
  'rmse': [1.6272371019386307],
  'rsquared': [0.4340706243773536]},
 '2020-03-09T17-53-00-hyperopt-gaussian-relboost-003': {'accuracy': [],
  'auc': [],
  'cross_entropy': [],
  'mae': [1.2488222667906013],
  'rmse': [1.6003441002819223],
  'rsquared': [0.46596461952705653]},
 '2020-03-09T17-53-00-hyperopt-gaussian-relboost-004': {'accuracy': [],
  'auc': [],
  'cross_entropy': [],
  'mae': [1.1903429721608567],
  'rmse': [1.543199832806225],
  'rsquared': [0.4315008038432993]},
 '2020-03-09T17-53-00-hyperopt-gaussian-relboost-005': {'accuracy': [],
  'auc': [],
  'cross_entropy': [],
  'mae': [1.1872036668989385],
  'rmse': [1.5376077827976145]