# 01.00 Basic random forest

* This model uses only the sequences to make predictions.
* This model uses the random forest mode of LightGBM.

In [1]:
import copy
import lzma
import pickle
import warnings
warnings.filterwarnings('ignore', 'The least populated class')
warnings.filterwarnings('ignore', 'categorical_feature in Dataset is overridden')
warnings.filterwarnings('ignore', 'object name is not a valid Python identifier')

import hyperopt
import lightgbm
import numpy
import pandas
import scipy.stats
import sklearn.metrics
import sklearn.model_selection
import tables

import utils

## Materials

* Training sequences, their fitness values, and the numbers of mutations (for stratified cross-validation).
* Validation sequences and their fitness values for evaluation, results of which are kept secret for now.

In [2]:
# Training inputs (sequences loaded with one_hot=False) and fitness targets.
x = utils.data.load_sequence('training', one_hot=False)
y = utils.data.load('training', 'fitness')

# Two per-sample groupings that are combined into one stratification label.
mutation_count = utils.data.load('training', 'mutation_count')
fitness_group = utils.data.load('training', 'fitness_group')

# Scaling mutation_count by (largest fitness_group + 1) keeps every
# (mutation_count, fitness_group) pair on its own label -- assuming
# fitness_group values are non-negative.
fitness_group_stride = fitness_group.max() + 1
groups = mutation_count * fitness_group_stride + fitness_group

In [3]:
# Validation inputs, loaded the same way as the training sequences.
validation_x = utils.data.load_sequence('validation', one_hot=False)
# The raw validation fitness is reduced with the project's average_fitness
# helper before it is used as the evaluation target.
validation_fitness = utils.data.load('validation', 'fitness')
validation_y = utils.data.average_fitness(validation_fitness)

## Methods

* Search for the best hyperparameters for the LightGBM model.
* Retrain the LightGBM model with the best hyperparameters over the pre-defined cross-validation splits, and score them.
* Retrain the LightGBM model over the pre-defined stacking splits, and predict the folds.
* Retrain the LightGBM model over the whole training set, predict the leaderboard (validation) set, and score it.
* Keep leaderboard scores secret for now.

In [4]:
def search_models(arguments):
    """Hyperopt objective: cross-validated loss of one random-forest setting.

    Reads the module-level ``x``, ``y`` and ``groups`` arrays.

    Parameters
    ----------
    arguments : dict
        Keyword arguments for ``lightgbm.LGBMRegressor`` as sampled from the
        search space. ``num_leaves`` and ``n_estimators`` are cast to ``int``
        on a deep copy, so the caller's dict is left untouched.

    Returns
    -------
    The mean of ``utils.metrics.minimization_metric`` over the 2-fold,
    5-times-repeated stratified splits; ``hyperopt.fmin`` minimizes it.
    """
    # Seed the splitter so every candidate is scored on the same ten splits.
    # Unseeded, each trial draws fresh splits, which mixes split noise into
    # the comparison between candidates and makes the search irreproducible.
    kfold = sklearn.model_selection.RepeatedStratifiedKFold(
        n_splits=2, n_repeats=5, random_state=0)

    parameters = copy.deepcopy(arguments)
    parameters['num_leaves'] = int(parameters['num_leaves'])
    parameters['n_estimators'] = int(parameters['n_estimators'])

    # Every sequence position is passed to LightGBM as a categorical feature.
    categorical_columns = list(range(utils.data.SEQUENCE_LENGTH))

    fold_losses = []
    for train_idx, test_idx in kfold.split(x, groups):
        model = lightgbm.LGBMRegressor(boosting_type='rf', subsample_freq=1, **parameters)
        # Each training sequence is repeated REPLICA_COUNT times so the rows
        # line up with the row-major flattened fitness matrix (assumes y has
        # REPLICA_COUNT columns -- TODO confirm against utils.data).
        train_x = numpy.repeat(x[train_idx, :], utils.data.REPLICA_COUNT, axis=0)
        train_y = y[train_idx, :].flatten()
        model.fit(train_x, train_y, categorical_feature=categorical_columns)
        test_x = x[test_idx, :]
        prediction = model.predict(test_x)
        test_y = utils.data.average_fitness(y[test_idx, :])
        fold_losses.append(utils.metrics.minimization_metric(test_y, prediction))
    return sum(fold_losses) / len(fold_losses)


# Search space. The hyperopt labels hold the *raw* sampled values (e.g. the
# exponent for num_leaves); the arithmetic around each hp.quniform maps them
# to the actual LightGBM parameter values passed to search_models.
parameter_space = {
    'num_leaves': 2 ** hyperopt.hp.quniform('num_leaves', 4, 12, 2) - 1,
    'n_estimators': 2 ** hyperopt.hp.quniform('n_estimators', 0, 4, 1) * 50,
    'colsample_bytree': hyperopt.hp.quniform('colsample_bytree', 0.1, 0.9, 0.1),
    'subsample': hyperopt.hp.quniform('subsample', 0, 0.8, 0.2) + 0.1,
}
trials_path = utils.data.path(2) / '00.hyperopt_trials.pickle'

# Resume an earlier search when its trials file exists; otherwise start fresh.
# Only a missing or truncated/corrupt file falls back to an empty history --
# a bare ``except:`` would also swallow KeyboardInterrupt and real bugs.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
try:
    with trials_path.open('rb') as trials_file:
        trials = pickle.load(trials_file)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    trials = hyperopt.Trials()

try:
    hyperopt.fmin(search_models, space=parameter_space, trials=trials,
                  algo=hyperopt.tpe.suggest, max_evals=100)
finally:
    # Persist whatever was evaluated, even if the search was interrupted.
    # The ``with`` block guarantees the pickle is flushed and closed.
    trials.refresh()
    with trials_path.open('wb') as trials_file:
        pickle.dump(trials, trials_file)

In [5]:
# Reload the persisted search history from the same location the search cell
# writes to (instead of a working-directory-dependent literal path), and
# close the file handle once the trials are loaded.
with (utils.data.path(2) / '00.hyperopt_trials.pickle').open('rb') as trials_file:
    trials = pickle.load(trials_file)

**NOTE**: `trials.argmin` holds the raw sampled values of the search space, so convert them back to the actual LightGBM parameter scales.

In [6]:
# trials.argmin stores the raw hyperopt label values; undo the search-space
# transforms to obtain the actual LightGBM keyword arguments.
best = trials.argmin
parameters = copy.deepcopy(best)
parameters.update(
    num_leaves=int(2 ** best['num_leaves'] - 1),
    n_estimators=int(2 ** best['n_estimators'] * 50),
    subsample=best['subsample'] + 0.1,
)

# HDF5 file that collects the scores and predictions produced below; it is
# opened in write mode here and stays open across the following cells.
model_file = tables.open_file(utils.data.path(2) / '00.model.h5', 'w')

In [7]:
# Score the tuned forest on every pre-defined cross-validation split.
categorical_columns = list(range(utils.data.SEQUENCE_LENGTH))
scores = []
for train_idx in utils.data.predefined_cross_validation():
    # The complement of the training selection is the held-out fold
    # (presumably train_idx is a boolean mask -- verify in utils.data).
    test_idx = ~train_idx

    # One training row per replicate: sequences are repeated to line up with
    # the flattened fitness matrix.
    train_x = numpy.repeat(x[train_idx, :], utils.data.REPLICA_COUNT, axis=0)
    train_y = y[train_idx, :].flatten()
    model = lightgbm.LGBMRegressor(boosting_type='rf', subsample_freq=1, **parameters)
    model.fit(train_x, train_y, categorical_feature=categorical_columns)

    # Predictions are floored at 1e-7 (presumably so the log-based metrics
    # stay defined -- confirm in utils.metrics).
    test_x = x[test_idx, :]
    raw_prediction = model.predict(test_x)
    prediction = raw_prediction.clip(min=1e-7)
    test_y = utils.data.average_fitness(y[test_idx, :])
    scores.append(utils.metrics.evaluation_metrics(test_y, prediction))

# One row per split, one column per metric; stored as the 'cv_scores' table.
scores = pandas.concat(scores, axis=1).T
model_file.create_table('/', 'cv_scores', obj=scores.to_records(index=False))

/cv_scores (Table(10,)) ''
  description := {
  "Pearson": Float64Col(shape=(), dflt=0.0, pos=0),
  "Log-pearson": Float64Col(shape=(), dflt=0.0, pos=1),
  "Spearman": Float64Col(shape=(), dflt=0.0, pos=2),
  "Kendall": Float64Col(shape=(), dflt=0.0, pos=3),
  "MSE": Float64Col(shape=(), dflt=0.0, pos=4),
  "Log-MSE": Float64Col(shape=(), dflt=0.0, pos=5),
  "AUC 0.5": Float64Col(shape=(), dflt=0.0, pos=6),
  "AUC 1": Float64Col(shape=(), dflt=0.0, pos=7)}
  byteorder := 'little'
  chunkshape := (1024,)

In [8]:
# Out-of-fold predictions for stacking: each slot is filled by a model that
# was fit without that sample. Slots never held out remain NaN.
prediction = numpy.full(y.shape[0], float('nan'), dtype='f4')
categorical_columns = list(range(utils.data.SEQUENCE_LENGTH))
for train_idx in utils.data.stacking_splits():
    # Held-out samples are the complement of the training selection
    # (presumably a boolean mask -- verify in utils.data).
    test_idx = ~train_idx

    train_x = numpy.repeat(x[train_idx, :], utils.data.REPLICA_COUNT, axis=0)
    train_y = y[train_idx, :].flatten()
    model = lightgbm.LGBMRegressor(boosting_type='rf', subsample_freq=1, **parameters)
    model.fit(train_x, train_y, categorical_feature=categorical_columns)

    test_x = x[test_idx, :]
    held_out_prediction = model.predict(test_x).clip(min=1e-7)
    prediction[test_idx] = held_out_prediction
model_file.create_array('/', 'stack_samples', obj=prediction)

/stack_samples (Array(49153,)) ''
  atom := Float32Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [9]:
# Final model: fit on the complete training set, persist it, then predict and
# score the validation set.
# (The former up-front ``prediction = numpy.ndarray(y.shape[0], ...)`` was
# dead code: an uninitialised, training-sized buffer that was overwritten by
# the validation prediction below without ever being read.)
model = lightgbm.LGBMRegressor(boosting_type='rf', subsample_freq=1, **parameters)
# Repeat each sequence once per replicate so rows match the flattened fitness.
full_x = numpy.repeat(x, utils.data.REPLICA_COUNT, axis=0)
model.fit(full_x, y.flatten(), categorical_feature=list(range(utils.data.SEQUENCE_LENGTH)))

# Keep the fitted model as an xz-compressed pickle next to the HDF5 results.
with lzma.LZMAFile(utils.data.path(2) / '00.training_model.pickle.xz', 'w') as xz:
    pickle.dump(model, xz)

# Same 1e-7 floor on predictions as in the cross-validation and stacking cells.
prediction = model.predict(validation_x).clip(min=1e-7)
model_file.create_array('/', 'validation_samples', obj=prediction)

# Single-row metrics table for the validation set.
score = utils.metrics.evaluation_metrics(validation_y, prediction)
score = pandas.DataFrame(score).T
model_file.create_table('/', 'validation_score', obj=score.to_records(index=False))

/validation_score (Table(1,)) ''
  description := {
  "Pearson": Float64Col(shape=(), dflt=0.0, pos=0),
  "Log-pearson": Float64Col(shape=(), dflt=0.0, pos=1),
  "Spearman": Float64Col(shape=(), dflt=0.0, pos=2),
  "Kendall": Float64Col(shape=(), dflt=0.0, pos=3),
  "MSE": Float64Col(shape=(), dflt=0.0, pos=4),
  "Log-MSE": Float64Col(shape=(), dflt=0.0, pos=5),
  "AUC 0.5": Float64Col(shape=(), dflt=0.0, pos=6),
  "AUC 1": Float64Col(shape=(), dflt=0.0, pos=7)}
  byteorder := 'little'
  chunkshape := (1024,)

In [10]:
# Close the HDF5 results file that was opened in write mode with
# tables.open_file earlier in the notebook.
model_file.close()

## Results

Best parameters

In [11]:
# Raw hyperopt label values of the best trial, i.e. before they are rescaled
# to the actual LightGBM parameters (num_leaves / n_estimators are exponents).
trials.argmin

{'colsample_bytree': 0.5,
 'n_estimators': 4.0,
 'num_leaves': 12.0,
 'subsample': 0.4}

In [12]:
# The twenty trials with the lowest loss, best first.
trials_by_loss = sorted(trials.trials, key=lambda trial: trial['result']['loss'])
trials_by_loss[:20]

[{'state': 2,
  'tid': 80,
  'spec': None,
  'result': {'loss': -0.4250860758556471, 'status': 'ok'},
  'misc': {'tid': 80,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'colsample_bytree': [80],
    'n_estimators': [80],
    'num_leaves': [80],
    'subsample': [80]},
   'vals': {'colsample_bytree': [0.5],
    'n_estimators': [4.0],
    'num_leaves': [12.0],
    'subsample': [0.4]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2018, 12, 28, 9, 34, 2, 208000),
  'refresh_time': datetime.datetime(2018, 12, 28, 9, 44, 55, 378000)},
 {'state': 2,
  'tid': 78,
  'spec': None,
  'result': {'loss': -0.4247629728504327, 'status': 'ok'},
  'misc': {'tid': 78,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'colsample_bytree': [78],
    'n_estimators': [78],
    'num_leaves': [78],
    'subsample': [78]},
   'vals': {'colsample_bytree': [0.5],
    'n_estimators': [4.0],
    'num_le

Cross validation scores

In [13]:
# Append per-metric mean and standard deviation rows below the per-split
# scores. Building each summary from a one-entry dict and transposing yields
# a single row already labelled 'Mean' / 'Std'.
mean = pandas.DataFrame({'Mean': scores.mean()}).T
std = pandas.DataFrame({'Std': scores.std()}).T
pandas.concat([scores, mean, std], axis=0)

Unnamed: 0,Pearson,Log-pearson,Spearman,Kendall,MSE,Log-MSE,AUC 0.5,AUC 1
0,0.634268,0.638934,0.56272,0.426156,0.01176,0.024777,0.759271,0.937929
1,0.637521,0.641324,0.559988,0.424735,0.011714,0.024707,0.756715,0.949671
2,0.62942,0.633688,0.555587,0.420523,0.011681,0.024682,0.755384,0.940757
3,0.643429,0.647255,0.567964,0.430983,0.011789,0.024802,0.761624,0.944131
4,0.642173,0.646228,0.565098,0.429015,0.011919,0.025001,0.759754,0.942369
5,0.633743,0.637582,0.558038,0.422306,0.011525,0.024437,0.756336,0.946079
6,0.63625,0.640027,0.560683,0.424816,0.011692,0.024661,0.757988,0.945593
7,0.641913,0.645595,0.563116,0.426933,0.011735,0.024737,0.758744,0.943348
8,0.636748,0.641418,0.564254,0.427457,0.011868,0.024949,0.759133,0.938265
9,0.639417,0.642471,0.558992,0.423972,0.011567,0.02447,0.75725,0.950161
