Skip to content

Commit

Permalink
Merge pull request #2 from seomoz/jfeng-mozsci-1
Browse files Browse the repository at this point in the history
Add early termination to ensemble model training and serialization/deserialization
  • Loading branch information
Matthew Peters committed Feb 20, 2013
2 parents 20cc8bc + 5c72420 commit 5d4a7bf
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 2 deletions.
1 change: 1 addition & 0 deletions mozsci/cspearmanr_by_fast.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <math.h>
#include <algorithm>
Expand Down
53 changes: 51 additions & 2 deletions mozsci/ems.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#

import numpy as np
import json

class EnsembleModelSelector(object):
"""Implements
Expand Down Expand Up @@ -71,8 +72,7 @@ def select_ensemble_bagged(self, y, ymodels, verbose=False):
self.nmodels = nmodels
self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5]


def select_ensemble(self, y, ymodels):
def select_ensemble(self, y, ymodels, early_termination = False):
"""Y = actual y = (N, ) numpy array
ymodels = a list of predictions from different models.
len(ymodels) = nmodels
Expand All @@ -99,18 +99,27 @@ def select_ensemble(self, y, ymodels):
current_prediction /= float(self.nsort)
nmodels = self.nsort

if early_termination: last_error = np.finfo(np.float).max
# (2)
for k in xrange(self.niter):
# find the model that reduces error the most
# current_prediction is averaged over nmodels
# need to add in one more as a weighted average
errors = np.array([self.error(y, current_prediction * (float(nmodels) / (nmodels + 1)) + ypred.astype(np.float) / float(nmodels + 1)) for ypred in ymodels])

if early_termination:
min_error = errors.min()
if min_error < last_error: last_error = min_error
else:break

model_to_add = errors.argmin()

self.ensemble[model_to_add] += 1
current_prediction = current_prediction * (float(nmodels) / (nmodels + 1)) + ymodels[model_to_add].astype(np.float) / float(nmodels + 1)
nmodels += 1

print("Iteration %s, error=%s" % (k, errors.min()))

# pull out the indices of models included in the final ensemble
self.ensemble_indices = np.arange(len(ymodels))[self.ensemble > 0.5]
self.nmodels = nmodels
Expand All @@ -124,6 +133,46 @@ def pred(self, ymodels):
pred += ymodels[k] * self.ensemble[k]
return pred.astype(np.float) / np.float(self.nmodels)

def save_ensemble(self, fileout):
"""
Serialize the ensemble.
:param fileout: name of the file to write the json string, or a file object.
:return: None
"""
if self.ensemble is None or self.ensemble_indices is None:
raise ValueError('The ensemble has not been properly trained.')

model_json = {
'nmodels': self.nmodels,
'ensemble': self.ensemble[:].tolist(),
'ensemble_indices': self.ensemble_indices[:].tolist(),
}

# save to the file
if isinstance(fileout, basestring):
with open(fileout, 'w') as f:
json.dump(model_json, f)
else:
json.dump(model_json, fileout)

@classmethod
def load_ensemble(cls, model_json):
"""
Load the serialized model. Afteer the loading, we can use pred method on new data sets.
:param cls:
:param model_json: name of the file to read in the json string, or a file object.
:return: the new object.
"""
if isinstance(model_json, basestring):
with open(model_json, 'r') as f:
model_json = json.load(f)

ensemble = cls()
ensemble.nmodels = model_json['nmodels']
ensemble.ensemble = np.array(model_json['ensemble'], dtype = np.float64)
ensemble.ensemble_indices = np.array(model_json['ensemble_indices'], dtype = np.int)

return ensemble

if __name__ == "__main__":

Expand Down

0 comments on commit 5d4a7bf

Please sign in to comment.